Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c | 545
1 file changed, 228 insertions(+), 317 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d..2a2a9b40db1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,15 +59,6 @@ enum {
 	AUDIT_POST_SYNC
 };
 
-char *audit_point_name[] = {
-	"pre page fault",
-	"post page fault",
-	"pre pte write",
-	"post pte write",
-	"pre sync",
-	"post sync"
-};
-
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
@@ -87,9 +78,6 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+	return cache->nobjs;
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 				  struct kmem_cache *cache)
 {
@@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
+static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+				    struct kvm_memory_slot *slot)
+{
+	struct kvm_lpage_info *linfo;
+
+	if (likely(level == PT_PAGE_TABLE_LEVEL))
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	linfo = lpage_info_slot(gfn, slot, level);
+	return &linfo->rmap_pde;
+}
+
 /*
  * Take gfn and return the reverse mapping to it.
  */
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
 	struct kvm_memory_slot *slot;
-	struct kvm_lpage_info *linfo;
 
 	slot = gfn_to_memslot(kvm, gfn);
-	if (likely(level == PT_PAGE_TABLE_LEVEL))
-		return &slot->rmap[gfn - slot->base_gfn];
+	return __gfn_to_rmap(kvm, gfn, level, slot);
+}
 
-	linfo = lpage_info_slot(gfn, slot, level);
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *cache;
 
-	return &linfo->rmap_pde;
+	cache = &vcpu->arch.mmu_pte_list_desc_cache;
+	return mmu_memory_cache_free_objects(cache);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
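In the hunk above, __gfn_to_rmap() simply indexes the memslot's rmap array by the gfn's offset from slot->base_gfn (falling back to the per-level lpage_info entry for huge pages), and rmap_can_add() only reports whether the pte_list_desc cache still has free objects. A minimal userspace sketch of that indexing idea, using hypothetical stand-in types rather than the kernel's kvm_memory_slot:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long gfn_t;

/* Simplified, hypothetical stand-in for struct kvm_memory_slot. */
struct memslot {
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long *rmap;	/* one rmap head per 4K page in the slot */
};

/* Analogue of __gfn_to_rmap() for the 4K (PT_PAGE_TABLE_LEVEL) case only. */
static unsigned long *gfn_to_rmap_slot(struct memslot *slot, gfn_t gfn)
{
	assert(gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages);
	return &slot->rmap[gfn - slot->base_gfn];
}

int main(void)
{
	unsigned long rmap[16] = { 0 };
	struct memslot slot = { .base_gfn = 0x100, .npages = 16, .rmap = rmap };

	*gfn_to_rmap_slot(&slot, 0x105) = 0xdeadbeef;	/* pretend spte pointer */
	printf("rmap index for gfn 0x105: %ld\n",
	       (long)(gfn_to_rmap_slot(&slot, 0x105) - rmap));	/* prints 5 */
	return 0;
}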
@@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
 {
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
-
+	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
-		BUG_ON(!spte);
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
@@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = gfn_to_rmap(kvm, gfn, i);
+		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
-			BUG_ON(!spte);
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+			BUG_ON(!is_large_pte(*spte));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
 				drop_spte(kvm, spte);
@@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	return write_protected;
 }
 
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
@@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 unsigned long data))
 {
-	int i, j;
+	int j;
 	int ret;
 	int retval = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		struct kvm_memory_slot *memslot = &slots->memslots[i];
+	kvm_for_each_memslot(memslot, slots) {
 		unsigned long start = memslot->userspace_addr;
 		unsigned long end;
 
@@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
@@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
 		sp->spt[i] = 0ull;
 }
 
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+	sp->write_flooding_count = 0;
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+	__clear_sp_write_flooding_count(sp);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		} else if (sp->unsync)
 			kvm_mmu_mark_parents_unsync(sp);
 
+		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);
 		return sp;
 	}
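The two helpers added above move write-flood tracking onto each shadow page: later in this diff, detect_write_flooding() bumps sp->write_flooding_count on every emulated write and reports flooding once the count reaches three, while kvm_mmu_get_page() clears the counter whenever the page is looked up again as a page table. A toy, standalone model of that policy (hypothetical struct and function names, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct kvm_mmu_page. */
struct shadow_page {
	int level;
	int write_flooding_count;
};

/* Mirrors detect_write_flooding(): level-1 pages can go unsync instead. */
static bool detect_write_flooding(struct shadow_page *sp)
{
	if (sp->level == 1)
		return false;
	return ++sp->write_flooding_count >= 3;
}

/* Mirrors __clear_sp_write_flooding_count(), run when the page is used again. */
static void clear_write_flooding(struct shadow_page *sp)
{
	sp->write_flooding_count = 0;
}

int main(void)
{
	struct shadow_page sp = { .level = 2, .write_flooding_count = 0 };

	printf("%d\n", detect_write_flooding(&sp));	/* 0: first write */
	printf("%d\n", detect_write_flooding(&sp));	/* 0: second write */
	clear_write_flooding(&sp);			/* page used as a page table again */
	printf("%d\n", detect_write_flooding(&sp));	/* 0: counter restarted */
	printf("%d\n", detect_write_flooding(&sp));	/* 0 */
	printf("%d\n", detect_write_flooding(&sp));	/* 1: third write in a row, zap it */
	return 0;
}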
@@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
-static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			     u64 *spte)
 {
 	u64 pte;
@@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
+		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
-		else {
+			if (is_large_pte(pte))
+				--kvm->stat.lpages;
+		} else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	} else if (is_mmio_spte(pte))
+		return true;
+	}
+
+	if (is_mmio_spte(pte))
 		mmu_spte_clear_no_track(spte);
 
-	if (is_large_pte(pte))
-		--kvm->stat.lpages;
+	return false;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 	mmu_page_remove_parent_pte(sp, parent_pte);
 }
 
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.last_pte_updated = NULL;
-}
-
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
@@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	}
 
 	sp->role.invalid = 1;
-	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 }
 
@@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
@@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
-
+	spin_lock(&kvm->mmu_lock);
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
@@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
+	spin_unlock(&kvm->mmu_lock);
 
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %llx %x\n",
-			 __func__, gfn, sp->role.word);
-		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-	}
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 {
@@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 
 		if (!need_unsync && !s->unsync) {
-			if (!oos_shadow)
-				return 1;
 			need_unsync = true;
 		}
 	}
@@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
 		return 0;
 
-	/*
-	 * We don't set the accessed bit, since we sometimes want to see
-	 * whether the guest actually used the pte (in order to detect
-	 * demand paging).
-	 */
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
@@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative) {
-		vcpu->arch.last_pte_updated = sptep;
-		vcpu->arch.last_pte_gfn = gfn;
-	}
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, ~0ul);
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
-		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		return;
 	}
 	for (i = 0; i < 4; ++i) {
@@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
 	kvm_mmu_flush_tlb(vcpu);
 }
 
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+				    const u8 *new, int *bytes)
 {
-	u64 *spte = vcpu->arch.last_pte_updated;
+	u64 gentry;
+	int r;
+
+	/*
+	 * Assume that the pte write on a page table of the same type
+	 * as the current vcpu paging mode since we update the sptes only
+	 * when they have the same mode.
+	 */
+	if (is_pae(vcpu) && *bytes == 4) {
+		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+		*gpa &= ~(gpa_t)7;
+		*bytes = 8;
+		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+		if (r)
+			gentry = 0;
+		new = (const u8 *)&gentry;
+	}
 
-	return !!(spte && (*spte & shadow_accessed_mask));
+	switch (*bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
+	}
+
+	return gentry;
 }
 
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
 {
-	u64 *spte = vcpu->arch.last_pte_updated;
+	/*
+	 * Skip write-flooding detected for the sp whose level is 1, because
+	 * it can become unsync, then the guest page is not write-protected.
+	 */
+	if (sp->role.level == 1)
+		return false;
 
-	if (spte
-	    && vcpu->arch.last_pte_gfn == gfn
-	    && shadow_accessed_mask
-	    && !(*spte & shadow_accessed_mask)
-	    && is_shadow_present_pte(*spte))
-		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+	return ++sp->write_flooding_count >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+				    int bytes)
+{
+	unsigned offset, pte_size, misaligned;
+
+	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+		 gpa, bytes, sp->role.word);
+
+	offset = offset_in_page(gpa);
+	pte_size = sp->role.cr4_pae ? 8 : 4;
+
+	/*
+	 * Sometimes, the OS only writes the last one bytes to update status
+	 * bits, for example, in linux, andb instruction is used in clear_bit().
+	 */
+	if (!(offset & (pte_size - 1)) && bytes == 1)
+		return false;
+
+	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+	misaligned |= bytes < 4;
+
+	return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+	unsigned page_offset, quadrant;
+	u64 *spte;
+	int level;
+
+	page_offset = offset_in_page(gpa);
+	level = sp->role.level;
+	*nspte = 1;
+	if (!sp->role.cr4_pae) {
+		page_offset <<= 1;	/* 32->64 */
+		/*
+		 * A 32-bit pde maps 4MB while the shadow pdes map
+		 * only 2MB.  So we need to double the offset again
+		 * and zap two pdes instead of one.
+		 */
+		if (level == PT32_ROOT_LEVEL) {
+			page_offset &= ~7; /* kill rounding error */
+			page_offset <<= 1;
+			*nspte = 2;
+		}
+		quadrant = page_offset >> PAGE_SHIFT;
+		page_offset &= ~PAGE_MASK;
+		if (quadrant != sp->role.quadrant)
+			return NULL;
+	}
+
+	spte = &sp->spt[page_offset / sizeof(*spte)];
+	return spte;
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes,
-		       bool guest_initiated)
+		       const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	union kvm_mmu_page_role mask = { .word = 0 };
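In detect_write_misaligned() above, offset ^ (offset + bytes - 1) has bits set wherever the addresses of the first and last written byte differ, so masking with ~(pte_size - 1) is non-zero exactly when the write straddles a pte_size boundary. A standalone check of that expression, outside any kernel context and with a hypothetical harness:

#include <stdbool.h>
#include <stdio.h>

/* Same test as detect_write_misaligned(), minus the 1-byte status-bit case. */
static bool write_is_misaligned(unsigned offset, unsigned bytes, unsigned pte_size)
{
	unsigned misaligned;

	/* Non-zero iff the first and last written byte land in different gptes. */
	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);

	/* The kernel code also treats writes shorter than 4 bytes as misaligned
	 * (after special-casing aligned 1-byte status-bit updates earlier). */
	return misaligned || bytes < 4;
}

int main(void)
{
	/* 8-byte gptes (sp->role.cr4_pae set). */
	printf("%d\n", write_is_misaligned(0x18, 8, 8));	/* 0: one whole gpte */
	printf("%d\n", write_is_misaligned(0x1c, 8, 8));	/* 1: straddles two gptes */
	printf("%d\n", write_is_misaligned(0x18, 2, 8));	/* 1: too small a write */
	return 0;
}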
@@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
-	unsigned pte_size, page_offset, misaligned, quadrant, offset;
-	int level, npte, invlpg_counter, r, flooded = 0;
+	int npte;
 	bool remote_flush, local_flush, zap_page;
 
 	/*
@@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 
 	zap_page = remote_flush = local_flush = false;
-	offset = offset_in_page(gpa);
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
 
 	/*
-	 * Assume that the pte write on a page table of the same type
-	 * as the current vcpu paging mode since we update the sptes only
-	 * when they have the same mode.
+	 * No need to care whether allocation memory is successful
+	 * or not since pte prefetch is skiped if it does not have
+	 * enough objects in the cache.
 	 */
-	if ((is_pae(vcpu) && bytes == 4) || !new) {
-		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if (is_pae(vcpu)) {
-			gpa &= ~(gpa_t)7;
-			bytes = 8;
-		}
-		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
-		if (r)
-			gentry = 0;
-		new = (const u8 *)&gentry;
-	}
-
-	switch (bytes) {
-	case 4:
-		gentry = *(const u32 *)new;
-		break;
-	case 8:
-		gentry = *(const u64 *)new;
-		break;
-	default:
-		gentry = 0;
-		break;
-	}
+	mmu_topup_memory_caches(vcpu);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
-		gentry = 0;
-	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
-	if (guest_initiated) {
-		kvm_mmu_access_page(vcpu, gfn);
-		if (gfn == vcpu->arch.last_pt_write_gfn
-		    && !last_updated_pte_accessed(vcpu)) {
-			++vcpu->arch.last_pt_write_count;
-			if (vcpu->arch.last_pt_write_count >= 3)
-				flooded = 1;
-		} else {
-			vcpu->arch.last_pt_write_gfn = gfn;
-			vcpu->arch.last_pt_write_count = 1;
-			vcpu->arch.last_pte_updated = NULL;
-		}
-	}
+	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		pte_size = sp->role.cr4_pae ? 8 : 4;
-		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-		misaligned |= bytes < 4;
-		if (misaligned || flooded) {
-			/*
-			 * Misaligned accesses are too much trouble to fix
-			 * up; also, they usually indicate a page is not used
-			 * as a page table.
-			 *
-			 * If we're seeing too many writes to a page,
-			 * it may no longer be a page table, or we may be
-			 * forking, in which case it is better to unmap the
-			 * page.
-			 */
-			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-				 gpa, bytes, sp->role.word);
+		spte = get_written_sptes(sp, gpa, &npte);
+
+		if (detect_write_misaligned(sp, gpa, bytes) ||
+		      detect_write_flooding(sp, spte)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
-		page_offset = offset;
-		level = sp->role.level;
-		npte = 1;
-		if (!sp->role.cr4_pae) {
-			page_offset <<= 1;	/* 32->64 */
-			/*
-			 * A 32-bit pde maps 4MB while the shadow pdes map
-			 * only 2MB.  So we need to double the offset again
-			 * and zap two pdes instead of one.
-			 */
-			if (level == PT32_ROOT_LEVEL) {
-				page_offset &= ~7; /* kill rounding error */
-				page_offset <<= 1;
-				npte = 2;
-			}
-			quadrant = page_offset >> PAGE_SHIFT;
-			page_offset &= ~PAGE_MASK;
-			if (quadrant != sp->role.quadrant)
-				continue;
-		}
+
+		spte = get_written_sptes(sp, gpa, &npte);
+		if (!spte)
+			continue;
+
 		local_flush = true;
-		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
-			      & mask.word))
+			      & mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (!remote_flush && need_remote_flush(entry, *spte))
 				remote_flush = true;
@@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
@@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
+static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
-	int r;
+	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		goto out;
 	}
 
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		goto out;
+	if (is_mmio_page_fault(vcpu, cr2))
+		emulation_type = 0;
 
-	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
 	switch (er) {
 	case EMULATE_DONE:
@@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
 	return alloc_mmu_pages(vcpu);
 }
@@ -3852,14 +3890,14 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
 					       struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+	kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	raw_spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx, freed_pages;
+		int idx;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		if (!kvm_freed && nr_to_scan > 0 &&
 		    kvm->arch.n_used_mmu_pages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
 							  &invalid_list);
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3944,15 +3982,15 @@ nomem:
  */
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
-	int i;
 	unsigned int nr_mmu_pages;
 	unsigned int nr_pages = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++)
-		nr_pages += slots->memslots[i].npages;
+	kvm_for_each_memslot(memslot, slots)
+		nr_pages += memslot->npages;
 
 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 	nr_mmu_pages = max(nr_mmu_pages,
@@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	return nr_mmu_pages;
 }
 
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	if (len > buffer->len)
-		return NULL;
-	return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	void *ret;
-
-	ret = pv_mmu_peek_buffer(buffer, len);
-	if (!ret)
-		return ret;
-	buffer->ptr += len;
-	buffer->len -= len;
-	buffer->processed += len;
-	return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
-			    gpa_t addr, gpa_t value)
-{
-	int bytes = 8;
-	int r;
-
-	if (!is_long_mode(vcpu) && !is_pae(vcpu))
-		bytes = 4;
-
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		return r;
-
-	if (!emulator_write_phys(vcpu, addr, &value, bytes))
-		return -EFAULT;
-
-	return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
-	return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
-			     struct kvm_pv_mmu_op_buffer *buffer)
-{
-	struct kvm_mmu_op_header *header;
-
-	header = pv_mmu_peek_buffer(buffer, sizeof *header);
-	if (!header)
-		return 0;
-	switch (header->op) {
-	case KVM_MMU_OP_WRITE_PTE: {
-		struct kvm_mmu_op_write_pte *wpte;
-
-		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
-		if (!wpte)
-			return 0;
-		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
-					wpte->pte_val);
-	}
-	case KVM_MMU_OP_FLUSH_TLB: {
-		struct kvm_mmu_op_flush_tlb *ftlb;
-
-		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
-		if (!ftlb)
-			return 0;
-		return kvm_pv_mmu_flush_tlb(vcpu);
-	}
-	case KVM_MMU_OP_RELEASE_PT: {
-		struct kvm_mmu_op_release_pt *rpt;
-
-		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
-		if (!rpt)
-			return 0;
-		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
-	}
-	default: return 0;
-	}
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
-		  gpa_t addr, unsigned long *ret)
-{
-	int r;
-	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
-	buffer->ptr = buffer->buf;
-	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
-	buffer->processed = 0;
-
-	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
-	if (r)
-		goto out;
-
-	while (buffer->len) {
-		r = kvm_pv_mmu_op_one(vcpu, buffer);
-		if (r < 0)
-			goto out;
-		if (r == 0)
-			break;
-	}
-
-	r = 1;
-out:
-	*ret = buffer->processed;
-	return r;
-}
-
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
@@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 	mmu_free_memory_caches(vcpu);
 }
 
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();