author     Paolo Bonzini <pbonzini@redhat.com>     2019-11-04 06:22:02 -0500
committer  Thomas Gleixner <tglx@linutronix.de>    2019-11-04 06:22:02 -0500
commit     b8e8c8303ff28c61046a4d0f6ea99aea609a7dc0
tree       f5e0c8bb8b968eb158c07b274d9fdc465fdf95e8
parent     731dc9df975a5da21237a18c3384f811a7a41cc6
kvm: mmu: ITLB_MULTIHIT mitigation
With some Intel processors, putting the same virtual address in the TLB as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit and cause the processor to issue a machine check resulting in a CPU lockup. Unfortunately, when EPT page tables use huge pages, it is possible for a malicious guest to cause this situation.

Add a knob to mark huge pages as non-executable. When the nx_huge_pages parameter is enabled (and we are using EPT), all huge pages are marked as NX. If the guest attempts to execute in one of those pages, the page is broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then the host is in control of TLB flushes and the problematic situation cannot happen. With nested EPT, the nested guest can again cause problems, so shadow and direct EPT are treated in the same way.

[ tglx: Fixup default to auto and massage wording a bit ]

Originally-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
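For reference, the knob added here can be controlled either at boot or at runtime. The lines below are illustrative only and assume the standard module-parameter sysfs layout; the exact paths are not spelled out in this patch:

    kvm.nx_huge_pages=force                                    (kernel command line)
    echo force > /sys/module/kvm/parameters/nx_huge_pages      (at runtime; the parameter is registered with mode 0644)

Because set_nx_huge_pages() calls kvm_mmu_zap_all_fast() on every VM when the effective value changes, toggling the parameter on a loaded host forces all guests to rebuild their shadow/EPT page tables.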
-rw-r--r--   Documentation/admin-guide/kernel-parameters.txt    19
-rw-r--r--   arch/x86/include/asm/kvm_host.h                      2
-rw-r--r--   arch/x86/kernel/cpu/bugs.c                          13
-rw-r--r--   arch/x86/kvm/mmu.c                                 141
-rw-r--r--   arch/x86/kvm/paging_tmpl.h                          29
-rw-r--r--   arch/x86/kvm/x86.c                                   9
6 files changed, 200 insertions(+), 13 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fa8f03ddff24..9d5f123cc218 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2055,6 +2055,19 @@
 			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm.nx_huge_pages=
+			[KVM] Controls the software workaround for the
+			X86_BUG_ITLB_MULTIHIT bug.
+			force	: Always deploy workaround.
+			off	: Never deploy workaround.
+			auto	: Deploy workaround based on the presence of
+				  X86_BUG_ITLB_MULTIHIT.
+
+			Default is 'auto'.
+
+			If the software workaround is enabled for the host,
+			guests need not enable it for nested guests.
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
@@ -2637,6 +2650,12 @@
 			       l1tf=off [X86]
 			       mds=off [X86]
 			       tsx_async_abort=off [X86]
+			       kvm.nx_huge_pages=off [X86]
+
+			       Exceptions:
+					      This does not have any effect on
+					      kvm.nx_huge_pages when
+					      kvm.nx_huge_pages=force.
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
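A worked example of the precedence documented above (hypothetical command lines, shown only to illustrate the interaction): booting with "mitigations=off" alone leaves kvm.nx_huge_pages in 'auto' mode, which then resolves to off because cpu_mitigations_off() is true; booting with "mitigations=off kvm.nx_huge_pages=force" keeps the workaround enabled, since 'force' is the one setting that mitigations=off does not override.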
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24d6598dea29..a37b03483b66 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,6 +315,7 @@ struct kvm_mmu_page {
 	bool unsync;
 	u8 mmu_valid_gen;
 	bool mmio_cached;
+	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -946,6 +947,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong nx_lpage_splits;
 	ulong max_mmu_page_hash_collisions;
 };
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5364beda8c61..850005590167 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1257,6 +1257,9 @@ void x86_spec_ctrl_setup_ap(void)
 		x86_amd_ssb_disable();
 }
 
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
 
@@ -1412,17 +1415,25 @@ static ssize_t l1tf_show_state(char *buf)
 		       l1tf_vmx_states[l1tf_vmx_mitigation],
 		       sched_smt_active() ? "vulnerable" : "disabled");
 }
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	if (itlb_multihit_kvm_mitigation)
+		return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+	else
+		return sprintf(buf, "KVM: Vulnerable\n");
+}
 #else
 static ssize_t l1tf_show_state(char *buf)
 {
 	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
 }
-#endif
 
 static ssize_t itlb_multihit_show_state(char *buf)
 {
 	return sprintf(buf, "Processor vulnerable\n");
 }
+#endif
 
 static ssize_t mds_show_state(char *buf)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24c23c66b226..bedf6864b092 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -47,6 +47,20 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+	.set = set_nx_huge_pages,
+	.get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -352,6 +366,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
 	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+	return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
 	MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1209,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->lpage_disallowed)
+		return;
+
+	++kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
@@ -1207,6 +1235,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	--kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = false;
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
 					  struct kvm_memory_slot *slot)
 {
@@ -2792,6 +2826,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
 		kvm_reload_remote_mmus(kvm);
 	}
 
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
 	sp->role.invalid = 1;
 	return list_unstable;
 }
@@ -3013,6 +3050,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -3233,9 +3275,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+				       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+	int level = *levelp;
+	u64 spte = *it.sptep;
+
+	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	    is_nx_huge_page_enabled() &&
+	    is_shadow_present_pte(spte) &&
+	    !is_large_pte(spte)) {
+		/*
+		 * A small SPTE exists for this pfn, but FNAME(fetch)
+		 * and __direct_map would like to create a large PTE
+		 * instead: just force them to go down another level,
+		 * patching back for them into pfn the next 9 bits of
+		 * the address.
+		 */
+		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+		*pfnp |= gfn & page_mask;
+		(*levelp)--;
+	}
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault)
+			bool prefault, bool lpage_disallowed)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
@@ -3248,6 +3313,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
 			break;
@@ -3258,6 +3329,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 					      it.level - 1, true, ACC_ALL);
 
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -3550,11 +3623,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 	int r;
 	int level;
-	bool force_pt_level = false;
+	bool force_pt_level;
 	kvm_pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
+	force_pt_level = lpage_disallowed;
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		/*
@@ -3588,7 +3664,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+			 prefault, false);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4174,6 +4251,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -4184,8 +4263,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-							   PT_DIRECTORY_LEVEL);
+	force_pt_level =
+		lpage_disallowed ||
+		!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4294,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+			 prefault, lpage_disallowed);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -6155,10 +6236,58 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
+static bool get_nx_auto_mode(void)
+{
+	/* Return true when CPU has the bug, and mitigations are ON */
+	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+	bool old_val = nx_huge_pages;
+	bool new_val;
+
+	/* In "auto" mode deploy workaround only if CPU has the bug. */
+	if (sysfs_streq(val, "off"))
+		new_val = 0;
+	else if (sysfs_streq(val, "force"))
+		new_val = 1;
+	else if (sysfs_streq(val, "auto"))
+		new_val = get_nx_auto_mode();
+	else if (strtobool(val, &new_val) < 0)
+		return -EINVAL;
+
+	__set_nx_huge_pages(new_val);
+
+	if (new_val != old_val) {
+		struct kvm *kvm;
+		int idx;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			idx = srcu_read_lock(&kvm->srcu);
+			kvm_mmu_zap_all_fast(kvm);
+			srcu_read_unlock(&kvm->srcu, idx);
+		}
+		mutex_unlock(&kvm_lock);
+	}
+
+	return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
+	if (nx_huge_pages == -1)
+		__set_nx_huge_pages(get_nx_auto_mode());
+
 	/*
 	 * MMU roles use union aliasing which is, generally speaking, an
 	 * undefined behavior. However, we supposedly know how compilers behave
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7d5cdb3af594..97b21e7fd013 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int write_fault, int hlevel,
-			 kvm_pfn_t pfn, bool map_writable, bool prefault)
+			 kvm_pfn_t pfn, bool map_writable, bool prefault,
+			 bool lpage_disallowed)
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct kvm_shadow_walk_iterator it;
 	unsigned direct_access, access = gw->pt_access;
 	int top_level, ret;
-	gfn_t base_gfn;
+	gfn_t gfn, base_gfn;
 
 	direct_access = gw->pte_access;
 
@@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(vcpu, it.sptep, sp);
 	}
 
-	base_gfn = gw->gfn;
+	/*
+	 * FNAME(page_fault) might have clobbered the bottom bits of
+	 * gw->gfn, restore them from the virtual address.
+	 */
+	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+	base_gfn = gfn;
 
 	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
 
 	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
 		clear_sp_write_flooding_count(it.sptep);
-		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == hlevel)
 			break;
 
@@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
 					      it.level - 1, true, direct_access);
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	kvm_pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
-	bool force_pt_level = false;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
+	bool force_pt_level = lpage_disallowed;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-			 level, pfn, map_writable, prefault);
+			 level, pfn, map_writable, prefault, lpage_disallowed);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32d70ca2a7fd..b087d178a774 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -213,6 +213,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages, .mode = 0444) },
+	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
 	{ "max_mmu_page_hash_collisions",
 		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
@@ -1280,6 +1281,14 @@ static u64 kvm_get_arch_capabilities(void)
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 
 	/*
+	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+	 * the nested hypervisor runs with NX huge pages.  If it is not,
+	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+	 * L1 guests, so it need not worry about its own (L2) guests.
+	 */
+	data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+	/*
 	 * If we're doing cache flushes (either "always" or "cond")
 	 * we will do one whenever the guest does a vmlaunch/vmresume.
 	 * If an outer hypervisor is doing the cache flush for us
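Once the module is loaded, the effect of the mitigation can be observed from the host without anything beyond this series (the paths below assume the usual sysfs/debugfs mount points, and the itlb_multihit vulnerability file is wired up by a separate patch in the series): the itlb_multihit entry under /sys/devices/system/cpu/vulnerabilities/ reports "KVM: Mitigation: Split huge pages" or "KVM: Vulnerable" via itlb_multihit_show_state(), and the new read-only "nx_largepages_splitted" statistic under /sys/kernel/debug/kvm/ counts how many huge pages have been split because of the NX workaround.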