author     Linus Torvalds <torvalds@linux-foundation.org>  2019-05-17 13:33:30 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-05-17 13:33:30 -0400
commit     0ef0fd351550130129bbdb77362488befd7b69d2 (patch)
tree       23186172f5f85c06e18e3ee1a9619879df03c5df /arch
parent     4489da7183099f569a7d3dd819c975073c04bc72 (diff)
parent     c011d23ba046826ccf8c4a4a6c1d01c9ccaa1403 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:

 "ARM:
   - support for SVE and Pointer Authentication in guests
   - PMU improvements

  POWER:
   - support for direct access to the POWER9 XIVE interrupt controller
   - memory and performance optimizations

  x86:
   - support for accessing memory not backed by struct page
   - fixes and refactoring

  Generic:
   - dirty page tracking improvements"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (155 commits)
  kvm: fix compilation on aarch64
  Revert "KVM: nVMX: Expose RDPMC-exiting only when guest supports PMU"
  kvm: x86: Fix L1TF mitigation for shadow MMU
  KVM: nVMX: Disable intercept for FS/GS base MSRs in vmcs02 when possible
  KVM: PPC: Book3S: Remove useless checks in 'release' method of KVM device
  KVM: PPC: Book3S HV: XIVE: Fix spelling mistake "acessing" -> "accessing"
  KVM: PPC: Book3S HV: Make sure to load LPID for radix VCPUs
  kvm: nVMX: Set nested_run_pending in vmx_set_nested_state after checks complete
  tests: kvm: Add tests for KVM_SET_NESTED_STATE
  KVM: nVMX: KVM_SET_NESTED_STATE - Tear down old EVMCS state before setting new state
  tests: kvm: Add tests for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_CPU_ID
  tests: kvm: Add tests to .gitignore
  KVM: Introduce KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
  KVM: Fix kvm_clear_dirty_log_protect off-by-(minus-)one
  KVM: Fix the bitmap range to copy during clear dirty
  KVM: arm64: Fix ptrauth ID register masking logic
  KVM: x86: use direct accessors for RIP and RSP
  KVM: VMX: Use accessors for GPRs outside of dedicated caching logic
  KVM: x86: Omit caching logic for always-available GPRs
  kvm, x86: Properly check whether a pfn is an MMIO or not
  ...
Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 26
-rw-r--r--  arch/arm64/Kconfig | 6
-rw-r--r--  arch/arm64/include/asm/fpsimd.h | 29
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 16
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 101
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 1
-rw-r--r--  arch/arm64/include/asm/kvm_ptrauth.h | 111
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 3
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h | 43
-rw-r--r--  arch/arm64/kernel/asm-offsets.c | 7
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 2
-rw-r--r--  arch/arm64/kernel/fpsimd.c | 179
-rw-r--r--  arch/arm64/kernel/perf_event.c | 50
-rw-r--r--  arch/arm64/kernel/signal.c | 5
-rw-r--r--  arch/arm64/kvm/Makefile | 2
-rw-r--r--  arch/arm64/kvm/fpsimd.c | 17
-rw-r--r--  arch/arm64/kvm/guest.c | 415
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 36
-rw-r--r--  arch/arm64/kvm/hyp/entry.S | 15
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 80
-rw-r--r--  arch/arm64/kvm/pmu.c | 239
-rw-r--r--  arch/arm64/kvm/reset.c | 167
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 183
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 25
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 11
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 41
-rw-r--r--  arch/powerpc/include/asm/xive.h | 3
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 46
-rw-r--r--  arch/powerpc/kvm/Makefile | 2
-rw-r--r--  arch/powerpc/kvm/book3s.c | 42
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 96
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c | 105
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 152
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 57
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 144
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 86
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c | 250
-rw-r--r--  arch/powerpc/kvm/book3s_xive.h | 37
-rw-r--r--  arch/powerpc/kvm/book3s_xive_native.c | 1249
-rw-r--r--  arch/powerpc/kvm/book3s_xive_template.c | 78
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 40
-rw-r--r--  arch/powerpc/sysdev/xive/native.c | 11
-rw-r--r--  arch/s390/include/asm/cpacf.h | 1
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 2
-rw-r--r--  arch/s390/include/uapi/asm/kvm.h | 5
-rw-r--r--  arch/s390/kvm/Kconfig | 1
-rw-r--r--  arch/s390/kvm/interrupt.c | 11
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 120
-rw-r--r--  arch/s390/kvm/vsie.c | 13
-rw-r--r--  arch/s390/tools/gen_facilities.c | 3
-rw-r--r--  arch/x86/events/intel/core.c | 6
-rw-r--r--  arch/x86/include/asm/e820/api.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 7
-rw-r--r--  arch/x86/include/asm/msr-index.h | 8
-rw-r--r--  arch/x86/kernel/e820.c | 18
-rw-r--r--  arch/x86/kvm/cpuid.c | 12
-rw-r--r--  arch/x86/kvm/hyperv.c | 24
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 42
-rw-r--r--  arch/x86/kvm/lapic.c | 38
-rw-r--r--  arch/x86/kvm/mmu.c | 23
-rw-r--r--  arch/x86/kvm/mtrr.c | 10
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 38
-rw-r--r--  arch/x86/kvm/svm.c | 128
-rw-r--r--  arch/x86/kvm/vmx/capabilities.h | 2
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 348
-rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c | 8
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 90
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 11
-rw-r--r--  arch/x86/kvm/x86.c | 199
-rw-r--r--  arch/x86/kvm/x86.h | 10
72 files changed, 4481 insertions(+), 911 deletions(-)
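
A note on the "dirty page tracking improvements" in the pull message above: the new KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 capability splits reading the dirty bitmap (KVM_GET_DIRTY_LOG) from re-protecting the pages (the new KVM_CLEAR_DIRTY_LOG ioctl), so a migration loop can clear dirty state in smaller chunks. The fragment below is only an illustrative userspace sketch, not code from this merge; the kvm_enable_cap/kvm_clear_dirty_log usage reflects my reading of the uapi and should be checked against the KVM API documentation.

    /* Illustrative sketch: opt in to manual dirty-log clearing on a VM fd. */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int enable_manual_dirty_clear(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
                    .args = { 1 },  /* assumed: args[0] = 1 enables manual clearing */
            };

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

    static int clear_dirty_range(int vm_fd, __u32 slot, __u64 first_page,
                                 __u32 num_pages, void *bitmap)
    {
            /* Pages whose bits are set in @bitmap are write-protected again. */
            struct kvm_clear_dirty_log clr = {
                    .slot = slot,
                    .first_page = first_page,
                    .num_pages = num_pages,
                    .dirty_bitmap = bitmap,
            };

            return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clr);
    }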
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 8927cae7c966..efb0e2c0d84c 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -343,4 +343,6 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
343 } 343 }
344} 344}
345 345
346static inline void vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu) {}
347
346#endif /* __ARM_KVM_EMULATE_H__ */ 348#endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 770d73257ad9..075e1921fdd9 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -19,6 +19,7 @@
19#ifndef __ARM_KVM_HOST_H__ 19#ifndef __ARM_KVM_HOST_H__
20#define __ARM_KVM_HOST_H__ 20#define __ARM_KVM_HOST_H__
21 21
22#include <linux/errno.h>
22#include <linux/types.h> 23#include <linux/types.h>
23#include <linux/kvm_types.h> 24#include <linux/kvm_types.h>
24#include <asm/cputype.h> 25#include <asm/cputype.h>
@@ -53,6 +54,8 @@
53 54
54DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); 55DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
55 56
57static inline int kvm_arm_init_sve(void) { return 0; }
58
56u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); 59u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
57int __attribute_const__ kvm_target_cpu(void); 60int __attribute_const__ kvm_target_cpu(void);
58int kvm_reset_vcpu(struct kvm_vcpu *vcpu); 61int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -150,9 +153,13 @@ struct kvm_cpu_context {
150 u32 cp15[NR_CP15_REGS]; 153 u32 cp15[NR_CP15_REGS];
151}; 154};
152 155
153typedef struct kvm_cpu_context kvm_cpu_context_t; 156struct kvm_host_data {
157 struct kvm_cpu_context host_ctxt;
158};
159
160typedef struct kvm_host_data kvm_host_data_t;
154 161
155static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, 162static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt,
156 int cpu) 163 int cpu)
157{ 164{
158 /* The host's MPIDR is immutable, so let's set it up at boot time */ 165 /* The host's MPIDR is immutable, so let's set it up at boot time */
@@ -182,7 +189,7 @@ struct kvm_vcpu_arch {
182 struct kvm_vcpu_fault_info fault; 189 struct kvm_vcpu_fault_info fault;
183 190
184 /* Host FP context */ 191 /* Host FP context */
185 kvm_cpu_context_t *host_cpu_context; 192 struct kvm_cpu_context *host_cpu_context;
186 193
187 /* VGIC state */ 194 /* VGIC state */
188 struct vgic_cpu vgic_cpu; 195 struct vgic_cpu vgic_cpu;
@@ -361,6 +368,9 @@ static inline void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) {}
361static inline void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) {} 368static inline void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) {}
362static inline void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) {} 369static inline void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) {}
363 370
371static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {}
372static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {}
373
364static inline void kvm_arm_vhe_guest_enter(void) {} 374static inline void kvm_arm_vhe_guest_enter(void) {}
365static inline void kvm_arm_vhe_guest_exit(void) {} 375static inline void kvm_arm_vhe_guest_exit(void) {}
366 376
@@ -409,4 +419,14 @@ static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
409 return 0; 419 return 0;
410} 420}
411 421
422static inline int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature)
423{
424 return -EINVAL;
425}
426
427static inline bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
428{
429 return true;
430}
431
412#endif /* __ARM_KVM_HOST_H__ */ 432#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 69a59a5d1143..4780eb7af842 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1341,6 +1341,7 @@ menu "ARMv8.3 architectural features"
1341config ARM64_PTR_AUTH 1341config ARM64_PTR_AUTH
1342 bool "Enable support for pointer authentication" 1342 bool "Enable support for pointer authentication"
1343 default y 1343 default y
1344 depends on !KVM || ARM64_VHE
1344 help 1345 help
1345 Pointer authentication (part of the ARMv8.3 Extensions) provides 1346 Pointer authentication (part of the ARMv8.3 Extensions) provides
1346 instructions for signing and authenticating pointers against secret 1347 instructions for signing and authenticating pointers against secret
@@ -1354,8 +1355,9 @@ config ARM64_PTR_AUTH
1354 context-switched along with the process. 1355 context-switched along with the process.
1355 1356
1356 The feature is detected at runtime. If the feature is not present in 1357 The feature is detected at runtime. If the feature is not present in
1357 hardware it will not be advertised to userspace nor will it be 1358 hardware it will not be advertised to userspace/KVM guest nor will it
1358 enabled. 1359 be enabled. However, KVM guest also require VHE mode and hence
1360 CONFIG_ARM64_VHE=y option to use this feature.
1359 1361
1360endmenu 1362endmenu
1361 1363
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index dd1ad3950ef5..df62bbd33a9a 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -24,10 +24,13 @@
24 24
25#ifndef __ASSEMBLY__ 25#ifndef __ASSEMBLY__
26 26
27#include <linux/bitmap.h>
27#include <linux/build_bug.h> 28#include <linux/build_bug.h>
29#include <linux/bug.h>
28#include <linux/cache.h> 30#include <linux/cache.h>
29#include <linux/init.h> 31#include <linux/init.h>
30#include <linux/stddef.h> 32#include <linux/stddef.h>
33#include <linux/types.h>
31 34
32#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 35#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
33/* Masks for extracting the FPSR and FPCR from the FPSCR */ 36/* Masks for extracting the FPSR and FPCR from the FPSCR */
@@ -56,7 +59,8 @@ extern void fpsimd_restore_current_state(void);
56extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); 59extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);
57 60
58extern void fpsimd_bind_task_to_cpu(void); 61extern void fpsimd_bind_task_to_cpu(void);
59extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state); 62extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
63 void *sve_state, unsigned int sve_vl);
60 64
61extern void fpsimd_flush_task_state(struct task_struct *target); 65extern void fpsimd_flush_task_state(struct task_struct *target);
62extern void fpsimd_flush_cpu_state(void); 66extern void fpsimd_flush_cpu_state(void);
@@ -87,6 +91,29 @@ extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
87extern u64 read_zcr_features(void); 91extern u64 read_zcr_features(void);
88 92
89extern int __ro_after_init sve_max_vl; 93extern int __ro_after_init sve_max_vl;
94extern int __ro_after_init sve_max_virtualisable_vl;
95extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
96
97/*
98 * Helpers to translate bit indices in sve_vq_map to VQ values (and
99 * vice versa). This allows find_next_bit() to be used to find the
100 * _maximum_ VQ not exceeding a certain value.
101 */
102static inline unsigned int __vq_to_bit(unsigned int vq)
103{
104 return SVE_VQ_MAX - vq;
105}
106
107static inline unsigned int __bit_to_vq(unsigned int bit)
108{
109 return SVE_VQ_MAX - bit;
110}
111
112/* Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function */
113static inline bool sve_vq_available(unsigned int vq)
114{
115 return test_bit(__vq_to_bit(vq), sve_vq_map);
116}
90 117
91#ifdef CONFIG_ARM64_SVE 118#ifdef CONFIG_ARM64_SVE
92 119
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index f5b79e995f40..ff73f5462aca 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -108,7 +108,8 @@ extern u32 __kvm_get_mdcr_el2(void);
108.endm 108.endm
109 109
110.macro get_host_ctxt reg, tmp 110.macro get_host_ctxt reg, tmp
111 hyp_adr_this_cpu \reg, kvm_host_cpu_state, \tmp 111 hyp_adr_this_cpu \reg, kvm_host_data, \tmp
112 add \reg, \reg, #HOST_DATA_CONTEXT
112.endm 113.endm
113 114
114.macro get_vcpu_ptr vcpu, ctxt 115.macro get_vcpu_ptr vcpu, ctxt
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index d3842791e1c4..613427fafff9 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -98,6 +98,22 @@ static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu)
98 vcpu->arch.hcr_el2 |= HCR_TWE; 98 vcpu->arch.hcr_el2 |= HCR_TWE;
99} 99}
100 100
101static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu)
102{
103 vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
104}
105
106static inline void vcpu_ptrauth_disable(struct kvm_vcpu *vcpu)
107{
108 vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
109}
110
111static inline void vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu)
112{
113 if (vcpu_has_ptrauth(vcpu))
114 vcpu_ptrauth_disable(vcpu);
115}
116
101static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu) 117static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu)
102{ 118{
103 return vcpu->arch.vsesr_el2; 119 return vcpu->arch.vsesr_el2;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a01fe087e022..2a8d3f8ca22c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -22,9 +22,13 @@
22#ifndef __ARM64_KVM_HOST_H__ 22#ifndef __ARM64_KVM_HOST_H__
23#define __ARM64_KVM_HOST_H__ 23#define __ARM64_KVM_HOST_H__
24 24
25#include <linux/bitmap.h>
25#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/jump_label.h>
26#include <linux/kvm_types.h> 28#include <linux/kvm_types.h>
29#include <linux/percpu.h>
27#include <asm/arch_gicv3.h> 30#include <asm/arch_gicv3.h>
31#include <asm/barrier.h>
28#include <asm/cpufeature.h> 32#include <asm/cpufeature.h>
29#include <asm/daifflags.h> 33#include <asm/daifflags.h>
30#include <asm/fpsimd.h> 34#include <asm/fpsimd.h>
@@ -45,7 +49,7 @@
45 49
46#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS 50#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
47 51
48#define KVM_VCPU_MAX_FEATURES 4 52#define KVM_VCPU_MAX_FEATURES 7
49 53
50#define KVM_REQ_SLEEP \ 54#define KVM_REQ_SLEEP \
51 KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 55 KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
@@ -54,8 +58,12 @@
54 58
55DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); 59DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
56 60
61extern unsigned int kvm_sve_max_vl;
62int kvm_arm_init_sve(void);
63
57int __attribute_const__ kvm_target_cpu(void); 64int __attribute_const__ kvm_target_cpu(void);
58int kvm_reset_vcpu(struct kvm_vcpu *vcpu); 65int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
66void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
59int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); 67int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
60void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); 68void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
61 69
@@ -117,6 +125,7 @@ enum vcpu_sysreg {
117 SCTLR_EL1, /* System Control Register */ 125 SCTLR_EL1, /* System Control Register */
118 ACTLR_EL1, /* Auxiliary Control Register */ 126 ACTLR_EL1, /* Auxiliary Control Register */
119 CPACR_EL1, /* Coprocessor Access Control */ 127 CPACR_EL1, /* Coprocessor Access Control */
128 ZCR_EL1, /* SVE Control */
120 TTBR0_EL1, /* Translation Table Base Register 0 */ 129 TTBR0_EL1, /* Translation Table Base Register 0 */
121 TTBR1_EL1, /* Translation Table Base Register 1 */ 130 TTBR1_EL1, /* Translation Table Base Register 1 */
122 TCR_EL1, /* Translation Control Register */ 131 TCR_EL1, /* Translation Control Register */
@@ -152,6 +161,18 @@ enum vcpu_sysreg {
152 PMSWINC_EL0, /* Software Increment Register */ 161 PMSWINC_EL0, /* Software Increment Register */
153 PMUSERENR_EL0, /* User Enable Register */ 162 PMUSERENR_EL0, /* User Enable Register */
154 163
164 /* Pointer Authentication Registers in a strict increasing order. */
165 APIAKEYLO_EL1,
166 APIAKEYHI_EL1,
167 APIBKEYLO_EL1,
168 APIBKEYHI_EL1,
169 APDAKEYLO_EL1,
170 APDAKEYHI_EL1,
171 APDBKEYLO_EL1,
172 APDBKEYHI_EL1,
173 APGAKEYLO_EL1,
174 APGAKEYHI_EL1,
175
155 /* 32bit specific registers. Keep them at the end of the range */ 176 /* 32bit specific registers. Keep them at the end of the range */
156 DACR32_EL2, /* Domain Access Control Register */ 177 DACR32_EL2, /* Domain Access Control Register */
157 IFSR32_EL2, /* Instruction Fault Status Register */ 178 IFSR32_EL2, /* Instruction Fault Status Register */
@@ -212,7 +233,17 @@ struct kvm_cpu_context {
212 struct kvm_vcpu *__hyp_running_vcpu; 233 struct kvm_vcpu *__hyp_running_vcpu;
213}; 234};
214 235
215typedef struct kvm_cpu_context kvm_cpu_context_t; 236struct kvm_pmu_events {
237 u32 events_host;
238 u32 events_guest;
239};
240
241struct kvm_host_data {
242 struct kvm_cpu_context host_ctxt;
243 struct kvm_pmu_events pmu_events;
244};
245
246typedef struct kvm_host_data kvm_host_data_t;
216 247
217struct vcpu_reset_state { 248struct vcpu_reset_state {
218 unsigned long pc; 249 unsigned long pc;
@@ -223,6 +254,8 @@ struct vcpu_reset_state {
223 254
224struct kvm_vcpu_arch { 255struct kvm_vcpu_arch {
225 struct kvm_cpu_context ctxt; 256 struct kvm_cpu_context ctxt;
257 void *sve_state;
258 unsigned int sve_max_vl;
226 259
227 /* HYP configuration */ 260 /* HYP configuration */
228 u64 hcr_el2; 261 u64 hcr_el2;
@@ -255,7 +288,7 @@ struct kvm_vcpu_arch {
255 struct kvm_guest_debug_arch external_debug_state; 288 struct kvm_guest_debug_arch external_debug_state;
256 289
257 /* Pointer to host CPU context */ 290 /* Pointer to host CPU context */
258 kvm_cpu_context_t *host_cpu_context; 291 struct kvm_cpu_context *host_cpu_context;
259 292
260 struct thread_info *host_thread_info; /* hyp VA */ 293 struct thread_info *host_thread_info; /* hyp VA */
261 struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */ 294 struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */
@@ -318,12 +351,40 @@ struct kvm_vcpu_arch {
318 bool sysregs_loaded_on_cpu; 351 bool sysregs_loaded_on_cpu;
319}; 352};
320 353
354/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
355#define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) + \
356 sve_ffr_offset((vcpu)->arch.sve_max_vl)))
357
358#define vcpu_sve_state_size(vcpu) ({ \
359 size_t __size_ret; \
360 unsigned int __vcpu_vq; \
361 \
362 if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \
363 __size_ret = 0; \
364 } else { \
365 __vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl); \
366 __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \
367 } \
368 \
369 __size_ret; \
370})
371
321/* vcpu_arch flags field values: */ 372/* vcpu_arch flags field values: */
322#define KVM_ARM64_DEBUG_DIRTY (1 << 0) 373#define KVM_ARM64_DEBUG_DIRTY (1 << 0)
323#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ 374#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */
324#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ 375#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */
325#define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ 376#define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */
326#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ 377#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */
378#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */
379#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */
380#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */
381
382#define vcpu_has_sve(vcpu) (system_supports_sve() && \
383 ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
384
385#define vcpu_has_ptrauth(vcpu) ((system_supports_address_auth() || \
386 system_supports_generic_auth()) && \
387 ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH))
327 388
328#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs) 389#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs)
329 390
@@ -432,9 +493,9 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
432 493
433struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); 494struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
434 495
435DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); 496DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
436 497
437static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt, 498static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt,
438 int cpu) 499 int cpu)
439{ 500{
440 /* The host's MPIDR is immutable, so let's set it up at boot time */ 501 /* The host's MPIDR is immutable, so let's set it up at boot time */
@@ -452,8 +513,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
452 * kernel's mapping to the linear mapping, and store it in tpidr_el2 513 * kernel's mapping to the linear mapping, and store it in tpidr_el2
453 * so that we can use adr_l to access per-cpu variables in EL2. 514 * so that we can use adr_l to access per-cpu variables in EL2.
454 */ 515 */
455 u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_cpu_state) - 516 u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) -
456 (u64)kvm_ksym_ref(kvm_host_cpu_state)); 517 (u64)kvm_ksym_ref(kvm_host_data));
457 518
458 /* 519 /*
459 * Call initialization code, and switch to the full blown HYP code. 520 * Call initialization code, and switch to the full blown HYP code.
@@ -491,9 +552,10 @@ static inline bool kvm_arch_requires_vhe(void)
491 return false; 552 return false;
492} 553}
493 554
555void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
556
494static inline void kvm_arch_hardware_unsetup(void) {} 557static inline void kvm_arch_hardware_unsetup(void) {}
495static inline void kvm_arch_sync_events(struct kvm *kvm) {} 558static inline void kvm_arch_sync_events(struct kvm *kvm) {}
496static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
497static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 559static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
498static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} 560static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
499 561
@@ -516,11 +578,28 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
516void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); 578void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
517void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); 579void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu);
518 580
581static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
582{
583 return (!has_vhe() && attr->exclude_host);
584}
585
519#ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */ 586#ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
520static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) 587static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
521{ 588{
522 return kvm_arch_vcpu_run_map_fp(vcpu); 589 return kvm_arch_vcpu_run_map_fp(vcpu);
523} 590}
591
592void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr);
593void kvm_clr_pmu_events(u32 clr);
594
595void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt);
596bool __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt);
597
598void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
599void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
600#else
601static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
602static inline void kvm_clr_pmu_events(u32 clr) {}
524#endif 603#endif
525 604
526static inline void kvm_arm_vhe_guest_enter(void) 605static inline void kvm_arm_vhe_guest_enter(void)
@@ -594,4 +673,10 @@ void kvm_arch_free_vm(struct kvm *kvm);
594 673
595int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); 674int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
596 675
676int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
677bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
678
679#define kvm_arm_vcpu_sve_finalized(vcpu) \
680 ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
681
597#endif /* __ARM64_KVM_HOST_H__ */ 682#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index c3060833b7a5..09fe8bd15f6e 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -149,7 +149,6 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu);
149 149
150void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); 150void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
151void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); 151void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
152bool __fpsimd_enabled(void);
153 152
154void activate_traps_vhe_load(struct kvm_vcpu *vcpu); 153void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
155void deactivate_traps_vhe_put(void); 154void deactivate_traps_vhe_put(void);
diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h
new file mode 100644
index 000000000000..6301813dcace
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_ptrauth.h
@@ -0,0 +1,111 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/* arch/arm64/include/asm/kvm_ptrauth.h: Guest/host ptrauth save/restore
3 * Copyright 2019 Arm Limited
4 * Authors: Mark Rutland <mark.rutland@arm.com>
5 * Amit Daniel Kachhap <amit.kachhap@arm.com>
6 */
7
8#ifndef __ASM_KVM_PTRAUTH_H
9#define __ASM_KVM_PTRAUTH_H
10
11#ifdef __ASSEMBLY__
12
13#include <asm/sysreg.h>
14
15#ifdef CONFIG_ARM64_PTR_AUTH
16
17#define PTRAUTH_REG_OFFSET(x) (x - CPU_APIAKEYLO_EL1)
18
19/*
20 * CPU_AP*_EL1 values exceed immediate offset range (512) for stp
21 * instruction so below macros takes CPU_APIAKEYLO_EL1 as base and
22 * calculates the offset of the keys from this base to avoid an extra add
23 * instruction. These macros assumes the keys offsets follow the order of
24 * the sysreg enum in kvm_host.h.
25 */
26.macro ptrauth_save_state base, reg1, reg2
27 mrs_s \reg1, SYS_APIAKEYLO_EL1
28 mrs_s \reg2, SYS_APIAKEYHI_EL1
29 stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIAKEYLO_EL1)]
30 mrs_s \reg1, SYS_APIBKEYLO_EL1
31 mrs_s \reg2, SYS_APIBKEYHI_EL1
32 stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIBKEYLO_EL1)]
33 mrs_s \reg1, SYS_APDAKEYLO_EL1
34 mrs_s \reg2, SYS_APDAKEYHI_EL1
35 stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDAKEYLO_EL1)]
36 mrs_s \reg1, SYS_APDBKEYLO_EL1
37 mrs_s \reg2, SYS_APDBKEYHI_EL1
38 stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDBKEYLO_EL1)]
39 mrs_s \reg1, SYS_APGAKEYLO_EL1
40 mrs_s \reg2, SYS_APGAKEYHI_EL1
41 stp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APGAKEYLO_EL1)]
42.endm
43
44.macro ptrauth_restore_state base, reg1, reg2
45 ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIAKEYLO_EL1)]
46 msr_s SYS_APIAKEYLO_EL1, \reg1
47 msr_s SYS_APIAKEYHI_EL1, \reg2
48 ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APIBKEYLO_EL1)]
49 msr_s SYS_APIBKEYLO_EL1, \reg1
50 msr_s SYS_APIBKEYHI_EL1, \reg2
51 ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDAKEYLO_EL1)]
52 msr_s SYS_APDAKEYLO_EL1, \reg1
53 msr_s SYS_APDAKEYHI_EL1, \reg2
54 ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APDBKEYLO_EL1)]
55 msr_s SYS_APDBKEYLO_EL1, \reg1
56 msr_s SYS_APDBKEYHI_EL1, \reg2
57 ldp \reg1, \reg2, [\base, #PTRAUTH_REG_OFFSET(CPU_APGAKEYLO_EL1)]
58 msr_s SYS_APGAKEYLO_EL1, \reg1
59 msr_s SYS_APGAKEYHI_EL1, \reg2
60.endm
61
62/*
63 * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
64 * check for the presence of one of the cpufeature flag
65 * ARM64_HAS_ADDRESS_AUTH_ARCH or ARM64_HAS_ADDRESS_AUTH_IMP_DEF and
66 * then proceed ahead with the save/restore of Pointer Authentication
67 * key registers.
68 */
69.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
70alternative_if ARM64_HAS_ADDRESS_AUTH_ARCH
71 b 1000f
72alternative_else_nop_endif
73alternative_if_not ARM64_HAS_ADDRESS_AUTH_IMP_DEF
74 b 1001f
75alternative_else_nop_endif
761000:
77 ldr \reg1, [\g_ctxt, #(VCPU_HCR_EL2 - VCPU_CONTEXT)]
78 and \reg1, \reg1, #(HCR_API | HCR_APK)
79 cbz \reg1, 1001f
80 add \reg1, \g_ctxt, #CPU_APIAKEYLO_EL1
81 ptrauth_restore_state \reg1, \reg2, \reg3
821001:
83.endm
84
85.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
86alternative_if ARM64_HAS_ADDRESS_AUTH_ARCH
87 b 2000f
88alternative_else_nop_endif
89alternative_if_not ARM64_HAS_ADDRESS_AUTH_IMP_DEF
90 b 2001f
91alternative_else_nop_endif
922000:
93 ldr \reg1, [\g_ctxt, #(VCPU_HCR_EL2 - VCPU_CONTEXT)]
94 and \reg1, \reg1, #(HCR_API | HCR_APK)
95 cbz \reg1, 2001f
96 add \reg1, \g_ctxt, #CPU_APIAKEYLO_EL1
97 ptrauth_save_state \reg1, \reg2, \reg3
98 add \reg1, \h_ctxt, #CPU_APIAKEYLO_EL1
99 ptrauth_restore_state \reg1, \reg2, \reg3
100 isb
1012001:
102.endm
103
104#else /* !CONFIG_ARM64_PTR_AUTH */
105.macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
106.endm
107.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
108.endm
109#endif /* CONFIG_ARM64_PTR_AUTH */
110#endif /* __ASSEMBLY__ */
111#endif /* __ASM_KVM_PTRAUTH_H */
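
For reference, the arithmetic behind the PTRAUTH_REG_OFFSET() comment above: with the keys laid out back to back in the sys_regs[] enum (each key a LO/HI pair of 8-byte entries), the offsets relative to CPU_APIAKEYLO_EL1 stay small, whereas the absolute offsets of these entries inside struct kvm_cpu_context can exceed what stp/ldp immediates encode. A rough tally of my own, assuming the enum order shown earlier in this diff:

    /*
     * Offsets used by ptrauth_save_state/ptrauth_restore_state, taking
     * CPU_APIAKEYLO_EL1 as the base:
     *   APIAKEY{LO,HI}  +0x00
     *   APIBKEY{LO,HI}  +0x10
     *   APDAKEY{LO,HI}  +0x20
     *   APDBKEY{LO,HI}  +0x30
     *   APGAKEY{LO,HI}  +0x40
     * All fall well inside the signed, 8-byte-scaled 7-bit immediate range
     * of stp/ldp (-512..504 bytes), which is the "512" limit the comment
     * above refers to.
     */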
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 3f7b917e8f3a..902d75b60914 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -454,6 +454,9 @@
454#define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6) 454#define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6)
455#define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7) 455#define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7)
456 456
457/* VHE encodings for architectural EL0/1 system registers */
458#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0)
459
457/* Common SCTLR_ELx flags. */ 460/* Common SCTLR_ELx flags. */
458#define SCTLR_ELx_DSSBS (_BITUL(44)) 461#define SCTLR_ELx_DSSBS (_BITUL(44))
459#define SCTLR_ELx_ENIA (_BITUL(31)) 462#define SCTLR_ELx_ENIA (_BITUL(31))
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 97c3478ee6e7..7b7ac0f6cec9 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -35,6 +35,7 @@
35#include <linux/psci.h> 35#include <linux/psci.h>
36#include <linux/types.h> 36#include <linux/types.h>
37#include <asm/ptrace.h> 37#include <asm/ptrace.h>
38#include <asm/sve_context.h>
38 39
39#define __KVM_HAVE_GUEST_DEBUG 40#define __KVM_HAVE_GUEST_DEBUG
40#define __KVM_HAVE_IRQ_LINE 41#define __KVM_HAVE_IRQ_LINE
@@ -102,6 +103,9 @@ struct kvm_regs {
102#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ 103#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
103#define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */ 104#define KVM_ARM_VCPU_PSCI_0_2 2 /* CPU uses PSCI v0.2 */
104#define KVM_ARM_VCPU_PMU_V3 3 /* Support guest PMUv3 */ 105#define KVM_ARM_VCPU_PMU_V3 3 /* Support guest PMUv3 */
106#define KVM_ARM_VCPU_SVE 4 /* enable SVE for this CPU */
107#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */
108#define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */
105 109
106struct kvm_vcpu_init { 110struct kvm_vcpu_init {
107 __u32 target; 111 __u32 target;
@@ -226,6 +230,45 @@ struct kvm_vcpu_events {
226 KVM_REG_ARM_FW | ((r) & 0xffff)) 230 KVM_REG_ARM_FW | ((r) & 0xffff))
227#define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) 231#define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0)
228 232
233/* SVE registers */
234#define KVM_REG_ARM64_SVE (0x15 << KVM_REG_ARM_COPROC_SHIFT)
235
236/* Z- and P-regs occupy blocks at the following offsets within this range: */
237#define KVM_REG_ARM64_SVE_ZREG_BASE 0
238#define KVM_REG_ARM64_SVE_PREG_BASE 0x400
239#define KVM_REG_ARM64_SVE_FFR_BASE 0x600
240
241#define KVM_ARM64_SVE_NUM_ZREGS __SVE_NUM_ZREGS
242#define KVM_ARM64_SVE_NUM_PREGS __SVE_NUM_PREGS
243
244#define KVM_ARM64_SVE_MAX_SLICES 32
245
246#define KVM_REG_ARM64_SVE_ZREG(n, i) \
247 (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_ZREG_BASE | \
248 KVM_REG_SIZE_U2048 | \
249 (((n) & (KVM_ARM64_SVE_NUM_ZREGS - 1)) << 5) | \
250 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
251
252#define KVM_REG_ARM64_SVE_PREG(n, i) \
253 (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_PREG_BASE | \
254 KVM_REG_SIZE_U256 | \
255 (((n) & (KVM_ARM64_SVE_NUM_PREGS - 1)) << 5) | \
256 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
257
258#define KVM_REG_ARM64_SVE_FFR(i) \
259 (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | KVM_REG_ARM64_SVE_FFR_BASE | \
260 KVM_REG_SIZE_U256 | \
261 ((i) & (KVM_ARM64_SVE_MAX_SLICES - 1)))
262
263#define KVM_ARM64_SVE_VQ_MIN __SVE_VQ_MIN
264#define KVM_ARM64_SVE_VQ_MAX __SVE_VQ_MAX
265
266/* Vector lengths pseudo-register: */
267#define KVM_REG_ARM64_SVE_VLS (KVM_REG_ARM64 | KVM_REG_ARM64_SVE | \
268 KVM_REG_SIZE_U512 | 0xffff)
269#define KVM_ARM64_SVE_VLS_WORDS \
270 ((KVM_ARM64_SVE_VQ_MAX - KVM_ARM64_SVE_VQ_MIN) / 64 + 1)
271
229/* Device Control API: ARM VGIC */ 272/* Device Control API: ARM VGIC */
230#define KVM_DEV_ARM_VGIC_GRP_ADDR 0 273#define KVM_DEV_ARM_VGIC_GRP_ADDR 0
231#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 274#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
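
The new KVM_ARM_VCPU_SVE feature bit and SVE register IDs above are consumed from userspace roughly as follows: request the feature at KVM_ARM_VCPU_INIT time, optionally constrain the vector-length set through the KVM_REG_ARM64_SVE_VLS pseudo-register, then finalize with KVM_ARM_VCPU_FINALIZE before touching the Z/P/FFR registers. The sketch below is illustrative only; it is not part of this merge, error handling is minimal, and the exact ordering rules live in the KVM API documentation.

    /* Illustrative sketch: bring up an SVE-capable vcpu. */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>
    #include <string.h>

    static int init_sve_vcpu(int vcpu_fd, __u32 target)
    {
            struct kvm_vcpu_init init;
            __u64 vls[KVM_ARM64_SVE_VLS_WORDS];
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_ARM64_SVE_VLS,
                    .addr = (__u64)(unsigned long)vls,
            };
            int feature = KVM_ARM_VCPU_SVE;
            int ret;

            memset(&init, 0, sizeof(init));
            init.target = target;
            init.features[0] = 1U << KVM_ARM_VCPU_SVE;      /* request SVE */

            ret = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
            if (ret)
                    return ret;

            /*
             * Read the host-supported vector lengths; a VMM could clear bits
             * here to restrict the guest before writing the set back.  Bit
             * (vq - KVM_ARM64_SVE_VQ_MIN) selects vector quantum vq.
             */
            ret = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
            if (ret)
                    return ret;
            ret = ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
            if (ret)
                    return ret;

            /* After this, KVM_REG_ARM64_SVE_ZREG(n, 0) etc. become accessible. */
            return ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE, &feature);
    }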
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index e10e2a5d9ddc..947e39896e28 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -125,9 +125,16 @@ int main(void)
125 DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); 125 DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt));
126 DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); 126 DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1));
127 DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags)); 127 DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags));
128 DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
128 DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); 129 DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs));
130 DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
131 DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1]));
132 DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1]));
133 DEFINE(CPU_APDBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDBKEYLO_EL1]));
134 DEFINE(CPU_APGAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APGAKEYLO_EL1]));
129 DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); 135 DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs));
130 DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu)); 136 DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
137 DEFINE(HOST_DATA_CONTEXT, offsetof(struct kvm_host_data, host_ctxt));
131#endif 138#endif
132#ifdef CONFIG_CPU_PM 139#ifdef CONFIG_CPU_PM
133 DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); 140 DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 2b807f129e60..ca27e08e3d8a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1913,7 +1913,7 @@ static void verify_sve_features(void)
1913 unsigned int len = zcr & ZCR_ELx_LEN_MASK; 1913 unsigned int len = zcr & ZCR_ELx_LEN_MASK;
1914 1914
1915 if (len < safe_len || sve_verify_vq_map()) { 1915 if (len < safe_len || sve_verify_vq_map()) {
1916 pr_crit("CPU%d: SVE: required vector length(s) missing\n", 1916 pr_crit("CPU%d: SVE: vector length support mismatch\n",
1917 smp_processor_id()); 1917 smp_processor_id());
1918 cpu_die_early(); 1918 cpu_die_early();
1919 } 1919 }
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 735cf1f8b109..a38bf74bcca8 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/bitmap.h> 20#include <linux/bitmap.h>
21#include <linux/bitops.h>
21#include <linux/bottom_half.h> 22#include <linux/bottom_half.h>
22#include <linux/bug.h> 23#include <linux/bug.h>
23#include <linux/cache.h> 24#include <linux/cache.h>
@@ -48,6 +49,7 @@
48#include <asm/sigcontext.h> 49#include <asm/sigcontext.h>
49#include <asm/sysreg.h> 50#include <asm/sysreg.h>
50#include <asm/traps.h> 51#include <asm/traps.h>
52#include <asm/virt.h>
51 53
52#define FPEXC_IOF (1 << 0) 54#define FPEXC_IOF (1 << 0)
53#define FPEXC_DZF (1 << 1) 55#define FPEXC_DZF (1 << 1)
@@ -119,6 +121,8 @@
119 */ 121 */
120struct fpsimd_last_state_struct { 122struct fpsimd_last_state_struct {
121 struct user_fpsimd_state *st; 123 struct user_fpsimd_state *st;
124 void *sve_state;
125 unsigned int sve_vl;
122}; 126};
123 127
124static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state); 128static DEFINE_PER_CPU(struct fpsimd_last_state_struct, fpsimd_last_state);
@@ -130,14 +134,23 @@ static int sve_default_vl = -1;
130 134
131/* Maximum supported vector length across all CPUs (initially poisoned) */ 135/* Maximum supported vector length across all CPUs (initially poisoned) */
132int __ro_after_init sve_max_vl = SVE_VL_MIN; 136int __ro_after_init sve_max_vl = SVE_VL_MIN;
133/* Set of available vector lengths, as vq_to_bit(vq): */ 137int __ro_after_init sve_max_virtualisable_vl = SVE_VL_MIN;
134static __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); 138
139/*
140 * Set of available vector lengths,
141 * where length vq encoded as bit __vq_to_bit(vq):
142 */
143__ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
144/* Set of vector lengths present on at least one cpu: */
145static __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX);
146
135static void __percpu *efi_sve_state; 147static void __percpu *efi_sve_state;
136 148
137#else /* ! CONFIG_ARM64_SVE */ 149#else /* ! CONFIG_ARM64_SVE */
138 150
139/* Dummy declaration for code that will be optimised out: */ 151/* Dummy declaration for code that will be optimised out: */
140extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX); 152extern __ro_after_init DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
153extern __ro_after_init DECLARE_BITMAP(sve_vq_partial_map, SVE_VQ_MAX);
141extern void __percpu *efi_sve_state; 154extern void __percpu *efi_sve_state;
142 155
143#endif /* ! CONFIG_ARM64_SVE */ 156#endif /* ! CONFIG_ARM64_SVE */
@@ -235,14 +248,15 @@ static void task_fpsimd_load(void)
235 */ 248 */
236void fpsimd_save(void) 249void fpsimd_save(void)
237{ 250{
238 struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st); 251 struct fpsimd_last_state_struct const *last =
252 this_cpu_ptr(&fpsimd_last_state);
239 /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */ 253 /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */
240 254
241 WARN_ON(!in_softirq() && !irqs_disabled()); 255 WARN_ON(!in_softirq() && !irqs_disabled());
242 256
243 if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { 257 if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
244 if (system_supports_sve() && test_thread_flag(TIF_SVE)) { 258 if (system_supports_sve() && test_thread_flag(TIF_SVE)) {
245 if (WARN_ON(sve_get_vl() != current->thread.sve_vl)) { 259 if (WARN_ON(sve_get_vl() != last->sve_vl)) {
246 /* 260 /*
247 * Can't save the user regs, so current would 261 * Can't save the user regs, so current would
248 * re-enter user with corrupt state. 262 * re-enter user with corrupt state.
@@ -252,32 +266,15 @@ void fpsimd_save(void)
252 return; 266 return;
253 } 267 }
254 268
255 sve_save_state(sve_pffr(&current->thread), &st->fpsr); 269 sve_save_state((char *)last->sve_state +
270 sve_ffr_offset(last->sve_vl),
271 &last->st->fpsr);
256 } else 272 } else
257 fpsimd_save_state(st); 273 fpsimd_save_state(last->st);
258 } 274 }
259} 275}
260 276
261/* 277/*
262 * Helpers to translate bit indices in sve_vq_map to VQ values (and
263 * vice versa). This allows find_next_bit() to be used to find the
264 * _maximum_ VQ not exceeding a certain value.
265 */
266
267static unsigned int vq_to_bit(unsigned int vq)
268{
269 return SVE_VQ_MAX - vq;
270}
271
272static unsigned int bit_to_vq(unsigned int bit)
273{
274 if (WARN_ON(bit >= SVE_VQ_MAX))
275 bit = SVE_VQ_MAX - 1;
276
277 return SVE_VQ_MAX - bit;
278}
279
280/*
281 * All vector length selection from userspace comes through here. 278 * All vector length selection from userspace comes through here.
282 * We're on a slow path, so some sanity-checks are included. 279 * We're on a slow path, so some sanity-checks are included.
283 * If things go wrong there's a bug somewhere, but try to fall back to a 280 * If things go wrong there's a bug somewhere, but try to fall back to a
@@ -298,8 +295,8 @@ static unsigned int find_supported_vector_length(unsigned int vl)
298 vl = max_vl; 295 vl = max_vl;
299 296
300 bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, 297 bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
301 vq_to_bit(sve_vq_from_vl(vl))); 298 __vq_to_bit(sve_vq_from_vl(vl)));
302 return sve_vl_from_vq(bit_to_vq(bit)); 299 return sve_vl_from_vq(__bit_to_vq(bit));
303} 300}
304 301
305#ifdef CONFIG_SYSCTL 302#ifdef CONFIG_SYSCTL
@@ -550,7 +547,6 @@ int sve_set_vector_length(struct task_struct *task,
550 local_bh_disable(); 547 local_bh_disable();
551 548
552 fpsimd_save(); 549 fpsimd_save();
553 set_thread_flag(TIF_FOREIGN_FPSTATE);
554 } 550 }
555 551
556 fpsimd_flush_task_state(task); 552 fpsimd_flush_task_state(task);
@@ -624,12 +620,6 @@ int sve_get_current_vl(void)
624 return sve_prctl_status(0); 620 return sve_prctl_status(0);
625} 621}
626 622
627/*
628 * Bitmap for temporary storage of the per-CPU set of supported vector lengths
629 * during secondary boot.
630 */
631static DECLARE_BITMAP(sve_secondary_vq_map, SVE_VQ_MAX);
632
633static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX)) 623static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
634{ 624{
635 unsigned int vq, vl; 625 unsigned int vq, vl;
@@ -644,40 +634,82 @@ static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
644 write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */ 634 write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1); /* self-syncing */
645 vl = sve_get_vl(); 635 vl = sve_get_vl();
646 vq = sve_vq_from_vl(vl); /* skip intervening lengths */ 636 vq = sve_vq_from_vl(vl); /* skip intervening lengths */
647 set_bit(vq_to_bit(vq), map); 637 set_bit(__vq_to_bit(vq), map);
648 } 638 }
649} 639}
650 640
641/*
642 * Initialise the set of known supported VQs for the boot CPU.
643 * This is called during kernel boot, before secondary CPUs are brought up.
644 */
651void __init sve_init_vq_map(void) 645void __init sve_init_vq_map(void)
652{ 646{
653 sve_probe_vqs(sve_vq_map); 647 sve_probe_vqs(sve_vq_map);
648 bitmap_copy(sve_vq_partial_map, sve_vq_map, SVE_VQ_MAX);
654} 649}
655 650
656/* 651/*
657 * If we haven't committed to the set of supported VQs yet, filter out 652 * If we haven't committed to the set of supported VQs yet, filter out
658 * those not supported by the current CPU. 653 * those not supported by the current CPU.
654 * This function is called during the bring-up of early secondary CPUs only.
659 */ 655 */
660void sve_update_vq_map(void) 656void sve_update_vq_map(void)
661{ 657{
662 sve_probe_vqs(sve_secondary_vq_map); 658 DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
663 bitmap_and(sve_vq_map, sve_vq_map, sve_secondary_vq_map, SVE_VQ_MAX); 659
660 sve_probe_vqs(tmp_map);
661 bitmap_and(sve_vq_map, sve_vq_map, tmp_map, SVE_VQ_MAX);
662 bitmap_or(sve_vq_partial_map, sve_vq_partial_map, tmp_map, SVE_VQ_MAX);
664} 663}
665 664
666/* Check whether the current CPU supports all VQs in the committed set */ 665/*
666 * Check whether the current CPU supports all VQs in the committed set.
667 * This function is called during the bring-up of late secondary CPUs only.
668 */
667int sve_verify_vq_map(void) 669int sve_verify_vq_map(void)
668{ 670{
669 int ret = 0; 671 DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
672 unsigned long b;
670 673
671 sve_probe_vqs(sve_secondary_vq_map); 674 sve_probe_vqs(tmp_map);
672 bitmap_andnot(sve_secondary_vq_map, sve_vq_map, sve_secondary_vq_map, 675
673 SVE_VQ_MAX); 676 bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX);
674 if (!bitmap_empty(sve_secondary_vq_map, SVE_VQ_MAX)) { 677 if (bitmap_intersects(tmp_map, sve_vq_map, SVE_VQ_MAX)) {
675 pr_warn("SVE: cpu%d: Required vector length(s) missing\n", 678 pr_warn("SVE: cpu%d: Required vector length(s) missing\n",
676 smp_processor_id()); 679 smp_processor_id());
677 ret = -EINVAL; 680 return -EINVAL;
678 } 681 }
679 682
680 return ret; 683 if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available())
684 return 0;
685
686 /*
687 * For KVM, it is necessary to ensure that this CPU doesn't
688 * support any vector length that guests may have probed as
689 * unsupported.
690 */
691
692 /* Recover the set of supported VQs: */
693 bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX);
694 /* Find VQs supported that are not globally supported: */
695 bitmap_andnot(tmp_map, tmp_map, sve_vq_map, SVE_VQ_MAX);
696
697 /* Find the lowest such VQ, if any: */
698 b = find_last_bit(tmp_map, SVE_VQ_MAX);
699 if (b >= SVE_VQ_MAX)
700 return 0; /* no mismatches */
701
702 /*
703 * Mismatches above sve_max_virtualisable_vl are fine, since
704 * no guest is allowed to configure ZCR_EL2.LEN to exceed this:
705 */
706 if (sve_vl_from_vq(__bit_to_vq(b)) <= sve_max_virtualisable_vl) {
707 pr_warn("SVE: cpu%d: Unsupported vector length(s) present\n",
708 smp_processor_id());
709 return -EINVAL;
710 }
711
712 return 0;
681} 713}
682 714
683static void __init sve_efi_setup(void) 715static void __init sve_efi_setup(void)
@@ -744,6 +776,8 @@ u64 read_zcr_features(void)
744void __init sve_setup(void) 776void __init sve_setup(void)
745{ 777{
746 u64 zcr; 778 u64 zcr;
779 DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
780 unsigned long b;
747 781
748 if (!system_supports_sve()) 782 if (!system_supports_sve())
749 return; 783 return;
@@ -753,8 +787,8 @@ void __init sve_setup(void)
753 * so sve_vq_map must have at least SVE_VQ_MIN set. 787 * so sve_vq_map must have at least SVE_VQ_MIN set.
754 * If something went wrong, at least try to patch it up: 788 * If something went wrong, at least try to patch it up:
755 */ 789 */
756 if (WARN_ON(!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map))) 790 if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map)))
757 set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map); 791 set_bit(__vq_to_bit(SVE_VQ_MIN), sve_vq_map);
758 792
759 zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1); 793 zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
760 sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1); 794 sve_max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1);
@@ -772,11 +806,31 @@ void __init sve_setup(void)
772 */ 806 */
773 sve_default_vl = find_supported_vector_length(64); 807 sve_default_vl = find_supported_vector_length(64);
774 808
809 bitmap_andnot(tmp_map, sve_vq_partial_map, sve_vq_map,
810 SVE_VQ_MAX);
811
812 b = find_last_bit(tmp_map, SVE_VQ_MAX);
813 if (b >= SVE_VQ_MAX)
814 /* No non-virtualisable VLs found */
815 sve_max_virtualisable_vl = SVE_VQ_MAX;
816 else if (WARN_ON(b == SVE_VQ_MAX - 1))
817 /* No virtualisable VLs? This is architecturally forbidden. */
818 sve_max_virtualisable_vl = SVE_VQ_MIN;
819 else /* b + 1 < SVE_VQ_MAX */
820 sve_max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1));
821
822 if (sve_max_virtualisable_vl > sve_max_vl)
823 sve_max_virtualisable_vl = sve_max_vl;
824
775 pr_info("SVE: maximum available vector length %u bytes per vector\n", 825 pr_info("SVE: maximum available vector length %u bytes per vector\n",
776 sve_max_vl); 826 sve_max_vl);
777 pr_info("SVE: default vector length %u bytes per vector\n", 827 pr_info("SVE: default vector length %u bytes per vector\n",
778 sve_default_vl); 828 sve_default_vl);
779 829
830 /* KVM decides whether to support mismatched systems. Just warn here: */
831 if (sve_max_virtualisable_vl < sve_max_vl)
832 pr_warn("SVE: unvirtualisable vector lengths present\n");
833
780 sve_efi_setup(); 834 sve_efi_setup();
781} 835}
782 836
@@ -816,12 +870,11 @@ asmlinkage void do_sve_acc(unsigned int esr, struct pt_regs *regs)
816 local_bh_disable(); 870 local_bh_disable();
817 871
818 fpsimd_save(); 872 fpsimd_save();
819 fpsimd_to_sve(current);
820 873
821 /* Force ret_to_user to reload the registers: */ 874 /* Force ret_to_user to reload the registers: */
822 fpsimd_flush_task_state(current); 875 fpsimd_flush_task_state(current);
823 set_thread_flag(TIF_FOREIGN_FPSTATE);
824 876
877 fpsimd_to_sve(current);
825 if (test_and_set_thread_flag(TIF_SVE)) 878 if (test_and_set_thread_flag(TIF_SVE))
826 WARN_ON(1); /* SVE access shouldn't have trapped */ 879 WARN_ON(1); /* SVE access shouldn't have trapped */
827 880
@@ -894,9 +947,9 @@ void fpsimd_flush_thread(void)
894 947
895 local_bh_disable(); 948 local_bh_disable();
896 949
950 fpsimd_flush_task_state(current);
897 memset(&current->thread.uw.fpsimd_state, 0, 951 memset(&current->thread.uw.fpsimd_state, 0,
898 sizeof(current->thread.uw.fpsimd_state)); 952 sizeof(current->thread.uw.fpsimd_state));
899 fpsimd_flush_task_state(current);
900 953
901 if (system_supports_sve()) { 954 if (system_supports_sve()) {
902 clear_thread_flag(TIF_SVE); 955 clear_thread_flag(TIF_SVE);
@@ -933,8 +986,6 @@ void fpsimd_flush_thread(void)
933 current->thread.sve_vl_onexec = 0; 986 current->thread.sve_vl_onexec = 0;
934 } 987 }
935 988
936 set_thread_flag(TIF_FOREIGN_FPSTATE);
937
938 local_bh_enable(); 989 local_bh_enable();
939} 990}
940 991
@@ -974,6 +1025,8 @@ void fpsimd_bind_task_to_cpu(void)
974 this_cpu_ptr(&fpsimd_last_state); 1025 this_cpu_ptr(&fpsimd_last_state);
975 1026
976 last->st = &current->thread.uw.fpsimd_state; 1027 last->st = &current->thread.uw.fpsimd_state;
1028 last->sve_state = current->thread.sve_state;
1029 last->sve_vl = current->thread.sve_vl;
977 current->thread.fpsimd_cpu = smp_processor_id(); 1030 current->thread.fpsimd_cpu = smp_processor_id();
978 1031
979 if (system_supports_sve()) { 1032 if (system_supports_sve()) {
@@ -987,7 +1040,8 @@ void fpsimd_bind_task_to_cpu(void)
987 } 1040 }
988} 1041}
989 1042
990void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st) 1043void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
1044 unsigned int sve_vl)
991{ 1045{
992 struct fpsimd_last_state_struct *last = 1046 struct fpsimd_last_state_struct *last =
993 this_cpu_ptr(&fpsimd_last_state); 1047 this_cpu_ptr(&fpsimd_last_state);
@@ -995,6 +1049,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st)
995 WARN_ON(!in_softirq() && !irqs_disabled()); 1049 WARN_ON(!in_softirq() && !irqs_disabled());
996 1050
997 last->st = st; 1051 last->st = st;
1052 last->sve_state = sve_state;
1053 last->sve_vl = sve_vl;
998} 1054}
999 1055
1000/* 1056/*
@@ -1043,12 +1099,29 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state)
1043 1099
1044/* 1100/*
1045 * Invalidate live CPU copies of task t's FPSIMD state 1101 * Invalidate live CPU copies of task t's FPSIMD state
1102 *
1103 * This function may be called with preemption enabled. The barrier()
1104 * ensures that the assignment to fpsimd_cpu is visible to any
1105 * preemption/softirq that could race with set_tsk_thread_flag(), so
1106 * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared.
1107 *
1108 * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any
1109 * subsequent code.
1046 */ 1110 */
1047void fpsimd_flush_task_state(struct task_struct *t) 1111void fpsimd_flush_task_state(struct task_struct *t)
1048{ 1112{
1049 t->thread.fpsimd_cpu = NR_CPUS; 1113 t->thread.fpsimd_cpu = NR_CPUS;
1114
1115 barrier();
1116 set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE);
1117
1118 barrier();
1050} 1119}
1051 1120
1121/*
1122 * Invalidate any task's FPSIMD state that is present on this cpu.
1123 * This function must be called with softirqs disabled.
1124 */
1052void fpsimd_flush_cpu_state(void) 1125void fpsimd_flush_cpu_state(void)
1053{ 1126{
1054 __this_cpu_write(fpsimd_last_state.st, NULL); 1127 __this_cpu_write(fpsimd_last_state.st, NULL);
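
The helpers moved to <asm/fpsimd.h> earlier in this diff (__vq_to_bit()/__bit_to_vq()) encode vector quanta "backwards" in sve_vq_map precisely so that find_next_bit() scans from large VQs down to small ones; find_supported_vector_length() above is the canonical user. A condensed sketch of the pattern (not code from this patch):

    /* Sketch: largest supported vector quantum not exceeding @vq. */
    static unsigned int clamp_to_supported_vq(unsigned int vq)
    {
            unsigned int bit;

            /*
             * Bit 0 represents SVE_VQ_MAX and bit (SVE_VQ_MAX - 1) represents
             * VQ 1, so the first set bit at or after __vq_to_bit(vq) is the
             * largest supported VQ that is <= vq.  sve_setup() guarantees the
             * SVE_VQ_MIN bit is set, so the search cannot come back empty.
             */
            bit = find_next_bit(sve_vq_map, SVE_VQ_MAX, __vq_to_bit(vq));
            return __bit_to_vq(bit);
    }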
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 6164d389eed6..348d12eec566 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -26,6 +26,7 @@
26 26
27#include <linux/acpi.h> 27#include <linux/acpi.h>
28#include <linux/clocksource.h> 28#include <linux/clocksource.h>
29#include <linux/kvm_host.h>
29#include <linux/of.h> 30#include <linux/of.h>
30#include <linux/perf/arm_pmu.h> 31#include <linux/perf/arm_pmu.h>
31#include <linux/platform_device.h> 32#include <linux/platform_device.h>
@@ -528,12 +529,21 @@ static inline int armv8pmu_enable_counter(int idx)
528 529
529static inline void armv8pmu_enable_event_counter(struct perf_event *event) 530static inline void armv8pmu_enable_event_counter(struct perf_event *event)
530{ 531{
532 struct perf_event_attr *attr = &event->attr;
531 int idx = event->hw.idx; 533 int idx = event->hw.idx;
534 u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx));
532 535
533 armv8pmu_enable_counter(idx);
534 if (armv8pmu_event_is_chained(event)) 536 if (armv8pmu_event_is_chained(event))
535 armv8pmu_enable_counter(idx - 1); 537 counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1));
536 isb(); 538
539 kvm_set_pmu_events(counter_bits, attr);
540
541 /* We rely on the hypervisor switch code to enable guest counters */
542 if (!kvm_pmu_counter_deferred(attr)) {
543 armv8pmu_enable_counter(idx);
544 if (armv8pmu_event_is_chained(event))
545 armv8pmu_enable_counter(idx - 1);
546 }
537} 547}
538 548
539static inline int armv8pmu_disable_counter(int idx) 549static inline int armv8pmu_disable_counter(int idx)
@@ -546,11 +556,21 @@ static inline int armv8pmu_disable_counter(int idx)
546static inline void armv8pmu_disable_event_counter(struct perf_event *event) 556static inline void armv8pmu_disable_event_counter(struct perf_event *event)
547{ 557{
548 struct hw_perf_event *hwc = &event->hw; 558 struct hw_perf_event *hwc = &event->hw;
559 struct perf_event_attr *attr = &event->attr;
549 int idx = hwc->idx; 560 int idx = hwc->idx;
561 u32 counter_bits = BIT(ARMV8_IDX_TO_COUNTER(idx));
550 562
551 if (armv8pmu_event_is_chained(event)) 563 if (armv8pmu_event_is_chained(event))
552 armv8pmu_disable_counter(idx - 1); 564 counter_bits |= BIT(ARMV8_IDX_TO_COUNTER(idx - 1));
553 armv8pmu_disable_counter(idx); 565
566 kvm_clr_pmu_events(counter_bits);
567
568 /* We rely on the hypervisor switch code to disable guest counters */
569 if (!kvm_pmu_counter_deferred(attr)) {
570 if (armv8pmu_event_is_chained(event))
571 armv8pmu_disable_counter(idx - 1);
572 armv8pmu_disable_counter(idx);
573 }
554} 574}
555 575
556static inline int armv8pmu_enable_intens(int idx) 576static inline int armv8pmu_enable_intens(int idx)
@@ -827,14 +847,23 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
827 * with other architectures (x86 and Power). 847 * with other architectures (x86 and Power).
828 */ 848 */
829 if (is_kernel_in_hyp_mode()) { 849 if (is_kernel_in_hyp_mode()) {
830 if (!attr->exclude_kernel) 850 if (!attr->exclude_kernel && !attr->exclude_host)
831 config_base |= ARMV8_PMU_INCLUDE_EL2; 851 config_base |= ARMV8_PMU_INCLUDE_EL2;
832 } else { 852 if (attr->exclude_guest)
833 if (attr->exclude_kernel)
834 config_base |= ARMV8_PMU_EXCLUDE_EL1; 853 config_base |= ARMV8_PMU_EXCLUDE_EL1;
835 if (!attr->exclude_hv) 854 if (attr->exclude_host)
855 config_base |= ARMV8_PMU_EXCLUDE_EL0;
856 } else {
857 if (!attr->exclude_hv && !attr->exclude_host)
836 config_base |= ARMV8_PMU_INCLUDE_EL2; 858 config_base |= ARMV8_PMU_INCLUDE_EL2;
837 } 859 }
860
861 /*
862 * Filter out !VHE kernels and guest kernels
863 */
864 if (attr->exclude_kernel)
865 config_base |= ARMV8_PMU_EXCLUDE_EL1;
866
838 if (attr->exclude_user) 867 if (attr->exclude_user)
839 config_base |= ARMV8_PMU_EXCLUDE_EL0; 868 config_base |= ARMV8_PMU_EXCLUDE_EL0;
840 869
@@ -864,6 +893,9 @@ static void armv8pmu_reset(void *info)
864 armv8pmu_disable_intens(idx); 893 armv8pmu_disable_intens(idx);
865 } 894 }
866 895
896 /* Clear the counters we flip at guest entry/exit */
897 kvm_clr_pmu_events(U32_MAX);
898
867 /* 899 /*
868 * Initialize & Reset PMNC. Request overflow interrupt for 900 * Initialize & Reset PMNC. Request overflow interrupt for
869 * 64 bit cycle counter but cheat in armv8pmu_write_counter(). 901 * 64 bit cycle counter but cheat in armv8pmu_write_counter().
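Condensed, the VHE branch of armv8pmu_set_event_filter() above maps the perf exclude_* attributes onto EL filter bits as in the following standalone sketch; the bit constants are stand-ins for the real ARMV8_PMU_* definitions and only the decision flow mirrors the hunk.

#include <stdbool.h>
#include <stdint.h>

#define INCLUDE_EL2 (1u << 0)	/* stand-in for ARMV8_PMU_INCLUDE_EL2 */
#define EXCLUDE_EL1 (1u << 1)	/* stand-in for ARMV8_PMU_EXCLUDE_EL1 */
#define EXCLUDE_EL0 (1u << 2)	/* stand-in for ARMV8_PMU_EXCLUDE_EL0 */

struct attrs {
	bool exclude_kernel, exclude_user, exclude_host, exclude_guest;
};

/* VHE only: host kernel at EL2, guest kernel at EL1, both userspaces at EL0 */
static uint32_t vhe_filter(const struct attrs *a)
{
	uint32_t config = 0;

	if (!a->exclude_kernel && !a->exclude_host)
		config |= INCLUDE_EL2;	/* count the host kernel */
	if (a->exclude_guest)
		config |= EXCLUDE_EL1;	/* drop the guest kernel */
	if (a->exclude_host)
		config |= EXCLUDE_EL0;	/* drop host EL0; flipped back for guest
					 * EL0 by kvm_vcpu_pmu_restore_guest() */
	if (a->exclude_kernel)
		config |= EXCLUDE_EL1;	/* common tail: exclude all kernels */
	if (a->exclude_user)
		config |= EXCLUDE_EL0;
	return config;
}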
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 867a7cea70e5..a9b0485df074 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -296,11 +296,6 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
296 */ 296 */
297 297
298 fpsimd_flush_task_state(current); 298 fpsimd_flush_task_state(current);
299 barrier();
300 /* From now, fpsimd_thread_switch() won't clear TIF_FOREIGN_FPSTATE */
301
302 set_thread_flag(TIF_FOREIGN_FPSTATE);
303 barrier();
304 /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ 299 /* From now, fpsimd_thread_switch() won't touch thread.sve_state */
305 300
306 sve_alloc(current); 301 sve_alloc(current);
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 690e033a91c0..3ac1a64d2fb9 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
17kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o 17kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
18kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o 18kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
19kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o 19kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
20kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o 20kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o
21kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o 21kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
22 22
23kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o 23kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index aac7808ce216..6e3c9c8b2df9 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -9,6 +9,7 @@
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/thread_info.h> 10#include <linux/thread_info.h>
11#include <linux/kvm_host.h> 11#include <linux/kvm_host.h>
12#include <asm/fpsimd.h>
12#include <asm/kvm_asm.h> 13#include <asm/kvm_asm.h>
13#include <asm/kvm_host.h> 14#include <asm/kvm_host.h>
14#include <asm/kvm_mmu.h> 15#include <asm/kvm_mmu.h>
@@ -85,9 +86,12 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
85 WARN_ON_ONCE(!irqs_disabled()); 86 WARN_ON_ONCE(!irqs_disabled());
86 87
87 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { 88 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
88 fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs); 89 fpsimd_bind_state_to_cpu(&vcpu->arch.ctxt.gp_regs.fp_regs,
90 vcpu->arch.sve_state,
91 vcpu->arch.sve_max_vl);
92
89 clear_thread_flag(TIF_FOREIGN_FPSTATE); 93 clear_thread_flag(TIF_FOREIGN_FPSTATE);
90 clear_thread_flag(TIF_SVE); 94 update_thread_flag(TIF_SVE, vcpu_has_sve(vcpu));
91 } 95 }
92} 96}
93 97
@@ -100,14 +104,21 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
100void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) 104void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
101{ 105{
102 unsigned long flags; 106 unsigned long flags;
107 bool host_has_sve = system_supports_sve();
108 bool guest_has_sve = vcpu_has_sve(vcpu);
103 109
104 local_irq_save(flags); 110 local_irq_save(flags);
105 111
106 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { 112 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) {
113 u64 *guest_zcr = &vcpu->arch.ctxt.sys_regs[ZCR_EL1];
114
107 /* Clean guest FP state to memory and invalidate cpu view */ 115 /* Clean guest FP state to memory and invalidate cpu view */
108 fpsimd_save(); 116 fpsimd_save();
109 fpsimd_flush_cpu_state(); 117 fpsimd_flush_cpu_state();
110 } else if (system_supports_sve()) { 118
119 if (guest_has_sve)
120 *guest_zcr = read_sysreg_s(SYS_ZCR_EL12);
121 } else if (host_has_sve) {
111 /* 122 /*
112 * The FPSIMD/SVE state in the CPU has not been touched, and we 123 * The FPSIMD/SVE state in the CPU has not been touched, and we
113 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been 124 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dd436a50fce7..3ae2f82fca46 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -19,18 +19,25 @@
19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 */ 20 */
21 21
22#include <linux/bits.h>
22#include <linux/errno.h> 23#include <linux/errno.h>
23#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/nospec.h>
24#include <linux/kvm_host.h> 26#include <linux/kvm_host.h>
25#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/stddef.h>
29#include <linux/string.h>
26#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
27#include <linux/fs.h> 31#include <linux/fs.h>
28#include <kvm/arm_psci.h> 32#include <kvm/arm_psci.h>
29#include <asm/cputype.h> 33#include <asm/cputype.h>
30#include <linux/uaccess.h> 34#include <linux/uaccess.h>
35#include <asm/fpsimd.h>
31#include <asm/kvm.h> 36#include <asm/kvm.h>
32#include <asm/kvm_emulate.h> 37#include <asm/kvm_emulate.h>
33#include <asm/kvm_coproc.h> 38#include <asm/kvm_coproc.h>
39#include <asm/kvm_host.h>
40#include <asm/sigcontext.h>
34 41
35#include "trace.h" 42#include "trace.h"
36 43
@@ -52,12 +59,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
52 return 0; 59 return 0;
53} 60}
54 61
62static bool core_reg_offset_is_vreg(u64 off)
63{
64 return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) &&
65 off < KVM_REG_ARM_CORE_REG(fp_regs.fpsr);
66}
67
55static u64 core_reg_offset_from_id(u64 id) 68static u64 core_reg_offset_from_id(u64 id)
56{ 69{
57 return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); 70 return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
58} 71}
59 72
60static int validate_core_offset(const struct kvm_one_reg *reg) 73static int validate_core_offset(const struct kvm_vcpu *vcpu,
74 const struct kvm_one_reg *reg)
61{ 75{
62 u64 off = core_reg_offset_from_id(reg->id); 76 u64 off = core_reg_offset_from_id(reg->id);
63 int size; 77 int size;
@@ -89,11 +103,19 @@ static int validate_core_offset(const struct kvm_one_reg *reg)
89 return -EINVAL; 103 return -EINVAL;
90 } 104 }
91 105
92 if (KVM_REG_SIZE(reg->id) == size && 106 if (KVM_REG_SIZE(reg->id) != size ||
93 IS_ALIGNED(off, size / sizeof(__u32))) 107 !IS_ALIGNED(off, size / sizeof(__u32)))
94 return 0; 108 return -EINVAL;
95 109
96 return -EINVAL; 110 /*
111 * The KVM_REG_ARM64_SVE regs must be used instead of
112 * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on
113 * SVE-enabled vcpus:
114 */
115 if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off))
116 return -EINVAL;
117
118 return 0;
97} 119}
98 120
99static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) 121static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
@@ -115,7 +137,7 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
115 (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) 137 (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
116 return -ENOENT; 138 return -ENOENT;
117 139
118 if (validate_core_offset(reg)) 140 if (validate_core_offset(vcpu, reg))
119 return -EINVAL; 141 return -EINVAL;
120 142
121 if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id))) 143 if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id)))
@@ -140,7 +162,7 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
140 (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) 162 (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
141 return -ENOENT; 163 return -ENOENT;
142 164
143 if (validate_core_offset(reg)) 165 if (validate_core_offset(vcpu, reg))
144 return -EINVAL; 166 return -EINVAL;
145 167
146 if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) 168 if (KVM_REG_SIZE(reg->id) > sizeof(tmp))
@@ -183,6 +205,239 @@ out:
183 return err; 205 return err;
184} 206}
185 207
208#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64)
209#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64)
210
211static bool vq_present(
212 const u64 (*const vqs)[KVM_ARM64_SVE_VLS_WORDS],
213 unsigned int vq)
214{
215 return (*vqs)[vq_word(vq)] & vq_mask(vq);
216}
217
218static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
219{
220 unsigned int max_vq, vq;
221 u64 vqs[KVM_ARM64_SVE_VLS_WORDS];
222
223 if (!vcpu_has_sve(vcpu))
224 return -ENOENT;
225
226 if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
227 return -EINVAL;
228
229 memset(vqs, 0, sizeof(vqs));
230
231 max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
232 for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
233 if (sve_vq_available(vq))
234 vqs[vq_word(vq)] |= vq_mask(vq);
235
236 if (copy_to_user((void __user *)reg->addr, vqs, sizeof(vqs)))
237 return -EFAULT;
238
239 return 0;
240}
241
242static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
243{
244 unsigned int max_vq, vq;
245 u64 vqs[KVM_ARM64_SVE_VLS_WORDS];
246
247 if (!vcpu_has_sve(vcpu))
248 return -ENOENT;
249
250 if (kvm_arm_vcpu_sve_finalized(vcpu))
251 return -EPERM; /* too late! */
252
253 if (WARN_ON(vcpu->arch.sve_state))
254 return -EINVAL;
255
256 if (copy_from_user(vqs, (const void __user *)reg->addr, sizeof(vqs)))
257 return -EFAULT;
258
259 max_vq = 0;
260 for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; ++vq)
261 if (vq_present(&vqs, vq))
262 max_vq = vq;
263
264 if (max_vq > sve_vq_from_vl(kvm_sve_max_vl))
265 return -EINVAL;
266
267 /*
268 * Vector lengths supported by the host can't currently be
269 * hidden from the guest individually: instead we can only set a
270 * maximum via ZCR_EL2.LEN. So, make sure the available vector
271 * lengths match the set requested exactly up to the requested
272 * maximum:
273 */
274 for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq)
275 if (vq_present(&vqs, vq) != sve_vq_available(vq))
276 return -EINVAL;
277
278 /* Can't run with no vector lengths at all: */
279 if (max_vq < SVE_VQ_MIN)
280 return -EINVAL;
281
282 /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */
283 vcpu->arch.sve_max_vl = sve_vl_from_vq(max_vq);
284
285 return 0;
286}
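The vq_word()/vq_mask() encoding above packs one bit per vector quantum (vq = VL in bytes / 16) into the KVM_REG_ARM64_SVE_VLS bitmap. A small standalone check of the mapping, reusing the macros verbatim; the WORDS value of 8 is assumed from SVE_VQ_MAX = 512.

#include <stdint.h>
#include <stdio.h>

#define SVE_VQ_MIN 1
#define KVM_ARM64_SVE_VLS_WORDS 8	/* assumed: (512 - 1) / 64 + 1 */

#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64)
#define vq_mask(vq) ((uint64_t)1 << ((vq) - SVE_VQ_MIN) % 64)

int main(void)
{
	uint64_t vqs[KVM_ARM64_SVE_VLS_WORDS] = { 0 };
	unsigned int vls_bits[] = { 128, 256, 512 };	/* supported VLs in bits */

	for (unsigned int i = 0; i < 3; i++) {
		unsigned int vq = vls_bits[i] / 128;	/* sve_vq_from_vl(): bytes / 16 */

		vqs[vq_word(vq)] |= vq_mask(vq);
		printf("VL %4u bits -> vq %u -> word %u, bit %u\n", vls_bits[i], vq,
		       (unsigned int)vq_word(vq), (vq - SVE_VQ_MIN) % 64);
	}
	printf("vqs[0] = %#llx\n", (unsigned long long)vqs[0]);	/* 0xb for 128/256/512 */
	return 0;
}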
287
288#define SVE_REG_SLICE_SHIFT 0
289#define SVE_REG_SLICE_BITS 5
290#define SVE_REG_ID_SHIFT (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
291#define SVE_REG_ID_BITS 5
292
293#define SVE_REG_SLICE_MASK \
294 GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1, \
295 SVE_REG_SLICE_SHIFT)
296#define SVE_REG_ID_MASK \
297 GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT)
298
299#define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS)
300
301#define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0))
302#define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0))
303
304/*
305 * Number of register slices required to cover each whole SVE register.
306 * NOTE: Only the first slice ever exists, for now.
307 * If you are tempted to modify this, you must also rework sve_reg_to_region()
308 * to match:
309 */
310#define vcpu_sve_slices(vcpu) 1
311
312/* Bounds of a single SVE register slice within vcpu->arch.sve_state */
313struct sve_state_reg_region {
314 unsigned int koffset; /* offset into sve_state in kernel memory */
315 unsigned int klen; /* length in kernel memory */
316 unsigned int upad; /* extra trailing padding in user memory */
317};
318
319/*
320 * Validate SVE register ID and get sanitised bounds for user/kernel SVE
321 * register copy
322 */
323static int sve_reg_to_region(struct sve_state_reg_region *region,
324 struct kvm_vcpu *vcpu,
325 const struct kvm_one_reg *reg)
326{
327 /* reg ID ranges for Z- registers */
328 const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0);
329 const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1,
330 SVE_NUM_SLICES - 1);
331
332 /* reg ID ranges for P- registers and FFR (which are contiguous) */
333 const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0);
334 const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1);
335
336 unsigned int vq;
337 unsigned int reg_num;
338
339 unsigned int reqoffset, reqlen; /* User-requested offset and length */
340 unsigned int maxlen; /* Maximum permitted length */
341
342 size_t sve_state_size;
343
344 const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1,
345 SVE_NUM_SLICES - 1);
346
347 /* Verify that the P-regs and FFR really do have contiguous IDs: */
348 BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1);
349
350 /* Verify that we match the UAPI header: */
351 BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES);
352
353 reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT;
354
355 if (reg->id >= zreg_id_min && reg->id <= zreg_id_max) {
356 if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
357 return -ENOENT;
358
359 vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
360
361 reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
362 SVE_SIG_REGS_OFFSET;
363 reqlen = KVM_SVE_ZREG_SIZE;
364 maxlen = SVE_SIG_ZREG_SIZE(vq);
365 } else if (reg->id >= preg_id_min && reg->id <= preg_id_max) {
366 if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
367 return -ENOENT;
368
369 vq = sve_vq_from_vl(vcpu->arch.sve_max_vl);
370
371 reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
372 SVE_SIG_REGS_OFFSET;
373 reqlen = KVM_SVE_PREG_SIZE;
374 maxlen = SVE_SIG_PREG_SIZE(vq);
375 } else {
376 return -EINVAL;
377 }
378
379 sve_state_size = vcpu_sve_state_size(vcpu);
380 if (WARN_ON(!sve_state_size))
381 return -EINVAL;
382
383 region->koffset = array_index_nospec(reqoffset, sve_state_size);
384 region->klen = min(maxlen, reqlen);
385 region->upad = reqlen - region->klen;
386
387 return 0;
388}
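A worked instance of the mapping sve_reg_to_region() performs, using assumed layout constants (16*vq bytes per Z-register in sve_state, 2048-bit/256-bit UAPI slice sizes); illustrative only, the authoritative arithmetic is the SVE_SIG_* macros in <asm/sigcontext.h>.

/*
 * Guest sve_max_vl = 64 bytes (vq = 4), request = KVM_REG_ARM64_SVE_ZREG(5, 0):
 *
 *	reqoffset = SVE_SIG_ZREG_OFFSET(4, 5) - SVE_SIG_REGS_OFFSET = 5 * 4 * 16 = 320
 *	maxlen    = SVE_SIG_ZREG_SIZE(4)  =  64	(bytes of live data for this VL)
 *	reqlen    = KVM_SVE_ZREG_SIZE     = 256	(fixed 2048-bit slice in the UAPI)
 *	klen      = min(maxlen, reqlen)   =  64	(copied to/from sve_state + 320)
 *	upad      = reqlen - klen         = 192	(zero-filled in the user buffer)
 */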
389
390static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
391{
392 int ret;
393 struct sve_state_reg_region region;
394 char __user *uptr = (char __user *)reg->addr;
395
396 /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */
397 if (reg->id == KVM_REG_ARM64_SVE_VLS)
398 return get_sve_vls(vcpu, reg);
399
400 /* Try to interpret reg ID as an architectural SVE register... */
401 ret = sve_reg_to_region(&region, vcpu, reg);
402 if (ret)
403 return ret;
404
405 if (!kvm_arm_vcpu_sve_finalized(vcpu))
406 return -EPERM;
407
408 if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset,
409 region.klen) ||
410 clear_user(uptr + region.klen, region.upad))
411 return -EFAULT;
412
413 return 0;
414}
415
416static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
417{
418 int ret;
419 struct sve_state_reg_region region;
420 const char __user *uptr = (const char __user *)reg->addr;
421
422 /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */
423 if (reg->id == KVM_REG_ARM64_SVE_VLS)
424 return set_sve_vls(vcpu, reg);
425
426 /* Try to interpret reg ID as an architectural SVE register... */
427 ret = sve_reg_to_region(&region, vcpu, reg);
428 if (ret)
429 return ret;
430
431 if (!kvm_arm_vcpu_sve_finalized(vcpu))
432 return -EPERM;
433
434 if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr,
435 region.klen))
436 return -EFAULT;
437
438 return 0;
439}
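Putting the accessors above together, a hedged userspace sketch of the expected ordering: the vcpu is assumed to have been created with the KVM_ARM_VCPU_SVE feature, the vector lengths are constrained via the VLS pseudo-register, SVE is finalized (KVM_ARM_VCPU_FINALIZE being the ioctl added by this series), and only then are Z-registers accessed. Error handling is minimal.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int configure_sve(int vcpu_fd)
{
	uint64_t vqs[KVM_ARM64_SVE_VLS_WORDS];
	struct kvm_one_reg reg = { .id = KVM_REG_ARM64_SVE_VLS,
				   .addr = (uint64_t)(uintptr_t)vqs };
	int feature = KVM_ARM_VCPU_SVE;
	uint8_t z0[0x100] = { 0 };	/* one 2048-bit Z-register slice */

	/* Read the default (host-derived) set, then write it (or a subset) back */
	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) ||
	    ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg))
		return -1;

	/* Freeze the SVE configuration; VLS becomes read-only after this */
	if (ioctl(vcpu_fd, KVM_ARM_VCPU_FINALIZE, &feature))
		return -1;

	/* Z/P/FFR registers are only accessible once finalized */
	reg.id = KVM_REG_ARM64_SVE_ZREG(0, 0);
	reg.addr = (uint64_t)(uintptr_t)z0;
	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}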
440
186int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 441int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
187{ 442{
188 return -EINVAL; 443 return -EINVAL;
@@ -193,9 +448,37 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
193 return -EINVAL; 448 return -EINVAL;
194} 449}
195 450
196static unsigned long num_core_regs(void) 451static int copy_core_reg_indices(const struct kvm_vcpu *vcpu,
452 u64 __user *uindices)
453{
454 unsigned int i;
455 int n = 0;
456 const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE;
457
458 for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) {
459 /*
460 * The KVM_REG_ARM64_SVE regs must be used instead of
461 * KVM_REG_ARM_CORE for accessing the FPSIMD V-registers on
462 * SVE-enabled vcpus:
463 */
464 if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(i))
465 continue;
466
467 if (uindices) {
468 if (put_user(core_reg | i, uindices))
469 return -EFAULT;
470 uindices++;
471 }
472
473 n++;
474 }
475
476 return n;
477}
478
479static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
197{ 480{
198 return sizeof(struct kvm_regs) / sizeof(__u32); 481 return copy_core_reg_indices(vcpu, NULL);
199} 482}
200 483
201/** 484/**
@@ -251,6 +534,67 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
251 return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; 534 return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
252} 535}
253 536
537static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
538{
539 const unsigned int slices = vcpu_sve_slices(vcpu);
540
541 if (!vcpu_has_sve(vcpu))
542 return 0;
543
544 /* Policed by KVM_GET_REG_LIST: */
545 WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));
546
547 return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */)
548 + 1; /* KVM_REG_ARM64_SVE_VLS */
549}
550
551static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
552 u64 __user *uindices)
553{
554 const unsigned int slices = vcpu_sve_slices(vcpu);
555 u64 reg;
556 unsigned int i, n;
557 int num_regs = 0;
558
559 if (!vcpu_has_sve(vcpu))
560 return 0;
561
562 /* Policed by KVM_GET_REG_LIST: */
563 WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));
564
565 /*
566 * Enumerate this first, so that userspace can save/restore in
567 * the order reported by KVM_GET_REG_LIST:
568 */
569 reg = KVM_REG_ARM64_SVE_VLS;
570 if (put_user(reg, uindices++))
571 return -EFAULT;
572 ++num_regs;
573
574 for (i = 0; i < slices; i++) {
575 for (n = 0; n < SVE_NUM_ZREGS; n++) {
576 reg = KVM_REG_ARM64_SVE_ZREG(n, i);
577 if (put_user(reg, uindices++))
578 return -EFAULT;
579 num_regs++;
580 }
581
582 for (n = 0; n < SVE_NUM_PREGS; n++) {
583 reg = KVM_REG_ARM64_SVE_PREG(n, i);
584 if (put_user(reg, uindices++))
585 return -EFAULT;
586 num_regs++;
587 }
588
589 reg = KVM_REG_ARM64_SVE_FFR(i);
590 if (put_user(reg, uindices++))
591 return -EFAULT;
592 num_regs++;
593 }
594
595 return num_regs;
596}
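With vcpu_sve_slices() pinned to 1, the enumeration above is a fixed-size block; a quick count for cross-checking num_sve_regs():

/* 1 (KVM_REG_ARM64_SVE_VLS) + 32 Z-regs + 16 P-regs + 1 FFR = 50 IDs per vcpu */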
597
254/** 598/**
255 * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG 599 * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG
256 * 600 *
@@ -258,8 +602,15 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
258 */ 602 */
259unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) 603unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
260{ 604{
261 return num_core_regs() + kvm_arm_num_sys_reg_descs(vcpu) 605 unsigned long res = 0;
262 + kvm_arm_get_fw_num_regs(vcpu) + NUM_TIMER_REGS; 606
607 res += num_core_regs(vcpu);
608 res += num_sve_regs(vcpu);
609 res += kvm_arm_num_sys_reg_descs(vcpu);
610 res += kvm_arm_get_fw_num_regs(vcpu);
611 res += NUM_TIMER_REGS;
612
613 return res;
263} 614}
264 615
265/** 616/**
@@ -269,23 +620,25 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
269 */ 620 */
270int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) 621int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
271{ 622{
272 unsigned int i;
273 const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE;
274 int ret; 623 int ret;
275 624
276 for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) { 625 ret = copy_core_reg_indices(vcpu, uindices);
277 if (put_user(core_reg | i, uindices)) 626 if (ret < 0)
278 return -EFAULT; 627 return ret;
279 uindices++; 628 uindices += ret;
280 } 629
630 ret = copy_sve_reg_indices(vcpu, uindices);
631 if (ret < 0)
632 return ret;
633 uindices += ret;
281 634
282 ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); 635 ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices);
283 if (ret) 636 if (ret < 0)
284 return ret; 637 return ret;
285 uindices += kvm_arm_get_fw_num_regs(vcpu); 638 uindices += kvm_arm_get_fw_num_regs(vcpu);
286 639
287 ret = copy_timer_indices(vcpu, uindices); 640 ret = copy_timer_indices(vcpu, uindices);
288 if (ret) 641 if (ret < 0)
289 return ret; 642 return ret;
290 uindices += NUM_TIMER_REGS; 643 uindices += NUM_TIMER_REGS;
291 644
@@ -298,12 +651,11 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
298 if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) 651 if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32)
299 return -EINVAL; 652 return -EINVAL;
300 653
301 /* Register group 16 means we want a core register. */ 654 switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
302 if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) 655 case KVM_REG_ARM_CORE: return get_core_reg(vcpu, reg);
303 return get_core_reg(vcpu, reg); 656 case KVM_REG_ARM_FW: return kvm_arm_get_fw_reg(vcpu, reg);
304 657 case KVM_REG_ARM64_SVE: return get_sve_reg(vcpu, reg);
305 if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) 658 }
306 return kvm_arm_get_fw_reg(vcpu, reg);
307 659
308 if (is_timer_reg(reg->id)) 660 if (is_timer_reg(reg->id))
309 return get_timer_reg(vcpu, reg); 661 return get_timer_reg(vcpu, reg);
@@ -317,12 +669,11 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
317 if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) 669 if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32)
318 return -EINVAL; 670 return -EINVAL;
319 671
320 /* Register group 16 means we set a core register. */ 672 switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
321 if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) 673 case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg);
322 return set_core_reg(vcpu, reg); 674 case KVM_REG_ARM_FW: return kvm_arm_set_fw_reg(vcpu, reg);
323 675 case KVM_REG_ARM64_SVE: return set_sve_reg(vcpu, reg);
324 if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) 676 }
325 return kvm_arm_set_fw_reg(vcpu, reg);
326 677
327 if (is_timer_reg(reg->id)) 678 if (is_timer_reg(reg->id))
328 return set_timer_reg(vcpu, reg); 679 return set_timer_reg(vcpu, reg);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 0b7983442071..516aead3c2a9 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -173,20 +173,40 @@ static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run)
173 return 1; 173 return 1;
174} 174}
175 175
176#define __ptrauth_save_key(regs, key) \
177({ \
178 regs[key ## KEYLO_EL1] = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \
179 regs[key ## KEYHI_EL1] = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \
180})
181
182/*
183 * Handle the guest trying to use a ptrauth instruction, or trying to access a
184 * ptrauth register.
185 */
186void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu)
187{
188 struct kvm_cpu_context *ctxt;
189
190 if (vcpu_has_ptrauth(vcpu)) {
191 vcpu_ptrauth_enable(vcpu);
192 ctxt = vcpu->arch.host_cpu_context;
193 __ptrauth_save_key(ctxt->sys_regs, APIA);
194 __ptrauth_save_key(ctxt->sys_regs, APIB);
195 __ptrauth_save_key(ctxt->sys_regs, APDA);
196 __ptrauth_save_key(ctxt->sys_regs, APDB);
197 __ptrauth_save_key(ctxt->sys_regs, APGA);
198 } else {
199 kvm_inject_undefined(vcpu);
200 }
201}
202
176/* 203/*
177 * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into 204 * Guest usage of a ptrauth instruction (which the guest EL1 did not turn into
178 * a NOP). 205 * a NOP).
179 */ 206 */
180static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu, struct kvm_run *run) 207static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu, struct kvm_run *run)
181{ 208{
182 /* 209 kvm_arm_vcpu_ptrauth_trap(vcpu);
183 * We don't currently support ptrauth in a guest, and we mask the ID
184 * registers to prevent well-behaved guests from trying to make use of
185 * it.
186 *
187 * Inject an UNDEF, as if the feature really isn't present.
188 */
189 kvm_inject_undefined(vcpu);
190 return 1; 210 return 1;
191} 211}
192 212
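For readability, one expansion of the __ptrauth_save_key() helper used above (APIA shown; the other four key pairs expand the same way):

/* __ptrauth_save_key(ctxt->sys_regs, APIA) becomes: */
ctxt->sys_regs[APIAKEYLO_EL1] = read_sysreg_s(SYS_APIAKEYLO_EL1);
ctxt->sys_regs[APIAKEYHI_EL1] = read_sysreg_s(SYS_APIAKEYHI_EL1);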
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 675fdc186e3b..93ba3d7ef027 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -24,6 +24,7 @@
24#include <asm/kvm_arm.h> 24#include <asm/kvm_arm.h>
25#include <asm/kvm_asm.h> 25#include <asm/kvm_asm.h>
26#include <asm/kvm_mmu.h> 26#include <asm/kvm_mmu.h>
27#include <asm/kvm_ptrauth.h>
27 28
28#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x) 29#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x)
29#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x) 30#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
@@ -64,6 +65,13 @@ ENTRY(__guest_enter)
64 65
65 add x18, x0, #VCPU_CONTEXT 66 add x18, x0, #VCPU_CONTEXT
66 67
68 // Macro ptrauth_switch_to_guest format:
69 // ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3)
70 // The below macro to restore guest keys is not implemented in C code
71 // as it may cause Pointer Authentication key signing mismatch errors
72 // when this feature is enabled for kernel code.
73 ptrauth_switch_to_guest x18, x0, x1, x2
74
67 // Restore guest regs x0-x17 75 // Restore guest regs x0-x17
68 ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)] 76 ldp x0, x1, [x18, #CPU_XREG_OFFSET(0)]
69 ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)] 77 ldp x2, x3, [x18, #CPU_XREG_OFFSET(2)]
@@ -118,6 +126,13 @@ ENTRY(__guest_exit)
118 126
119 get_host_ctxt x2, x3 127 get_host_ctxt x2, x3
120 128
129 // Macro ptrauth_switch_to_host format:
130 // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
131 // The below macro to save/restore keys is not implemented in C code
132 // as it may cause Pointer Authentication key signing mismatch errors
133 // when this feature is enabled for kernel code.
134 ptrauth_switch_to_host x1, x2, x3, x4, x5
135
121 // Now restore the host regs 136 // Now restore the host regs
122 restore_callee_saved_regs x2 137 restore_callee_saved_regs x2
123 138
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 3563fe655cd5..22b4c335e0b2 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -100,7 +100,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu)
100 val = read_sysreg(cpacr_el1); 100 val = read_sysreg(cpacr_el1);
101 val |= CPACR_EL1_TTA; 101 val |= CPACR_EL1_TTA;
102 val &= ~CPACR_EL1_ZEN; 102 val &= ~CPACR_EL1_ZEN;
103 if (!update_fp_enabled(vcpu)) { 103 if (update_fp_enabled(vcpu)) {
104 if (vcpu_has_sve(vcpu))
105 val |= CPACR_EL1_ZEN;
106 } else {
104 val &= ~CPACR_EL1_FPEN; 107 val &= ~CPACR_EL1_FPEN;
105 __activate_traps_fpsimd32(vcpu); 108 __activate_traps_fpsimd32(vcpu);
106 } 109 }
@@ -317,16 +320,48 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
317 return true; 320 return true;
318} 321}
319 322
320static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) 323/* Check for an FPSIMD/SVE trap and handle as appropriate */
324static bool __hyp_text __hyp_handle_fpsimd(struct kvm_vcpu *vcpu)
321{ 325{
322 struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; 326 bool vhe, sve_guest, sve_host;
327 u8 hsr_ec;
323 328
324 if (has_vhe()) 329 if (!system_supports_fpsimd())
325 write_sysreg(read_sysreg(cpacr_el1) | CPACR_EL1_FPEN, 330 return false;
326 cpacr_el1); 331
327 else 332 if (system_supports_sve()) {
333 sve_guest = vcpu_has_sve(vcpu);
334 sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE;
335 vhe = true;
336 } else {
337 sve_guest = false;
338 sve_host = false;
339 vhe = has_vhe();
340 }
341
342 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
343 if (hsr_ec != ESR_ELx_EC_FP_ASIMD &&
344 hsr_ec != ESR_ELx_EC_SVE)
345 return false;
346
347 /* Don't handle SVE traps for non-SVE vcpus here: */
348 if (!sve_guest)
349 if (hsr_ec != ESR_ELx_EC_FP_ASIMD)
350 return false;
351
352 /* Valid trap. Switch the context: */
353
354 if (vhe) {
355 u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN;
356
357 if (sve_guest)
358 reg |= CPACR_EL1_ZEN;
359
360 write_sysreg(reg, cpacr_el1);
361 } else {
328 write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP, 362 write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP,
329 cptr_el2); 363 cptr_el2);
364 }
330 365
331 isb(); 366 isb();
332 367
@@ -335,21 +370,28 @@ static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu)
335 * In the SVE case, VHE is assumed: it is enforced by 370 * In the SVE case, VHE is assumed: it is enforced by
336 * Kconfig and kvm_arch_init(). 371 * Kconfig and kvm_arch_init().
337 */ 372 */
338 if (system_supports_sve() && 373 if (sve_host) {
339 (vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE)) {
340 struct thread_struct *thread = container_of( 374 struct thread_struct *thread = container_of(
341 host_fpsimd, 375 vcpu->arch.host_fpsimd_state,
342 struct thread_struct, uw.fpsimd_state); 376 struct thread_struct, uw.fpsimd_state);
343 377
344 sve_save_state(sve_pffr(thread), &host_fpsimd->fpsr); 378 sve_save_state(sve_pffr(thread),
379 &vcpu->arch.host_fpsimd_state->fpsr);
345 } else { 380 } else {
346 __fpsimd_save_state(host_fpsimd); 381 __fpsimd_save_state(vcpu->arch.host_fpsimd_state);
347 } 382 }
348 383
349 vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; 384 vcpu->arch.flags &= ~KVM_ARM64_FP_HOST;
350 } 385 }
351 386
352 __fpsimd_restore_state(&vcpu->arch.ctxt.gp_regs.fp_regs); 387 if (sve_guest) {
388 sve_load_state(vcpu_sve_pffr(vcpu),
389 &vcpu->arch.ctxt.gp_regs.fp_regs.fpsr,
390 sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1);
391 write_sysreg_s(vcpu->arch.ctxt.sys_regs[ZCR_EL1], SYS_ZCR_EL12);
392 } else {
393 __fpsimd_restore_state(&vcpu->arch.ctxt.gp_regs.fp_regs);
394 }
353 395
354 /* Skip restoring fpexc32 for AArch64 guests */ 396 /* Skip restoring fpexc32 for AArch64 guests */
355 if (!(read_sysreg(hcr_el2) & HCR_RW)) 397 if (!(read_sysreg(hcr_el2) & HCR_RW))
@@ -385,10 +427,10 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
385 * and restore the guest context lazily. 427 * and restore the guest context lazily.
386 * If FP/SIMD is not implemented, handle the trap and inject an 428 * If FP/SIMD is not implemented, handle the trap and inject an
387 * undefined instruction exception to the guest. 429 * undefined instruction exception to the guest.
430 * Similarly for trapped SVE accesses.
388 */ 431 */
389 if (system_supports_fpsimd() && 432 if (__hyp_handle_fpsimd(vcpu))
390 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_FP_ASIMD) 433 return true;
391 return __hyp_switch_fpsimd(vcpu);
392 434
393 if (!__populate_fault_info(vcpu)) 435 if (!__populate_fault_info(vcpu))
394 return true; 436 return true;
@@ -524,6 +566,7 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
524{ 566{
525 struct kvm_cpu_context *host_ctxt; 567 struct kvm_cpu_context *host_ctxt;
526 struct kvm_cpu_context *guest_ctxt; 568 struct kvm_cpu_context *guest_ctxt;
569 bool pmu_switch_needed;
527 u64 exit_code; 570 u64 exit_code;
528 571
529 /* 572 /*
@@ -543,6 +586,8 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
543 host_ctxt->__hyp_running_vcpu = vcpu; 586 host_ctxt->__hyp_running_vcpu = vcpu;
544 guest_ctxt = &vcpu->arch.ctxt; 587 guest_ctxt = &vcpu->arch.ctxt;
545 588
589 pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
590
546 __sysreg_save_state_nvhe(host_ctxt); 591 __sysreg_save_state_nvhe(host_ctxt);
547 592
548 __activate_vm(kern_hyp_va(vcpu->kvm)); 593 __activate_vm(kern_hyp_va(vcpu->kvm));
@@ -589,6 +634,9 @@ int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
589 */ 634 */
590 __debug_switch_to_host(vcpu); 635 __debug_switch_to_host(vcpu);
591 636
637 if (pmu_switch_needed)
638 __pmu_switch_to_host(host_ctxt);
639
592 /* Returning to host will clear PSR.I, remask PMR if needed */ 640 /* Returning to host will clear PSR.I, remask PMR if needed */
593 if (system_uses_irq_prio_masking()) 641 if (system_uses_irq_prio_masking())
594 gic_write_pmr(GIC_PRIO_IRQOFF); 642 gic_write_pmr(GIC_PRIO_IRQOFF);
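Condensed, the nVHE run path now brackets the guest run with the PMU switch; a sketch of the ordering, with unchanged steps elided:

/*
 * __kvm_vcpu_run_nvhe(vcpu):
 *	pmu_switch_needed = __pmu_switch_to_guest(host_ctxt);
 *	__sysreg_save_state_nvhe(host_ctxt);
 *	... activate traps/VM, run the guest, handle exits ...
 *	__debug_switch_to_host(vcpu);
 *	if (pmu_switch_needed)
 *		__pmu_switch_to_host(host_ctxt);
 */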
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
new file mode 100644
index 000000000000..3da94a5bb6b7
--- /dev/null
+++ b/arch/arm64/kvm/pmu.c
@@ -0,0 +1,239 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2019 Arm Limited
4 * Author: Andrew Murray <Andrew.Murray@arm.com>
5 */
6#include <linux/kvm_host.h>
7#include <linux/perf_event.h>
8#include <asm/kvm_hyp.h>
9
10/*
11 * Given the perf event attributes and system type, determine
12 * if we are going to need to switch counters at guest entry/exit.
13 */
14static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
15{
16	/*
17	 * With VHE the guest kernel runs at EL1 and the host at EL2; when
18	 * user (EL0) events are excluded there is no reason to switch
19	 * counters.
20 */
21 if (has_vhe() && attr->exclude_user)
22 return false;
23
24 /* Only switch if attributes are different */
25 return (attr->exclude_host != attr->exclude_guest);
26}
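A couple of concrete attribute combinations for kvm_pmu_switch_needed(), matching the checks above:

/*
 * exclude_host != exclude_guest (host-only or guest-only event):
 *	switching is needed, except on VHE when exclude_user is also set,
 *	where the EL filters alone already separate host (EL2) from guest.
 * exclude_host == exclude_guest (count both worlds, or neither):
 *	nothing to flip at guest entry/exit.
 */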
27
28/*
29 * Add events to track that we may want to switch at guest entry/exit
30 * time.
31 */
32void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
33{
34 struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
35
36 if (!kvm_pmu_switch_needed(attr))
37 return;
38
39 if (!attr->exclude_host)
40 ctx->pmu_events.events_host |= set;
41 if (!attr->exclude_guest)
42 ctx->pmu_events.events_guest |= set;
43}
44
45/*
46 * Stop tracking events
47 */
48void kvm_clr_pmu_events(u32 clr)
49{
50 struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
51
52 ctx->pmu_events.events_host &= ~clr;
53 ctx->pmu_events.events_guest &= ~clr;
54}
55
56/*
57 * Disable host events, enable guest events
58 */
59bool __hyp_text __pmu_switch_to_guest(struct kvm_cpu_context *host_ctxt)
60{
61 struct kvm_host_data *host;
62 struct kvm_pmu_events *pmu;
63
64 host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
65 pmu = &host->pmu_events;
66
67 if (pmu->events_host)
68 write_sysreg(pmu->events_host, pmcntenclr_el0);
69
70 if (pmu->events_guest)
71 write_sysreg(pmu->events_guest, pmcntenset_el0);
72
73 return (pmu->events_host || pmu->events_guest);
74}
75
76/*
77 * Disable guest events, enable host events
78 */
79void __hyp_text __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt)
80{
81 struct kvm_host_data *host;
82 struct kvm_pmu_events *pmu;
83
84 host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
85 pmu = &host->pmu_events;
86
87 if (pmu->events_guest)
88 write_sysreg(pmu->events_guest, pmcntenclr_el0);
89
90 if (pmu->events_host)
91 write_sysreg(pmu->events_host, pmcntenset_el0);
92}
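Both functions above rely on the set/clear semantics of the PMU enable registers, which keeps the switch free of read-modify-write sequences:

/*
 * pmcntenclr_el0: writing 1 to bit N disables counter N, 0 bits are ignored
 * pmcntenset_el0: writing 1 to bit N enables  counter N, 0 bits are ignored
 * => __pmu_switch_to_guest()/_to_host() flip the disjoint host/guest sets
 *    with at most two writes and without touching any other counters.
 */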
93
94#define PMEVTYPER_READ_CASE(idx) \
95 case idx: \
96 return read_sysreg(pmevtyper##idx##_el0)
97
98#define PMEVTYPER_WRITE_CASE(idx) \
99 case idx: \
100 write_sysreg(val, pmevtyper##idx##_el0); \
101 break
102
103#define PMEVTYPER_CASES(readwrite) \
104 PMEVTYPER_##readwrite##_CASE(0); \
105 PMEVTYPER_##readwrite##_CASE(1); \
106 PMEVTYPER_##readwrite##_CASE(2); \
107 PMEVTYPER_##readwrite##_CASE(3); \
108 PMEVTYPER_##readwrite##_CASE(4); \
109 PMEVTYPER_##readwrite##_CASE(5); \
110 PMEVTYPER_##readwrite##_CASE(6); \
111 PMEVTYPER_##readwrite##_CASE(7); \
112 PMEVTYPER_##readwrite##_CASE(8); \
113 PMEVTYPER_##readwrite##_CASE(9); \
114 PMEVTYPER_##readwrite##_CASE(10); \
115 PMEVTYPER_##readwrite##_CASE(11); \
116 PMEVTYPER_##readwrite##_CASE(12); \
117 PMEVTYPER_##readwrite##_CASE(13); \
118 PMEVTYPER_##readwrite##_CASE(14); \
119 PMEVTYPER_##readwrite##_CASE(15); \
120 PMEVTYPER_##readwrite##_CASE(16); \
121 PMEVTYPER_##readwrite##_CASE(17); \
122 PMEVTYPER_##readwrite##_CASE(18); \
123 PMEVTYPER_##readwrite##_CASE(19); \
124 PMEVTYPER_##readwrite##_CASE(20); \
125 PMEVTYPER_##readwrite##_CASE(21); \
126 PMEVTYPER_##readwrite##_CASE(22); \
127 PMEVTYPER_##readwrite##_CASE(23); \
128 PMEVTYPER_##readwrite##_CASE(24); \
129 PMEVTYPER_##readwrite##_CASE(25); \
130 PMEVTYPER_##readwrite##_CASE(26); \
131 PMEVTYPER_##readwrite##_CASE(27); \
132 PMEVTYPER_##readwrite##_CASE(28); \
133 PMEVTYPER_##readwrite##_CASE(29); \
134 PMEVTYPER_##readwrite##_CASE(30)
135
136/*
137 * Read a value direct from PMEVTYPER<idx> where idx is 0-30
138 * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
139 */
140static u64 kvm_vcpu_pmu_read_evtype_direct(int idx)
141{
142 switch (idx) {
143 PMEVTYPER_CASES(READ);
144 case ARMV8_PMU_CYCLE_IDX:
145 return read_sysreg(pmccfiltr_el0);
146 default:
147 WARN_ON(1);
148 }
149
150 return 0;
151}
152
153/*
154 * Write a value direct to PMEVTYPER<idx> where idx is 0-30
155 * or PMCCFILTR_EL0 where idx is ARMV8_PMU_CYCLE_IDX (31).
156 */
157static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val)
158{
159 switch (idx) {
160 PMEVTYPER_CASES(WRITE);
161 case ARMV8_PMU_CYCLE_IDX:
162 write_sysreg(val, pmccfiltr_el0);
163 break;
164 default:
165 WARN_ON(1);
166 }
167}
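The case-per-index expansion above exists because MRS/MSR encode the system register name in the instruction itself, so a runtime index cannot be folded into the access:

/*
 * Not expressible: the register is part of the instruction encoding.
 *	write_sysreg(val, pmevtyperN_el0);	// N must be a literal 0..30
 * Hence PMEVTYPER_CASES() emits one case per literal name, plus
 * pmccfiltr_el0 for the cycle counter (idx 31).
 */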
168
169/*
170 * Modify ARMv8 PMU events to include EL0 counting
171 */
172static void kvm_vcpu_pmu_enable_el0(unsigned long events)
173{
174 u64 typer;
175 u32 counter;
176
177 for_each_set_bit(counter, &events, 32) {
178 typer = kvm_vcpu_pmu_read_evtype_direct(counter);
179 typer &= ~ARMV8_PMU_EXCLUDE_EL0;
180 kvm_vcpu_pmu_write_evtype_direct(counter, typer);
181 }
182}
183
184/*
185 * Modify ARMv8 PMU events to exclude EL0 counting
186 */
187static void kvm_vcpu_pmu_disable_el0(unsigned long events)
188{
189 u64 typer;
190 u32 counter;
191
192 for_each_set_bit(counter, &events, 32) {
193 typer = kvm_vcpu_pmu_read_evtype_direct(counter);
194 typer |= ARMV8_PMU_EXCLUDE_EL0;
195 kvm_vcpu_pmu_write_evtype_direct(counter, typer);
196 }
197}
198
199/*
200 * On VHE ensure that only guest events have EL0 counting enabled
201 */
202void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
203{
204 struct kvm_cpu_context *host_ctxt;
205 struct kvm_host_data *host;
206 u32 events_guest, events_host;
207
208 if (!has_vhe())
209 return;
210
211 host_ctxt = vcpu->arch.host_cpu_context;
212 host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
213 events_guest = host->pmu_events.events_guest;
214 events_host = host->pmu_events.events_host;
215
216 kvm_vcpu_pmu_enable_el0(events_guest);
217 kvm_vcpu_pmu_disable_el0(events_host);
218}
219
220/*
221 * On VHE ensure that only host events have EL0 counting enabled
222 */
223void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
224{
225 struct kvm_cpu_context *host_ctxt;
226 struct kvm_host_data *host;
227 u32 events_guest, events_host;
228
229 if (!has_vhe())
230 return;
231
232 host_ctxt = vcpu->arch.host_cpu_context;
233 host = container_of(host_ctxt, struct kvm_host_data, host_ctxt);
234 events_guest = host->pmu_events.events_guest;
235 events_host = host->pmu_events.events_host;
236
237 kvm_vcpu_pmu_enable_el0(events_host);
238 kvm_vcpu_pmu_disable_el0(events_guest);
239}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e2a0500cd7a2..1140b4485575 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -20,20 +20,26 @@
20 */ 20 */
21 21
22#include <linux/errno.h> 22#include <linux/errno.h>
23#include <linux/kernel.h>
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
24#include <linux/kvm.h> 25#include <linux/kvm.h>
25#include <linux/hw_breakpoint.h> 26#include <linux/hw_breakpoint.h>
27#include <linux/slab.h>
28#include <linux/string.h>
29#include <linux/types.h>
26 30
27#include <kvm/arm_arch_timer.h> 31#include <kvm/arm_arch_timer.h>
28 32
29#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
30#include <asm/cputype.h> 34#include <asm/cputype.h>
35#include <asm/fpsimd.h>
31#include <asm/ptrace.h> 36#include <asm/ptrace.h>
32#include <asm/kvm_arm.h> 37#include <asm/kvm_arm.h>
33#include <asm/kvm_asm.h> 38#include <asm/kvm_asm.h>
34#include <asm/kvm_coproc.h> 39#include <asm/kvm_coproc.h>
35#include <asm/kvm_emulate.h> 40#include <asm/kvm_emulate.h>
36#include <asm/kvm_mmu.h> 41#include <asm/kvm_mmu.h>
42#include <asm/virt.h>
37 43
38/* Maximum phys_shift supported for any VM on this host */ 44/* Maximum phys_shift supported for any VM on this host */
39static u32 kvm_ipa_limit; 45static u32 kvm_ipa_limit;
@@ -92,6 +98,14 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
92 case KVM_CAP_ARM_VM_IPA_SIZE: 98 case KVM_CAP_ARM_VM_IPA_SIZE:
93 r = kvm_ipa_limit; 99 r = kvm_ipa_limit;
94 break; 100 break;
101 case KVM_CAP_ARM_SVE:
102 r = system_supports_sve();
103 break;
104 case KVM_CAP_ARM_PTRAUTH_ADDRESS:
105 case KVM_CAP_ARM_PTRAUTH_GENERIC:
106 r = has_vhe() && system_supports_address_auth() &&
107 system_supports_generic_auth();
108 break;
95 default: 109 default:
96 r = 0; 110 r = 0;
97 } 111 }
@@ -99,13 +113,148 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
99 return r; 113 return r;
100} 114}
101 115
116unsigned int kvm_sve_max_vl;
117
118int kvm_arm_init_sve(void)
119{
120 if (system_supports_sve()) {
121 kvm_sve_max_vl = sve_max_virtualisable_vl;
122
123 /*
124 * The get_sve_reg()/set_sve_reg() ioctl interface will need
125 * to be extended with multiple register slice support in
126 * order to support vector lengths greater than
127 * SVE_VL_ARCH_MAX:
128 */
129 if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX))
130 kvm_sve_max_vl = SVE_VL_ARCH_MAX;
131
132 /*
133 * Don't even try to make use of vector lengths that
134 * aren't available on all CPUs, for now:
135 */
136 if (kvm_sve_max_vl < sve_max_vl)
137 pr_warn("KVM: SVE vector length for guests limited to %u bytes\n",
138 kvm_sve_max_vl);
139 }
140
141 return 0;
142}
143
144static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
145{
146 if (!system_supports_sve())
147 return -EINVAL;
148
149 /* Verify that KVM startup enforced this when SVE was detected: */
150 if (WARN_ON(!has_vhe()))
151 return -EINVAL;
152
153 vcpu->arch.sve_max_vl = kvm_sve_max_vl;
154
155 /*
156 * Userspace can still customize the vector lengths by writing
157 * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until
158 * kvm_arm_vcpu_finalize(), which freezes the configuration.
159 */
160 vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE;
161
162 return 0;
163}
164
165/*
166 * Finalize vcpu's maximum SVE vector length, allocating
167 * vcpu->arch.sve_state as necessary.
168 */
169static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
170{
171 void *buf;
172 unsigned int vl;
173
174 vl = vcpu->arch.sve_max_vl;
175
176 /*
177 * Responsibility for these properties is shared between
178 * kvm_arm_init_sve(), kvm_vcpu_enable_sve() and
179 * set_sve_vls(). Double-check here just to be sure:
180 */
181 if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl ||
182 vl > SVE_VL_ARCH_MAX))
183 return -EIO;
184
185 buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL);
186 if (!buf)
187 return -ENOMEM;
188
189 vcpu->arch.sve_state = buf;
190 vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED;
191 return 0;
192}
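Rough sizing for the kzalloc() above, assuming the usual signal-frame layout of 16*vq bytes per Z-register and 2*vq bytes per P-register and FFR; the authoritative figure is SVE_SIG_REGS_SIZE().

/*
 * SVE_SIG_REGS_SIZE(vq) ~= 32*16*vq + 16*2*vq + 2*vq = 546*vq bytes
 *	vq = 4  (512-bit VL):  ~2.1 KB of sve_state per vcpu
 *	vq = 16 (2048-bit VL): ~8.5 KB of sve_state per vcpu
 */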
193
194int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature)
195{
196 switch (feature) {
197 case KVM_ARM_VCPU_SVE:
198 if (!vcpu_has_sve(vcpu))
199 return -EINVAL;
200
201 if (kvm_arm_vcpu_sve_finalized(vcpu))
202 return -EPERM;
203
204 return kvm_vcpu_finalize_sve(vcpu);
205 }
206
207 return -EINVAL;
208}
209
210bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
211{
212 if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu))
213 return false;
214
215 return true;
216}
217
218void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
219{
220 kfree(vcpu->arch.sve_state);
221}
222
223static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
224{
225 if (vcpu_has_sve(vcpu))
226 memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu));
227}
228
229static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
230{
231 /* Support ptrauth only if the system supports these capabilities. */
232 if (!has_vhe())
233 return -EINVAL;
234
235 if (!system_supports_address_auth() ||
236 !system_supports_generic_auth())
237 return -EINVAL;
238 /*
239 * For now make sure that both address/generic pointer authentication
240 * features are requested by the userspace together.
241 */
242 if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
243 !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features))
244 return -EINVAL;
245
246 vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH;
247 return 0;
248}
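Since the check above demands both ptrauth features as a pair, a hedged userspace sketch of the corresponding KVM_ARM_VCPU_INIT call; target selection (KVM_ARM_PREFERRED_TARGET) and error handling are assumed to happen elsewhere.

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int init_vcpu_with_ptrauth(int vcpu_fd, struct kvm_vcpu_init *init)
{
	/* Address and generic authentication must currently be requested together */
	init->features[KVM_ARM_VCPU_PTRAUTH_ADDRESS / 32] |=
		1u << (KVM_ARM_VCPU_PTRAUTH_ADDRESS % 32);
	init->features[KVM_ARM_VCPU_PTRAUTH_GENERIC / 32] |=
		1u << (KVM_ARM_VCPU_PTRAUTH_GENERIC % 32);

	return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, init);
}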
249
102/** 250/**
103 * kvm_reset_vcpu - sets core registers and sys_regs to reset value 251 * kvm_reset_vcpu - sets core registers and sys_regs to reset value
104 * @vcpu: The VCPU pointer 252 * @vcpu: The VCPU pointer
105 * 253 *
106 * This function finds the right table above and sets the registers on 254 * This function finds the right table above and sets the registers on
107 * the virtual CPU struct to their architecturally defined reset 255 * the virtual CPU struct to their architecturally defined reset
108 * values. 256 * values, except for registers whose reset is deferred until
257 * kvm_arm_vcpu_finalize().
109 * 258 *
110 * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT 259 * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT
111 * ioctl or as part of handling a request issued by another VCPU in the PSCI 260 * ioctl or as part of handling a request issued by another VCPU in the PSCI
@@ -131,6 +280,22 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
131 if (loaded) 280 if (loaded)
132 kvm_arch_vcpu_put(vcpu); 281 kvm_arch_vcpu_put(vcpu);
133 282
283 if (!kvm_arm_vcpu_sve_finalized(vcpu)) {
284 if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) {
285 ret = kvm_vcpu_enable_sve(vcpu);
286 if (ret)
287 goto out;
288 }
289 } else {
290 kvm_vcpu_reset_sve(vcpu);
291 }
292
293 if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) ||
294 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) {
295 if (kvm_vcpu_enable_ptrauth(vcpu))
296 goto out;
297 }
298
134 switch (vcpu->arch.target) { 299 switch (vcpu->arch.target) {
135 default: 300 default:
136 if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { 301 if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 539feecda5b8..857b226bcdde 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -695,6 +695,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
695 val |= p->regval & ARMV8_PMU_PMCR_MASK; 695 val |= p->regval & ARMV8_PMU_PMCR_MASK;
696 __vcpu_sys_reg(vcpu, PMCR_EL0) = val; 696 __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
697 kvm_pmu_handle_pmcr(vcpu, val); 697 kvm_pmu_handle_pmcr(vcpu, val);
698 kvm_vcpu_pmu_restore_guest(vcpu);
698 } else { 699 } else {
699 /* PMCR.P & PMCR.C are RAZ */ 700 /* PMCR.P & PMCR.C are RAZ */
700 val = __vcpu_sys_reg(vcpu, PMCR_EL0) 701 val = __vcpu_sys_reg(vcpu, PMCR_EL0)
@@ -850,6 +851,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
850 if (p->is_write) { 851 if (p->is_write) {
851 kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); 852 kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
852 __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK; 853 __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
854 kvm_vcpu_pmu_restore_guest(vcpu);
853 } else { 855 } else {
854 p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK; 856 p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
855 } 857 }
@@ -875,6 +877,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
875 /* accessing PMCNTENSET_EL0 */ 877 /* accessing PMCNTENSET_EL0 */
876 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; 878 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
877 kvm_pmu_enable_counter(vcpu, val); 879 kvm_pmu_enable_counter(vcpu, val);
880 kvm_vcpu_pmu_restore_guest(vcpu);
878 } else { 881 } else {
879 /* accessing PMCNTENCLR_EL0 */ 882 /* accessing PMCNTENCLR_EL0 */
880 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; 883 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
@@ -1007,6 +1010,37 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1007 { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ 1010 { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \
1008 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } 1011 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
1009 1012
1013static bool trap_ptrauth(struct kvm_vcpu *vcpu,
1014 struct sys_reg_params *p,
1015 const struct sys_reg_desc *rd)
1016{
1017 kvm_arm_vcpu_ptrauth_trap(vcpu);
1018
1019 /*
1020 * Return false for both cases as we never skip the trapped
1021 * instruction:
1022 *
1023 * - Either we re-execute the same key register access instruction
1024 * after enabling ptrauth.
1025 * - Or an UNDEF is injected as ptrauth is not supported/enabled.
1026 */
1027 return false;
1028}
1029
1030static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu,
1031 const struct sys_reg_desc *rd)
1032{
1033 return vcpu_has_ptrauth(vcpu) ? 0 : REG_HIDDEN_USER | REG_HIDDEN_GUEST;
1034}
1035
1036#define __PTRAUTH_KEY(k) \
1037 { SYS_DESC(SYS_## k), trap_ptrauth, reset_unknown, k, \
1038 .visibility = ptrauth_visibility}
1039
1040#define PTRAUTH_KEY(k) \
1041 __PTRAUTH_KEY(k ## KEYLO_EL1), \
1042 __PTRAUTH_KEY(k ## KEYHI_EL1)
1043
1010static bool access_arch_timer(struct kvm_vcpu *vcpu, 1044static bool access_arch_timer(struct kvm_vcpu *vcpu,
1011 struct sys_reg_params *p, 1045 struct sys_reg_params *p,
1012 const struct sys_reg_desc *r) 1046 const struct sys_reg_desc *r)
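Expanded, each PTRAUTH_KEY() entry added to the descriptor table below contributes a LO/HI pair; PTRAUTH_KEY(APIA), for example, produces:

{ SYS_DESC(SYS_APIAKEYLO_EL1), trap_ptrauth, reset_unknown, APIAKEYLO_EL1,
  .visibility = ptrauth_visibility },
{ SYS_DESC(SYS_APIAKEYHI_EL1), trap_ptrauth, reset_unknown, APIAKEYHI_EL1,
  .visibility = ptrauth_visibility },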
@@ -1044,25 +1078,20 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
1044} 1078}
1045 1079
1046/* Read a sanitised cpufeature ID register by sys_reg_desc */ 1080/* Read a sanitised cpufeature ID register by sys_reg_desc */
1047static u64 read_id_reg(struct sys_reg_desc const *r, bool raz) 1081static u64 read_id_reg(const struct kvm_vcpu *vcpu,
1082 struct sys_reg_desc const *r, bool raz)
1048{ 1083{
1049 u32 id = sys_reg((u32)r->Op0, (u32)r->Op1, 1084 u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
1050 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2); 1085 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
1051 u64 val = raz ? 0 : read_sanitised_ftr_reg(id); 1086 u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
1052 1087
1053 if (id == SYS_ID_AA64PFR0_EL1) { 1088 if (id == SYS_ID_AA64PFR0_EL1 && !vcpu_has_sve(vcpu)) {
1054 if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT))
1055 kvm_debug("SVE unsupported for guests, suppressing\n");
1056
1057 val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); 1089 val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
1058 } else if (id == SYS_ID_AA64ISAR1_EL1) { 1090 } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
1059 const u64 ptrauth_mask = (0xfUL << ID_AA64ISAR1_APA_SHIFT) | 1091 val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
1060 (0xfUL << ID_AA64ISAR1_API_SHIFT) | 1092 (0xfUL << ID_AA64ISAR1_API_SHIFT) |
1061 (0xfUL << ID_AA64ISAR1_GPA_SHIFT) | 1093 (0xfUL << ID_AA64ISAR1_GPA_SHIFT) |
1062 (0xfUL << ID_AA64ISAR1_GPI_SHIFT); 1094 (0xfUL << ID_AA64ISAR1_GPI_SHIFT));
1063 if (val & ptrauth_mask)
1064 kvm_debug("ptrauth unsupported for guests, suppressing\n");
1065 val &= ~ptrauth_mask;
1066 } 1095 }
1067 1096
1068 return val; 1097 return val;
@@ -1078,7 +1107,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu,
1078 if (p->is_write) 1107 if (p->is_write)
1079 return write_to_read_only(vcpu, p, r); 1108 return write_to_read_only(vcpu, p, r);
1080 1109
1081 p->regval = read_id_reg(r, raz); 1110 p->regval = read_id_reg(vcpu, r, raz);
1082 return true; 1111 return true;
1083} 1112}
1084 1113
@@ -1100,6 +1129,81 @@ static int reg_from_user(u64 *val, const void __user *uaddr, u64 id);
1100static int reg_to_user(void __user *uaddr, const u64 *val, u64 id); 1129static int reg_to_user(void __user *uaddr, const u64 *val, u64 id);
1101static u64 sys_reg_to_index(const struct sys_reg_desc *reg); 1130static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
1102 1131
1132/* Visibility overrides for SVE-specific control registers */
1133static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
1134 const struct sys_reg_desc *rd)
1135{
1136 if (vcpu_has_sve(vcpu))
1137 return 0;
1138
1139 return REG_HIDDEN_USER | REG_HIDDEN_GUEST;
1140}
1141
1142/* Visibility overrides for SVE-specific ID registers */
1143static unsigned int sve_id_visibility(const struct kvm_vcpu *vcpu,
1144 const struct sys_reg_desc *rd)
1145{
1146 if (vcpu_has_sve(vcpu))
1147 return 0;
1148
1149 return REG_HIDDEN_USER;
1150}
1151
1152/* Generate the emulated ID_AA64ZFR0_EL1 value exposed to the guest */
1153static u64 guest_id_aa64zfr0_el1(const struct kvm_vcpu *vcpu)
1154{
1155 if (!vcpu_has_sve(vcpu))
1156 return 0;
1157
1158 return read_sanitised_ftr_reg(SYS_ID_AA64ZFR0_EL1);
1159}
1160
1161static bool access_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
1162 struct sys_reg_params *p,
1163 const struct sys_reg_desc *rd)
1164{
1165 if (p->is_write)
1166 return write_to_read_only(vcpu, p, rd);
1167
1168 p->regval = guest_id_aa64zfr0_el1(vcpu);
1169 return true;
1170}
1171
1172static int get_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
1173 const struct sys_reg_desc *rd,
1174 const struct kvm_one_reg *reg, void __user *uaddr)
1175{
1176 u64 val;
1177
1178 if (WARN_ON(!vcpu_has_sve(vcpu)))
1179 return -ENOENT;
1180
1181 val = guest_id_aa64zfr0_el1(vcpu);
1182 return reg_to_user(uaddr, &val, reg->id);
1183}
1184
1185static int set_id_aa64zfr0_el1(struct kvm_vcpu *vcpu,
1186 const struct sys_reg_desc *rd,
1187 const struct kvm_one_reg *reg, void __user *uaddr)
1188{
1189 const u64 id = sys_reg_to_index(rd);
1190 int err;
1191 u64 val;
1192
1193 if (WARN_ON(!vcpu_has_sve(vcpu)))
1194 return -ENOENT;
1195
1196 err = reg_from_user(&val, uaddr, id);
1197 if (err)
1198 return err;
1199
1200 /* This is what we mean by invariant: you can't change it. */
1201 if (val != guest_id_aa64zfr0_el1(vcpu))
1202 return -EINVAL;
1203
1204 return 0;
1205}
1206
1103/* 1207/*
1104 * cpufeature ID register user accessors 1208 * cpufeature ID register user accessors
1105 * 1209 *
@@ -1107,16 +1211,18 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
1107 * are stored, and for set_id_reg() we don't allow the effective value 1211 * are stored, and for set_id_reg() we don't allow the effective value
1108 * to be changed. 1212 * to be changed.
1109 */ 1213 */
1110static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, 1214static int __get_id_reg(const struct kvm_vcpu *vcpu,
1215 const struct sys_reg_desc *rd, void __user *uaddr,
1111 bool raz) 1216 bool raz)
1112{ 1217{
1113 const u64 id = sys_reg_to_index(rd); 1218 const u64 id = sys_reg_to_index(rd);
1114 const u64 val = read_id_reg(rd, raz); 1219 const u64 val = read_id_reg(vcpu, rd, raz);
1115 1220
1116 return reg_to_user(uaddr, &val, id); 1221 return reg_to_user(uaddr, &val, id);
1117} 1222}
1118 1223
1119static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr, 1224static int __set_id_reg(const struct kvm_vcpu *vcpu,
1225 const struct sys_reg_desc *rd, void __user *uaddr,
1120 bool raz) 1226 bool raz)
1121{ 1227{
1122 const u64 id = sys_reg_to_index(rd); 1228 const u64 id = sys_reg_to_index(rd);
@@ -1128,7 +1234,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
1128 return err; 1234 return err;
1129 1235
1130 /* This is what we mean by invariant: you can't change it. */ 1236 /* This is what we mean by invariant: you can't change it. */
1131 if (val != read_id_reg(rd, raz)) 1237 if (val != read_id_reg(vcpu, rd, raz))
1132 return -EINVAL; 1238 return -EINVAL;
1133 1239
1134 return 0; 1240 return 0;
@@ -1137,25 +1243,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
1137static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1243static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
1138 const struct kvm_one_reg *reg, void __user *uaddr) 1244 const struct kvm_one_reg *reg, void __user *uaddr)
1139{ 1245{
1140 return __get_id_reg(rd, uaddr, false); 1246 return __get_id_reg(vcpu, rd, uaddr, false);
1141} 1247}
1142 1248
1143static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1249static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
1144 const struct kvm_one_reg *reg, void __user *uaddr) 1250 const struct kvm_one_reg *reg, void __user *uaddr)
1145{ 1251{
1146 return __set_id_reg(rd, uaddr, false); 1252 return __set_id_reg(vcpu, rd, uaddr, false);
1147} 1253}
1148 1254
1149static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1255static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
1150 const struct kvm_one_reg *reg, void __user *uaddr) 1256 const struct kvm_one_reg *reg, void __user *uaddr)
1151{ 1257{
1152 return __get_id_reg(rd, uaddr, true); 1258 return __get_id_reg(vcpu, rd, uaddr, true);
1153} 1259}
1154 1260
1155static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1261static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
1156 const struct kvm_one_reg *reg, void __user *uaddr) 1262 const struct kvm_one_reg *reg, void __user *uaddr)
1157{ 1263{
1158 return __set_id_reg(rd, uaddr, true); 1264 return __set_id_reg(vcpu, rd, uaddr, true);
1159} 1265}
1160 1266
1161static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, 1267static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
@@ -1343,7 +1449,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1343 ID_SANITISED(ID_AA64PFR1_EL1), 1449 ID_SANITISED(ID_AA64PFR1_EL1),
1344 ID_UNALLOCATED(4,2), 1450 ID_UNALLOCATED(4,2),
1345 ID_UNALLOCATED(4,3), 1451 ID_UNALLOCATED(4,3),
1346 ID_UNALLOCATED(4,4), 1452 { SYS_DESC(SYS_ID_AA64ZFR0_EL1), access_id_aa64zfr0_el1, .get_user = get_id_aa64zfr0_el1, .set_user = set_id_aa64zfr0_el1, .visibility = sve_id_visibility },
1347 ID_UNALLOCATED(4,5), 1453 ID_UNALLOCATED(4,5),
1348 ID_UNALLOCATED(4,6), 1454 ID_UNALLOCATED(4,6),
1349 ID_UNALLOCATED(4,7), 1455 ID_UNALLOCATED(4,7),
@@ -1380,10 +1486,17 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1380 1486
1381 { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, 1487 { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
1382 { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, 1488 { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
1489 { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
1383 { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, 1490 { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
1384 { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, 1491 { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
1385 { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, 1492 { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
1386 1493
1494 PTRAUTH_KEY(APIA),
1495 PTRAUTH_KEY(APIB),
1496 PTRAUTH_KEY(APDA),
1497 PTRAUTH_KEY(APDB),
1498 PTRAUTH_KEY(APGA),
1499
1387 { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, 1500 { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
1388 { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, 1501 { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
1389 { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, 1502 { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
@@ -1924,6 +2037,12 @@ static void perform_access(struct kvm_vcpu *vcpu,
1924{ 2037{
1925 trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); 2038 trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);
1926 2039
2040 /* Check for regs disabled by runtime config */
2041 if (sysreg_hidden_from_guest(vcpu, r)) {
2042 kvm_inject_undefined(vcpu);
2043 return;
2044 }
2045
1927 /* 2046 /*
1928 * Not having an accessor means that we have configured a trap 2047 * Not having an accessor means that we have configured a trap
1929 * that we don't know how to handle. This certainly qualifies 2048 * that we don't know how to handle. This certainly qualifies
@@ -2435,6 +2554,10 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
2435 if (!r) 2554 if (!r)
2436 return get_invariant_sys_reg(reg->id, uaddr); 2555 return get_invariant_sys_reg(reg->id, uaddr);
2437 2556
2557 /* Check for regs disabled by runtime config */
2558 if (sysreg_hidden_from_user(vcpu, r))
2559 return -ENOENT;
2560
2438 if (r->get_user) 2561 if (r->get_user)
2439 return (r->get_user)(vcpu, r, reg, uaddr); 2562 return (r->get_user)(vcpu, r, reg, uaddr);
2440 2563
@@ -2456,6 +2579,10 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
2456 if (!r) 2579 if (!r)
2457 return set_invariant_sys_reg(reg->id, uaddr); 2580 return set_invariant_sys_reg(reg->id, uaddr);
2458 2581
2582 /* Check for regs disabled by runtime config */
2583 if (sysreg_hidden_from_user(vcpu, r))
2584 return -ENOENT;
2585
2459 if (r->set_user) 2586 if (r->set_user)
2460 return (r->set_user)(vcpu, r, reg, uaddr); 2587 return (r->set_user)(vcpu, r, reg, uaddr);
2461 2588
@@ -2512,7 +2639,8 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
2512 return true; 2639 return true;
2513} 2640}
2514 2641
2515static int walk_one_sys_reg(const struct sys_reg_desc *rd, 2642static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
2643 const struct sys_reg_desc *rd,
2516 u64 __user **uind, 2644 u64 __user **uind,
2517 unsigned int *total) 2645 unsigned int *total)
2518{ 2646{
@@ -2523,6 +2651,9 @@ static int walk_one_sys_reg(const struct sys_reg_desc *rd,
2523 if (!(rd->reg || rd->get_user)) 2651 if (!(rd->reg || rd->get_user))
2524 return 0; 2652 return 0;
2525 2653
2654 if (sysreg_hidden_from_user(vcpu, rd))
2655 return 0;
2656
2526 if (!copy_reg_to_user(rd, uind)) 2657 if (!copy_reg_to_user(rd, uind))
2527 return -EFAULT; 2658 return -EFAULT;
2528 2659
@@ -2551,9 +2682,9 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
2551 int cmp = cmp_sys_reg(i1, i2); 2682 int cmp = cmp_sys_reg(i1, i2);
2552 /* target-specific overrides generic entry. */ 2683 /* target-specific overrides generic entry. */
2553 if (cmp <= 0) 2684 if (cmp <= 0)
2554 err = walk_one_sys_reg(i1, &uind, &total); 2685 err = walk_one_sys_reg(vcpu, i1, &uind, &total);
2555 else 2686 else
2556 err = walk_one_sys_reg(i2, &uind, &total); 2687 err = walk_one_sys_reg(vcpu, i2, &uind, &total);
2557 2688
2558 if (err) 2689 if (err)
2559 return err; 2690 return err;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 3b1bc7f01d0b..2be99508dcb9 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -64,8 +64,15 @@ struct sys_reg_desc {
64 const struct kvm_one_reg *reg, void __user *uaddr); 64 const struct kvm_one_reg *reg, void __user *uaddr);
65 int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 65 int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
66 const struct kvm_one_reg *reg, void __user *uaddr); 66 const struct kvm_one_reg *reg, void __user *uaddr);
67
68 /* Return mask of REG_* runtime visibility overrides */
69 unsigned int (*visibility)(const struct kvm_vcpu *vcpu,
70 const struct sys_reg_desc *rd);
67}; 71};
68 72
73#define REG_HIDDEN_USER (1 << 0) /* hidden from userspace ioctls */
74#define REG_HIDDEN_GUEST (1 << 1) /* hidden from guest */
75
69static inline void print_sys_reg_instr(const struct sys_reg_params *p) 76static inline void print_sys_reg_instr(const struct sys_reg_params *p)
70{ 77{
71 /* Look, we even formatted it for you to paste into the table! */ 78 /* Look, we even formatted it for you to paste into the table! */
@@ -102,6 +109,24 @@ static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r
102 __vcpu_sys_reg(vcpu, r->reg) = r->val; 109 __vcpu_sys_reg(vcpu, r->reg) = r->val;
103} 110}
104 111
112static inline bool sysreg_hidden_from_guest(const struct kvm_vcpu *vcpu,
113 const struct sys_reg_desc *r)
114{
115 if (likely(!r->visibility))
116 return false;
117
118 return r->visibility(vcpu, r) & REG_HIDDEN_GUEST;
119}
120
121static inline bool sysreg_hidden_from_user(const struct kvm_vcpu *vcpu,
122 const struct sys_reg_desc *r)
123{
124 if (likely(!r->visibility))
125 return false;
126
127 return r->visibility(vcpu, r) & REG_HIDDEN_USER;
128}
129
105static inline int cmp_sys_reg(const struct sys_reg_desc *i1, 130static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
106 const struct sys_reg_desc *i2) 131 const struct sys_reg_desc *i2)
107{ 132{
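
The visibility hook added above is what lets SVE state registers exist only on vcpus actually configured with SVE: the ZCR_EL1 and ID_AA64ZFR0_EL1 table entries in sys_regs.c reference callbacks named sve_visibility and sve_id_visibility, whose bodies fall outside the quoted hunks. Given the REG_HIDDEN_* flags defined here and the vcpu_has_sve() predicate used elsewhere in the series, a minimal callback would plausibly look like the sketch below (an illustrative reconstruction, not copied from the patch):

/*
 * Hedged sketch of an SVE visibility callback: report the register as
 * fully visible on SVE-enabled vcpus, hidden everywhere else.
 */
static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
				   const struct sys_reg_desc *rd)
{
	if (vcpu_has_sve(vcpu))
		return 0;		/* visible to guest and userspace */

	return REG_HIDDEN_USER | REG_HIDDEN_GUEST;
}

With both bits set for a non-SVE vcpu, perform_access() injects an UNDEF into the guest while the userspace get/set paths return -ENOENT and the register-list walk skips the entry, which is exactly what the new sysreg_hidden_from_guest()/sysreg_hidden_from_user() checks in sys_regs.c implement.
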
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e6b5bb012ccb..013c76a0a03e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -201,6 +201,8 @@ struct kvmppc_spapr_tce_iommu_table {
201 struct kref kref; 201 struct kref kref;
202}; 202};
203 203
204#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
205
204struct kvmppc_spapr_tce_table { 206struct kvmppc_spapr_tce_table {
205 struct list_head list; 207 struct list_head list;
206 struct kvm *kvm; 208 struct kvm *kvm;
@@ -210,6 +212,7 @@ struct kvmppc_spapr_tce_table {
210 u64 offset; /* in pages */ 212 u64 offset; /* in pages */
211 u64 size; /* window size in pages */ 213 u64 size; /* window size in pages */
212 struct list_head iommu_tables; 214 struct list_head iommu_tables;
215 struct mutex alloc_lock;
213 struct page *pages[0]; 216 struct page *pages[0];
214}; 217};
215 218
@@ -222,6 +225,7 @@ extern struct kvm_device_ops kvm_xics_ops;
222struct kvmppc_xive; 225struct kvmppc_xive;
223struct kvmppc_xive_vcpu; 226struct kvmppc_xive_vcpu;
224extern struct kvm_device_ops kvm_xive_ops; 227extern struct kvm_device_ops kvm_xive_ops;
228extern struct kvm_device_ops kvm_xive_native_ops;
225 229
226struct kvmppc_passthru_irqmap; 230struct kvmppc_passthru_irqmap;
227 231
@@ -312,7 +316,11 @@ struct kvm_arch {
312#endif 316#endif
313#ifdef CONFIG_KVM_XICS 317#ifdef CONFIG_KVM_XICS
314 struct kvmppc_xics *xics; 318 struct kvmppc_xics *xics;
315 struct kvmppc_xive *xive; 319 struct kvmppc_xive *xive; /* Current XIVE device in use */
320 struct {
321 struct kvmppc_xive *native;
322 struct kvmppc_xive *xics_on_xive;
323 } xive_devices;
316 struct kvmppc_passthru_irqmap *pimap; 324 struct kvmppc_passthru_irqmap *pimap;
317#endif 325#endif
318 struct kvmppc_ops *kvm_ops; 326 struct kvmppc_ops *kvm_ops;
@@ -449,6 +457,7 @@ struct kvmppc_passthru_irqmap {
449#define KVMPPC_IRQ_DEFAULT 0 457#define KVMPPC_IRQ_DEFAULT 0
450#define KVMPPC_IRQ_MPIC 1 458#define KVMPPC_IRQ_MPIC 1
451#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ 459#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */
460#define KVMPPC_IRQ_XIVE 3 /* XIVE native exploitation mode */
452 461
453#define MMIO_HPTE_CACHE_SIZE 4 462#define MMIO_HPTE_CACHE_SIZE 4
454 463
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ac22b28ae78d..bc892380e6cd 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -197,10 +197,6 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
197 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ 197 (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
198 (stt)->size, (ioba), (npages)) ? \ 198 (stt)->size, (ioba), (npages)) ? \
199 H_PARAMETER : H_SUCCESS) 199 H_PARAMETER : H_SUCCESS)
200extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
201 unsigned long *ua, unsigned long **prmap);
202extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
203 unsigned long idx, unsigned long tce);
204extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, 200extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
205 unsigned long ioba, unsigned long tce); 201 unsigned long ioba, unsigned long tce);
206extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, 202extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
@@ -273,6 +269,7 @@ union kvmppc_one_reg {
273 u64 addr; 269 u64 addr;
274 u64 length; 270 u64 length;
275 } vpaval; 271 } vpaval;
272 u64 xive_timaval[2];
276}; 273};
277 274
278struct kvmppc_ops { 275struct kvmppc_ops {
@@ -480,6 +477,9 @@ extern void kvm_hv_vm_activated(void);
480extern void kvm_hv_vm_deactivated(void); 477extern void kvm_hv_vm_deactivated(void);
481extern bool kvm_hv_mode_active(void); 478extern bool kvm_hv_mode_active(void);
482 479
480extern void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
481 struct kvm_nested_guest *nested);
482
483#else 483#else
484static inline void __init kvm_cma_reserve(void) 484static inline void __init kvm_cma_reserve(void)
485{} 485{}
@@ -594,6 +594,22 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
594extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 594extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
595 int level, bool line_status); 595 int level, bool line_status);
596extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); 596extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
597
598static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
599{
600 return vcpu->arch.irq_type == KVMPPC_IRQ_XIVE;
601}
602
603extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
604 struct kvm_vcpu *vcpu, u32 cpu);
605extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
606extern void kvmppc_xive_native_init_module(void);
607extern void kvmppc_xive_native_exit_module(void);
608extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
609 union kvmppc_one_reg *val);
610extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
611 union kvmppc_one_reg *val);
612
597#else 613#else
598static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, 614static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
599 u32 priority) { return -1; } 615 u32 priority) { return -1; }
@@ -617,6 +633,21 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
617static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, 633static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
618 int level, bool line_status) { return -ENODEV; } 634 int level, bool line_status) { return -ENODEV; }
619static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } 635static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
636
637static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
638 { return 0; }
639static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
640 struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
641static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
642static inline void kvmppc_xive_native_init_module(void) { }
643static inline void kvmppc_xive_native_exit_module(void) { }
644static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
645 union kvmppc_one_reg *val)
646{ return 0; }
647static inline int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
648 union kvmppc_one_reg *val)
649{ return -ENOENT; }
650
620#endif /* CONFIG_KVM_XIVE */ 651#endif /* CONFIG_KVM_XIVE */
621 652
622#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER) 653#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER)
@@ -665,6 +696,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
665 unsigned long pte_index); 696 unsigned long pte_index);
666long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, 697long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
667 unsigned long pte_index); 698 unsigned long pte_index);
699long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
700 unsigned long dest, unsigned long src);
668long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, 701long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
669 unsigned long slb_v, unsigned int status, bool data); 702 unsigned long slb_v, unsigned int status, bool data);
670unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); 703unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index b579a943407b..eaf76f57023a 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -23,6 +23,7 @@
23 * same offset regardless of where the code is executing 23 * same offset regardless of where the code is executing
24 */ 24 */
25extern void __iomem *xive_tima; 25extern void __iomem *xive_tima;
26extern unsigned long xive_tima_os;
26 27
27/* 28/*
28 * Offset in the TM area of our current execution level (provided by 29 * Offset in the TM area of our current execution level (provided by
@@ -73,6 +74,8 @@ struct xive_q {
73 u32 esc_irq; 74 u32 esc_irq;
74 atomic_t count; 75 atomic_t count;
75 atomic_t pending_count; 76 atomic_t pending_count;
77 u64 guest_qaddr;
78 u32 guest_qshift;
76}; 79};
77 80
78/* Global enable flags for the XIVE support */ 81/* Global enable flags for the XIVE support */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 26ca425f4c2c..b0f72dea8b11 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -482,6 +482,8 @@ struct kvm_ppc_cpu_char {
482#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */ 482#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */
483#define KVM_REG_PPC_ICP_PPRI_MASK 0xff 483#define KVM_REG_PPC_ICP_PPRI_MASK 0xff
484 484
485#define KVM_REG_PPC_VP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x8d)
486
485/* Device control API: PPC-specific devices */ 487/* Device control API: PPC-specific devices */
486#define KVM_DEV_MPIC_GRP_MISC 1 488#define KVM_DEV_MPIC_GRP_MISC 1
487#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */ 489#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */
@@ -677,4 +679,48 @@ struct kvm_ppc_cpu_char {
677#define KVM_XICS_PRESENTED (1ULL << 43) 679#define KVM_XICS_PRESENTED (1ULL << 43)
678#define KVM_XICS_QUEUED (1ULL << 44) 680#define KVM_XICS_QUEUED (1ULL << 44)
679 681
682/* POWER9 XIVE Native Interrupt Controller */
683#define KVM_DEV_XIVE_GRP_CTRL 1
684#define KVM_DEV_XIVE_RESET 1
685#define KVM_DEV_XIVE_EQ_SYNC 2
686#define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */
687#define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */
688#define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */
689#define KVM_DEV_XIVE_GRP_SOURCE_SYNC 5 /* 64-bit source identifier */
690
691/* Layout of 64-bit XIVE source attribute values */
692#define KVM_XIVE_LEVEL_SENSITIVE (1ULL << 0)
693#define KVM_XIVE_LEVEL_ASSERTED (1ULL << 1)
694
695/* Layout of 64-bit XIVE source configuration attribute values */
696#define KVM_XIVE_SOURCE_PRIORITY_SHIFT 0
697#define KVM_XIVE_SOURCE_PRIORITY_MASK 0x7
698#define KVM_XIVE_SOURCE_SERVER_SHIFT 3
699#define KVM_XIVE_SOURCE_SERVER_MASK 0xfffffff8ULL
700#define KVM_XIVE_SOURCE_MASKED_SHIFT 32
701#define KVM_XIVE_SOURCE_MASKED_MASK 0x100000000ULL
702#define KVM_XIVE_SOURCE_EISN_SHIFT 33
703#define KVM_XIVE_SOURCE_EISN_MASK 0xfffffffe00000000ULL
704
705/* Layout of 64-bit EQ identifier */
706#define KVM_XIVE_EQ_PRIORITY_SHIFT 0
707#define KVM_XIVE_EQ_PRIORITY_MASK 0x7
708#define KVM_XIVE_EQ_SERVER_SHIFT 3
709#define KVM_XIVE_EQ_SERVER_MASK 0xfffffff8ULL
710
711/* Layout of EQ configuration values (64 bytes) */
712struct kvm_ppc_xive_eq {
713 __u32 flags;
714 __u32 qshift;
715 __u64 qaddr;
716 __u32 qtoggle;
717 __u32 qindex;
718 __u8 pad[40];
719};
720
721#define KVM_XIVE_EQ_ALWAYS_NOTIFY 0x00000001
722
723#define KVM_XIVE_TIMA_PAGE_OFFSET 0
724#define KVM_XIVE_ESB_PAGE_OFFSET 4
725
680#endif /* __LINUX_KVM_POWERPC_H */ 726#endif /* __LINUX_KVM_POWERPC_H */
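
The source-configuration attribute is a packed 64-bit value split by the shift/mask pairs above: priority in bits 0-2, server (VP) number in bits 3-31, a masked flag in bit 32 and the effective interrupt source number in bits 33-63. A small self-contained example of how userspace might assemble such a value before writing it through KVM_DEV_XIVE_GRP_SOURCE_CONFIG (field semantics follow the macro names; the concrete numbers are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define KVM_XIVE_SOURCE_PRIORITY_SHIFT	0
#define KVM_XIVE_SOURCE_PRIORITY_MASK	0x7ULL
#define KVM_XIVE_SOURCE_SERVER_SHIFT	3
#define KVM_XIVE_SOURCE_SERVER_MASK	0xfffffff8ULL
#define KVM_XIVE_SOURCE_MASKED_SHIFT	32
#define KVM_XIVE_SOURCE_MASKED_MASK	0x100000000ULL
#define KVM_XIVE_SOURCE_EISN_SHIFT	33
#define KVM_XIVE_SOURCE_EISN_MASK	0xfffffffe00000000ULL

/* Pack priority, server, masked flag and EISN into one attribute value. */
static uint64_t xive_source_config(uint8_t prio, uint32_t server,
				   int masked, uint32_t eisn)
{
	uint64_t val = 0;

	val |= ((uint64_t)prio << KVM_XIVE_SOURCE_PRIORITY_SHIFT) &
		KVM_XIVE_SOURCE_PRIORITY_MASK;
	val |= ((uint64_t)server << KVM_XIVE_SOURCE_SERVER_SHIFT) &
		KVM_XIVE_SOURCE_SERVER_MASK;
	if (masked)
		val |= KVM_XIVE_SOURCE_MASKED_MASK;
	val |= ((uint64_t)eisn << KVM_XIVE_SOURCE_EISN_SHIFT) &
		KVM_XIVE_SOURCE_EISN_MASK;
	return val;
}

int main(void)
{
	/* priority 5, server 16, unmasked, EISN 0x20 -> 0x0000004000000085 */
	printf("0x%016llx\n",
	       (unsigned long long)xive_source_config(5, 16, 0, 0x20));
	return 0;
}

The EQ identifier used with KVM_DEV_XIVE_GRP_EQ_CONFIG is packed the same way from its priority and server fields.
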
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 3223aec88b2c..4c67cc79de7c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -94,7 +94,7 @@ endif
94kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ 94kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
95 book3s_xics.o 95 book3s_xics.o
96 96
97kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o 97kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o book3s_xive_native.o
98kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o 98kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o
99 99
100kvm-book3s_64-module-objs := \ 100kvm-book3s_64-module-objs := \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 10c5579d20ce..61a212d0daf0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -651,6 +651,18 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
651 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); 651 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
652 break; 652 break;
653#endif /* CONFIG_KVM_XICS */ 653#endif /* CONFIG_KVM_XICS */
654#ifdef CONFIG_KVM_XIVE
655 case KVM_REG_PPC_VP_STATE:
656 if (!vcpu->arch.xive_vcpu) {
657 r = -ENXIO;
658 break;
659 }
660 if (xive_enabled())
661 r = kvmppc_xive_native_get_vp(vcpu, val);
662 else
663 r = -ENXIO;
664 break;
665#endif /* CONFIG_KVM_XIVE */
654 case KVM_REG_PPC_FSCR: 666 case KVM_REG_PPC_FSCR:
655 *val = get_reg_val(id, vcpu->arch.fscr); 667 *val = get_reg_val(id, vcpu->arch.fscr);
656 break; 668 break;
@@ -724,6 +736,18 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
724 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); 736 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
725 break; 737 break;
726#endif /* CONFIG_KVM_XICS */ 738#endif /* CONFIG_KVM_XICS */
739#ifdef CONFIG_KVM_XIVE
740 case KVM_REG_PPC_VP_STATE:
741 if (!vcpu->arch.xive_vcpu) {
742 r = -ENXIO;
743 break;
744 }
745 if (xive_enabled())
746 r = kvmppc_xive_native_set_vp(vcpu, val);
747 else
748 r = -ENXIO;
749 break;
750#endif /* CONFIG_KVM_XIVE */
727 case KVM_REG_PPC_FSCR: 751 case KVM_REG_PPC_FSCR:
728 vcpu->arch.fscr = set_reg_val(id, *val); 752 vcpu->arch.fscr = set_reg_val(id, *val);
729 break; 753 break;
@@ -891,6 +915,17 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
891 kvmppc_rtas_tokens_free(kvm); 915 kvmppc_rtas_tokens_free(kvm);
892 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); 916 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
893#endif 917#endif
918
919#ifdef CONFIG_KVM_XICS
920 /*
921 * Free the XIVE devices which are not directly freed by the
922 * device 'release' method
923 */
924 kfree(kvm->arch.xive_devices.native);
925 kvm->arch.xive_devices.native = NULL;
926 kfree(kvm->arch.xive_devices.xics_on_xive);
927 kvm->arch.xive_devices.xics_on_xive = NULL;
928#endif /* CONFIG_KVM_XICS */
894} 929}
895 930
896int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) 931int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
@@ -1050,6 +1085,9 @@ static int kvmppc_book3s_init(void)
1050 if (xics_on_xive()) { 1085 if (xics_on_xive()) {
1051 kvmppc_xive_init_module(); 1086 kvmppc_xive_init_module();
1052 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); 1087 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
1088 kvmppc_xive_native_init_module();
1089 kvm_register_device_ops(&kvm_xive_native_ops,
1090 KVM_DEV_TYPE_XIVE);
1053 } else 1091 } else
1054#endif 1092#endif
1055 kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); 1093 kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
@@ -1060,8 +1098,10 @@ static int kvmppc_book3s_init(void)
1060static void kvmppc_book3s_exit(void) 1098static void kvmppc_book3s_exit(void)
1061{ 1099{
1062#ifdef CONFIG_KVM_XICS 1100#ifdef CONFIG_KVM_XICS
1063 if (xics_on_xive()) 1101 if (xics_on_xive()) {
1064 kvmppc_xive_exit_module(); 1102 kvmppc_xive_exit_module();
1103 kvmppc_xive_native_exit_module();
1104 }
1065#endif 1105#endif
1066#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 1106#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
1067 kvmppc_book3s_exit_pr(); 1107 kvmppc_book3s_exit_pr();
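
The new KVM_REG_PPC_VP_STATE register handled above is a 128-bit one-reg backed by xive_timaval[2] in union kvmppc_one_reg; it only works once the vcpu has been connected to a XIVE native device (created with KVM_CREATE_DEVICE using the KVM_DEV_TYPE_XIVE type registered here). A hedged userspace sketch of reading it, assuming vcpu_fd came from KVM_CREATE_VCPU and that the uapi headers of this series are installed (error handling omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Fetch the 128-bit XIVE VP state of a vcpu into timaval[0..1]. */
static int get_vp_state(int vcpu_fd, uint64_t timaval[2])
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_VP_STATE,	/* KVM_REG_SIZE_U128 */
		.addr = (uintptr_t)timaval,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}

The matching KVM_SET_ONE_REG call restores the state on the destination side of a migration, which is presumably the main consumer of this interface.
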
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f100e331e69b..66270e07449a 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -228,11 +228,33 @@ static void release_spapr_tce_table(struct rcu_head *head)
228 unsigned long i, npages = kvmppc_tce_pages(stt->size); 228 unsigned long i, npages = kvmppc_tce_pages(stt->size);
229 229
230 for (i = 0; i < npages; i++) 230 for (i = 0; i < npages; i++)
231 __free_page(stt->pages[i]); 231 if (stt->pages[i])
232 __free_page(stt->pages[i]);
232 233
233 kfree(stt); 234 kfree(stt);
234} 235}
235 236
237static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
238 unsigned long sttpage)
239{
240 struct page *page = stt->pages[sttpage];
241
242 if (page)
243 return page;
244
245 mutex_lock(&stt->alloc_lock);
246 page = stt->pages[sttpage];
247 if (!page) {
248 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
249 WARN_ON_ONCE(!page);
250 if (page)
251 stt->pages[sttpage] = page;
252 }
253 mutex_unlock(&stt->alloc_lock);
254
255 return page;
256}
257
236static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) 258static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
237{ 259{
238 struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; 260 struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
@@ -241,7 +263,10 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
241 if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) 263 if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
242 return VM_FAULT_SIGBUS; 264 return VM_FAULT_SIGBUS;
243 265
244 page = stt->pages[vmf->pgoff]; 266 page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
267 if (!page)
268 return VM_FAULT_OOM;
269
245 get_page(page); 270 get_page(page);
246 vmf->page = page; 271 vmf->page = page;
247 return 0; 272 return 0;
@@ -296,7 +321,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
296 struct kvmppc_spapr_tce_table *siter; 321 struct kvmppc_spapr_tce_table *siter;
297 unsigned long npages, size = args->size; 322 unsigned long npages, size = args->size;
298 int ret = -ENOMEM; 323 int ret = -ENOMEM;
299 int i;
300 324
301 if (!args->size || args->page_shift < 12 || args->page_shift > 34 || 325 if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
302 (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) 326 (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -318,14 +342,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
318 stt->offset = args->offset; 342 stt->offset = args->offset;
319 stt->size = size; 343 stt->size = size;
320 stt->kvm = kvm; 344 stt->kvm = kvm;
345 mutex_init(&stt->alloc_lock);
321 INIT_LIST_HEAD_RCU(&stt->iommu_tables); 346 INIT_LIST_HEAD_RCU(&stt->iommu_tables);
322 347
323 for (i = 0; i < npages; i++) {
324 stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
325 if (!stt->pages[i])
326 goto fail;
327 }
328
329 mutex_lock(&kvm->lock); 348 mutex_lock(&kvm->lock);
330 349
331 /* Check this LIOBN hasn't been previously allocated */ 350 /* Check this LIOBN hasn't been previously allocated */
@@ -352,17 +371,28 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
352 if (ret >= 0) 371 if (ret >= 0)
353 return ret; 372 return ret;
354 373
355 fail:
356 for (i = 0; i < npages; i++)
357 if (stt->pages[i])
358 __free_page(stt->pages[i]);
359
360 kfree(stt); 374 kfree(stt);
361 fail_acct: 375 fail_acct:
362 kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); 376 kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
363 return ret; 377 return ret;
364} 378}
365 379
380static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
381 unsigned long *ua)
382{
383 unsigned long gfn = tce >> PAGE_SHIFT;
384 struct kvm_memory_slot *memslot;
385
386 memslot = search_memslots(kvm_memslots(kvm), gfn);
387 if (!memslot)
388 return -EINVAL;
389
390 *ua = __gfn_to_hva_memslot(memslot, gfn) |
391 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
392
393 return 0;
394}
395
366static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, 396static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
367 unsigned long tce) 397 unsigned long tce)
368{ 398{
@@ -378,7 +408,7 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
378 if (iommu_tce_check_gpa(stt->page_shift, gpa)) 408 if (iommu_tce_check_gpa(stt->page_shift, gpa))
379 return H_TOO_HARD; 409 return H_TOO_HARD;
380 410
381 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) 411 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
382 return H_TOO_HARD; 412 return H_TOO_HARD;
383 413
384 list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { 414 list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
@@ -397,6 +427,36 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
397 return H_SUCCESS; 427 return H_SUCCESS;
398} 428}
399 429
430/*
431 * Handles TCE requests for emulated devices.
432 * Puts guest TCE values to the table and expects user space to convert them.
433 * Cannot fail so kvmppc_tce_validate must be called before it.
434 */
435static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
436 unsigned long idx, unsigned long tce)
437{
438 struct page *page;
439 u64 *tbl;
440 unsigned long sttpage;
441
442 idx -= stt->offset;
443 sttpage = idx / TCES_PER_PAGE;
444 page = stt->pages[sttpage];
445
446 if (!page) {
447 /* We allow any TCE, not just with read|write permissions */
448 if (!tce)
449 return;
450
451 page = kvm_spapr_get_tce_page(stt, sttpage);
452 if (!page)
453 return;
454 }
455 tbl = page_to_virt(page);
456
457 tbl[idx % TCES_PER_PAGE] = tce;
458}
459
400static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, 460static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
401 unsigned long entry) 461 unsigned long entry)
402{ 462{
@@ -551,7 +611,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
551 611
552 dir = iommu_tce_direction(tce); 612 dir = iommu_tce_direction(tce);
553 613
554 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { 614 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
555 ret = H_PARAMETER; 615 ret = H_PARAMETER;
556 goto unlock_exit; 616 goto unlock_exit;
557 } 617 }
@@ -612,7 +672,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
612 return ret; 672 return ret;
613 673
614 idx = srcu_read_lock(&vcpu->kvm->srcu); 674 idx = srcu_read_lock(&vcpu->kvm->srcu);
615 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { 675 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
616 ret = H_TOO_HARD; 676 ret = H_TOO_HARD;
617 goto unlock_exit; 677 goto unlock_exit;
618 } 678 }
@@ -647,7 +707,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
647 } 707 }
648 tce = be64_to_cpu(tce); 708 tce = be64_to_cpu(tce);
649 709
650 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 710 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua))
651 return H_PARAMETER; 711 return H_PARAMETER;
652 712
653 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 713 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
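
The change above makes TCE backing pages demand-allocated: kvm_vm_ioctl_create_spapr_tce() no longer populates stt->pages[] up front, and kvmppc_tce_put() and the mmap fault handler call kvm_spapr_get_tce_page() to allocate a page the first time a non-zero TCE lands in it. The index arithmetic is a two-level split on TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)); a standalone illustration, using 4K host pages for readability (a 64K-page ppc64 host would have 8192 TCEs per backing page):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(uint64_t))	/* 512 here */

int main(void)
{
	unsigned long offset = 0;	/* stt->offset, in IOMMU pages */
	unsigned long idx = 1234;	/* ioba >> stt->page_shift */

	idx -= offset;
	/* Only stt->pages[2] has to exist to store this entry. */
	printf("backing page %lu, slot %lu\n",
	       idx / TCES_PER_PAGE, idx % TCES_PER_PAGE);	/* 2, 210 */
	return 0;
}
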
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 2206bc729b9a..484b47fa3960 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -66,8 +66,6 @@
66 66
67#endif 67#endif
68 68
69#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
70
71/* 69/*
72 * Finds a TCE table descriptor by LIOBN. 70 * Finds a TCE table descriptor by LIOBN.
73 * 71 *
@@ -88,6 +86,25 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
88EXPORT_SYMBOL_GPL(kvmppc_find_table); 86EXPORT_SYMBOL_GPL(kvmppc_find_table);
89 87
90#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 88#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
89static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
90 unsigned long *ua, unsigned long **prmap)
91{
92 unsigned long gfn = tce >> PAGE_SHIFT;
93 struct kvm_memory_slot *memslot;
94
95 memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
96 if (!memslot)
97 return -EINVAL;
98
99 *ua = __gfn_to_hva_memslot(memslot, gfn) |
100 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
101
102 if (prmap)
103 *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
104
105 return 0;
106}
107
91/* 108/*
92 * Validates TCE address. 109 * Validates TCE address.
93 * At the moment flags and page mask are validated. 110 * At the moment flags and page mask are validated.
@@ -111,7 +128,7 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
111 if (iommu_tce_check_gpa(stt->page_shift, gpa)) 128 if (iommu_tce_check_gpa(stt->page_shift, gpa))
112 return H_PARAMETER; 129 return H_PARAMETER;
113 130
114 if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) 131 if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL))
115 return H_TOO_HARD; 132 return H_TOO_HARD;
116 133
117 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 134 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -129,7 +146,6 @@ static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
129 146
130 return H_SUCCESS; 147 return H_SUCCESS;
131} 148}
132#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
133 149
134/* Note on the use of page_address() in real mode, 150/* Note on the use of page_address() in real mode,
135 * 151 *
@@ -161,13 +177,9 @@ static u64 *kvmppc_page_address(struct page *page)
161/* 177/*
162 * Handles TCE requests for emulated devices. 178 * Handles TCE requests for emulated devices.
163 * Puts guest TCE values to the table and expects user space to convert them. 179 * Puts guest TCE values to the table and expects user space to convert them.
164 * Called in both real and virtual modes. 180 * Cannot fail so kvmppc_rm_tce_validate must be called before it.
165 * Cannot fail so kvmppc_tce_validate must be called before it.
166 *
167 * WARNING: This will be called in real-mode on HV KVM and virtual
168 * mode on PR KVM
169 */ 181 */
170void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, 182static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt,
171 unsigned long idx, unsigned long tce) 183 unsigned long idx, unsigned long tce)
172{ 184{
173 struct page *page; 185 struct page *page;
@@ -175,35 +187,48 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
175 187
176 idx -= stt->offset; 188 idx -= stt->offset;
177 page = stt->pages[idx / TCES_PER_PAGE]; 189 page = stt->pages[idx / TCES_PER_PAGE];
190 /*
191 * page must not be NULL in real mode,
192 * kvmppc_rm_ioba_validate() must have taken care of this.
193 */
194 WARN_ON_ONCE_RM(!page);
178 tbl = kvmppc_page_address(page); 195 tbl = kvmppc_page_address(page);
179 196
180 tbl[idx % TCES_PER_PAGE] = tce; 197 tbl[idx % TCES_PER_PAGE] = tce;
181} 198}
182EXPORT_SYMBOL_GPL(kvmppc_tce_put);
183 199
184long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, 200/*
185 unsigned long *ua, unsigned long **prmap) 201 * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so
202 * in real mode.
203 * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is
204 * allocated or not required (when clearing a tce entry).
205 */
206static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
207 unsigned long ioba, unsigned long npages, bool clearing)
186{ 208{
187 unsigned long gfn = tce >> PAGE_SHIFT; 209 unsigned long i, idx, sttpage, sttpages;
188 struct kvm_memory_slot *memslot; 210 unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
189 211
190 memslot = search_memslots(kvm_memslots(kvm), gfn); 212 if (ret)
191 if (!memslot) 213 return ret;
192 return -EINVAL; 214 /*
193 215 * clearing==true says kvmppc_rm_tce_put won't be allocating pages
194 *ua = __gfn_to_hva_memslot(memslot, gfn) | 216 * for empty tces.
195 (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); 217 */
218 if (clearing)
219 return H_SUCCESS;
196 220
197#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 221 idx = (ioba >> stt->page_shift) - stt->offset;
198 if (prmap) 222 sttpage = idx / TCES_PER_PAGE;
199 *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; 223 sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
200#endif 224 TCES_PER_PAGE;
225 for (i = sttpage; i < sttpage + sttpages; ++i)
226 if (!stt->pages[i])
227 return H_TOO_HARD;
201 228
202 return 0; 229 return H_SUCCESS;
203} 230}
204EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
205 231
206#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
207static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 232static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
208 unsigned long entry, unsigned long *hpa, 233 unsigned long entry, unsigned long *hpa,
209 enum dma_data_direction *direction) 234 enum dma_data_direction *direction)
@@ -381,7 +406,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
381 if (!stt) 406 if (!stt)
382 return H_TOO_HARD; 407 return H_TOO_HARD;
383 408
384 ret = kvmppc_ioba_validate(stt, ioba, 1); 409 ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
385 if (ret != H_SUCCESS) 410 if (ret != H_SUCCESS)
386 return ret; 411 return ret;
387 412
@@ -390,7 +415,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
390 return ret; 415 return ret;
391 416
392 dir = iommu_tce_direction(tce); 417 dir = iommu_tce_direction(tce);
393 if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 418 if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
394 return H_PARAMETER; 419 return H_PARAMETER;
395 420
396 entry = ioba >> stt->page_shift; 421 entry = ioba >> stt->page_shift;
@@ -409,7 +434,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
409 } 434 }
410 } 435 }
411 436
412 kvmppc_tce_put(stt, entry, tce); 437 kvmppc_rm_tce_put(stt, entry, tce);
413 438
414 return H_SUCCESS; 439 return H_SUCCESS;
415} 440}
@@ -480,7 +505,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
480 if (tce_list & (SZ_4K - 1)) 505 if (tce_list & (SZ_4K - 1))
481 return H_PARAMETER; 506 return H_PARAMETER;
482 507
483 ret = kvmppc_ioba_validate(stt, ioba, npages); 508 ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
484 if (ret != H_SUCCESS) 509 if (ret != H_SUCCESS)
485 return ret; 510 return ret;
486 511
@@ -492,7 +517,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
492 */ 517 */
493 struct mm_iommu_table_group_mem_t *mem; 518 struct mm_iommu_table_group_mem_t *mem;
494 519
495 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) 520 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
496 return H_TOO_HARD; 521 return H_TOO_HARD;
497 522
498 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); 523 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -508,7 +533,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
508 * We do not require memory to be preregistered in this case 533 * We do not require memory to be preregistered in this case
509 * so lock rmap and do __find_linux_pte_or_hugepte(). 534 * so lock rmap and do __find_linux_pte_or_hugepte().
510 */ 535 */
511 if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) 536 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
512 return H_TOO_HARD; 537 return H_TOO_HARD;
513 538
514 rmap = (void *) vmalloc_to_phys(rmap); 539 rmap = (void *) vmalloc_to_phys(rmap);
@@ -542,7 +567,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
542 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); 567 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
543 568
544 ua = 0; 569 ua = 0;
545 if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) 570 if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
546 return H_PARAMETER; 571 return H_PARAMETER;
547 572
548 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { 573 list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -557,7 +582,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
557 } 582 }
558 } 583 }
559 584
560 kvmppc_tce_put(stt, entry + i, tce); 585 kvmppc_rm_tce_put(stt, entry + i, tce);
561 } 586 }
562 587
563unlock_exit: 588unlock_exit:
@@ -583,7 +608,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
583 if (!stt) 608 if (!stt)
584 return H_TOO_HARD; 609 return H_TOO_HARD;
585 610
586 ret = kvmppc_ioba_validate(stt, ioba, npages); 611 ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
587 if (ret != H_SUCCESS) 612 if (ret != H_SUCCESS)
588 return ret; 613 return ret;
589 614
@@ -610,7 +635,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
610 } 635 }
611 636
612 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) 637 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
613 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); 638 kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
614 639
615 return H_SUCCESS; 640 return H_SUCCESS;
616} 641}
@@ -635,6 +660,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
635 660
636 idx = (ioba >> stt->page_shift) - stt->offset; 661 idx = (ioba >> stt->page_shift) - stt->offset;
637 page = stt->pages[idx / TCES_PER_PAGE]; 662 page = stt->pages[idx / TCES_PER_PAGE];
663 if (!page) {
664 vcpu->arch.regs.gpr[4] = 0;
665 return H_SUCCESS;
666 }
638 tbl = (u64 *)page_address(page); 667 tbl = (u64 *)page_address(page);
639 668
640 vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; 669 vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
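
Real mode cannot call alloc_page(), so the real-mode path gets a pre-check instead: kvmppc_rm_ioba_validate() computes which backing pages the update would touch and punts to the virtual-mode handler (H_TOO_HARD) if any of them is still missing, unless the request only clears entries. The page-span arithmetic in isolation (ALIGN_UP here mirrors the kernel's _ALIGN_UP):

#include <stdio.h>

#define TCES_PER_PAGE	512UL				/* 4K host pages */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long idx = 510;	/* first TCE index, offset-relative */
	unsigned long npages = 4;	/* number of TCEs to update */
	unsigned long sttpage = idx / TCES_PER_PAGE;
	unsigned long sttpages = ALIGN_UP(idx % TCES_PER_PAGE + npages,
					  TCES_PER_PAGE) / TCES_PER_PAGE;

	/* Four entries starting at index 510 straddle two backing pages. */
	printf("must check stt->pages[%lu..%lu]\n",
	       sttpage, sttpage + sttpages - 1);	/* pages[0..1] */
	return 0;
}
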
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7bdcd4d7a9f0..d5fc624e0655 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -750,7 +750,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
750 /* 750 /*
751 * Ensure that the read of vcore->dpdes comes after the read 751 * Ensure that the read of vcore->dpdes comes after the read
752 * of vcpu->doorbell_request. This barrier matches the 752 * of vcpu->doorbell_request. This barrier matches the
753 * smb_wmb() in kvmppc_guest_entry_inject(). 753 * smp_wmb() in kvmppc_guest_entry_inject().
754 */ 754 */
755 smp_rmb(); 755 smp_rmb();
756 vc = vcpu->arch.vcore; 756 vc = vcpu->arch.vcore;
@@ -802,6 +802,80 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
802 } 802 }
803} 803}
804 804
805/* Copy guest memory in place - must reside within a single memslot */
806static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
807 unsigned long len)
808{
809 struct kvm_memory_slot *to_memslot = NULL;
810 struct kvm_memory_slot *from_memslot = NULL;
811 unsigned long to_addr, from_addr;
812 int r;
813
814 /* Get HPA for from address */
815 from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
816 if (!from_memslot)
817 return -EFAULT;
818 if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
819 << PAGE_SHIFT))
820 return -EINVAL;
821 from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
822 if (kvm_is_error_hva(from_addr))
823 return -EFAULT;
824 from_addr |= (from & (PAGE_SIZE - 1));
825
826 /* Get HPA for to address */
827 to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
828 if (!to_memslot)
829 return -EFAULT;
830 if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
831 << PAGE_SHIFT))
832 return -EINVAL;
833 to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
834 if (kvm_is_error_hva(to_addr))
835 return -EFAULT;
836 to_addr |= (to & (PAGE_SIZE - 1));
837
838 /* Perform copy */
839 r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
840 len);
841 if (r)
842 return -EFAULT;
843 mark_page_dirty(kvm, to >> PAGE_SHIFT);
844 return 0;
845}
846
847static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
848 unsigned long dest, unsigned long src)
849{
850 u64 pg_sz = SZ_4K; /* 4K page size */
851 u64 pg_mask = SZ_4K - 1;
852 int ret;
853
854 /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
855 if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
856 H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
857 return H_PARAMETER;
858
859 /* dest (and src if copy_page flag set) must be page aligned */
860 if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
861 return H_PARAMETER;
862
863 /* zero and/or copy the page as determined by the flags */
864 if (flags & H_COPY_PAGE) {
865 ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
866 if (ret < 0)
867 return H_PARAMETER;
868 } else if (flags & H_ZERO_PAGE) {
869 ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
870 if (ret < 0)
871 return H_PARAMETER;
872 }
873
874 /* We can ignore the remaining flags */
875
876 return H_SUCCESS;
877}
878
805static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) 879static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
806{ 880{
807 struct kvmppc_vcore *vcore = target->arch.vcore; 881 struct kvmppc_vcore *vcore = target->arch.vcore;
@@ -1004,6 +1078,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
1004 if (nesting_enabled(vcpu->kvm)) 1078 if (nesting_enabled(vcpu->kvm))
1005 ret = kvmhv_copy_tofrom_guest_nested(vcpu); 1079 ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1006 break; 1080 break;
1081 case H_PAGE_INIT:
1082 ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1083 kvmppc_get_gpr(vcpu, 5),
1084 kvmppc_get_gpr(vcpu, 6));
1085 break;
1007 default: 1086 default:
1008 return RESUME_HOST; 1087 return RESUME_HOST;
1009 } 1088 }
@@ -1048,6 +1127,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
1048 case H_IPOLL: 1127 case H_IPOLL:
1049 case H_XIRR_X: 1128 case H_XIRR_X:
1050#endif 1129#endif
1130 case H_PAGE_INIT:
1051 return 1; 1131 return 1;
1052 } 1132 }
1053 1133
@@ -2505,37 +2585,6 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2505 } 2585 }
2506} 2586}
2507 2587
2508static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
2509 struct kvm_nested_guest *nested)
2510{
2511 cpumask_t *need_tlb_flush;
2512 int lpid;
2513
2514 if (!cpu_has_feature(CPU_FTR_HVMODE))
2515 return;
2516
2517 if (cpu_has_feature(CPU_FTR_ARCH_300))
2518 pcpu &= ~0x3UL;
2519
2520 if (nested) {
2521 lpid = nested->shadow_lpid;
2522 need_tlb_flush = &nested->need_tlb_flush;
2523 } else {
2524 lpid = kvm->arch.lpid;
2525 need_tlb_flush = &kvm->arch.need_tlb_flush;
2526 }
2527
2528 mtspr(SPRN_LPID, lpid);
2529 isync();
2530 smp_mb();
2531
2532 if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
2533 radix__local_flush_tlb_lpid_guest(lpid);
2534 /* Clear the bit after the TLB flush */
2535 cpumask_clear_cpu(pcpu, need_tlb_flush);
2536 }
2537}
2538
2539static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 2588static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
2540{ 2589{
2541 int cpu; 2590 int cpu;
@@ -3229,19 +3278,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3229 for (sub = 0; sub < core_info.n_subcores; ++sub) 3278 for (sub = 0; sub < core_info.n_subcores; ++sub)
3230 spin_unlock(&core_info.vc[sub]->lock); 3279 spin_unlock(&core_info.vc[sub]->lock);
3231 3280
3232 if (kvm_is_radix(vc->kvm)) { 3281 guest_enter_irqoff();
3233 /* 3282
3234 * Do we need to flush the process scoped TLB for the LPAR? 3283 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
3235 * 3284
3236 * On POWER9, individual threads can come in here, but the 3285 this_cpu_disable_ftrace();
3237 * TLB is shared between the 4 threads in a core, hence
3238 * invalidating on one thread invalidates for all.
3239 * Thus we make all 4 threads use the same bit here.
3240 *
3241 * Hash must be flushed in realmode in order to use tlbiel.
3242 */
3243 kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
3244 }
3245 3286
3246 /* 3287 /*
3247 * Interrupts will be enabled once we get into the guest, 3288 * Interrupts will be enabled once we get into the guest,
@@ -3249,19 +3290,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
3249 */ 3290 */
3250 trace_hardirqs_on(); 3291 trace_hardirqs_on();
3251 3292
3252 guest_enter_irqoff();
3253
3254 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
3255
3256 this_cpu_disable_ftrace();
3257
3258 trap = __kvmppc_vcore_entry(); 3293 trap = __kvmppc_vcore_entry();
3259 3294
3295 trace_hardirqs_off();
3296
3260 this_cpu_enable_ftrace(); 3297 this_cpu_enable_ftrace();
3261 3298
3262 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 3299 srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
3263 3300
3264 trace_hardirqs_off();
3265 set_irq_happened(trap); 3301 set_irq_happened(trap);
3266 3302
3267 spin_lock(&vc->lock); 3303 spin_lock(&vc->lock);
@@ -3514,6 +3550,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3514#ifdef CONFIG_ALTIVEC 3550#ifdef CONFIG_ALTIVEC
3515 load_vr_state(&vcpu->arch.vr); 3551 load_vr_state(&vcpu->arch.vr);
3516#endif 3552#endif
3553 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
3517 3554
3518 mtspr(SPRN_DSCR, vcpu->arch.dscr); 3555 mtspr(SPRN_DSCR, vcpu->arch.dscr);
3519 mtspr(SPRN_IAMR, vcpu->arch.iamr); 3556 mtspr(SPRN_IAMR, vcpu->arch.iamr);
@@ -3605,6 +3642,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3605#ifdef CONFIG_ALTIVEC 3642#ifdef CONFIG_ALTIVEC
3606 store_vr_state(&vcpu->arch.vr); 3643 store_vr_state(&vcpu->arch.vr);
3607#endif 3644#endif
3645 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
3608 3646
3609 if (cpu_has_feature(CPU_FTR_TM) || 3647 if (cpu_has_feature(CPU_FTR_TM) ||
3610 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) 3648 cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
@@ -3970,7 +4008,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
3970 unsigned long lpcr) 4008 unsigned long lpcr)
3971{ 4009{
3972 int trap, r, pcpu; 4010 int trap, r, pcpu;
3973 int srcu_idx; 4011 int srcu_idx, lpid;
3974 struct kvmppc_vcore *vc; 4012 struct kvmppc_vcore *vc;
3975 struct kvm *kvm = vcpu->kvm; 4013 struct kvm *kvm = vcpu->kvm;
3976 struct kvm_nested_guest *nested = vcpu->arch.nested; 4014 struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -4046,8 +4084,12 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
4046 vc->vcore_state = VCORE_RUNNING; 4084 vc->vcore_state = VCORE_RUNNING;
4047 trace_kvmppc_run_core(vc, 0); 4085 trace_kvmppc_run_core(vc, 0);
4048 4086
4049 if (cpu_has_feature(CPU_FTR_HVMODE)) 4087 if (cpu_has_feature(CPU_FTR_HVMODE)) {
4050 kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); 4088 lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
4089 mtspr(SPRN_LPID, lpid);
4090 isync();
4091 kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
4092 }
4051 4093
4052 trace_hardirqs_on(); 4094 trace_hardirqs_on();
4053 guest_enter_irqoff(); 4095 guest_enter_irqoff();
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index b0cf22477e87..6035d24f1d1d 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -805,3 +805,60 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
805 vcpu->arch.doorbell_request = 0; 805 vcpu->arch.doorbell_request = 0;
806 } 806 }
807} 807}
808
809static void flush_guest_tlb(struct kvm *kvm)
810{
811 unsigned long rb, set;
812
813 rb = PPC_BIT(52); /* IS = 2 */
814 if (kvm_is_radix(kvm)) {
815 /* R=1 PRS=1 RIC=2 */
816 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
817 : : "r" (rb), "i" (1), "i" (1), "i" (2),
818 "r" (0) : "memory");
819 for (set = 1; set < kvm->arch.tlb_sets; ++set) {
820 rb += PPC_BIT(51); /* increment set number */
821 /* R=1 PRS=1 RIC=0 */
822 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
823 : : "r" (rb), "i" (1), "i" (1), "i" (0),
824 "r" (0) : "memory");
825 }
826 } else {
827 for (set = 0; set < kvm->arch.tlb_sets; ++set) {
828 /* R=0 PRS=0 RIC=0 */
829 asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
830 : : "r" (rb), "i" (0), "i" (0), "i" (0),
831 "r" (0) : "memory");
832 rb += PPC_BIT(51); /* increment set number */
833 }
834 }
835 asm volatile("ptesync": : :"memory");
836}
837
838void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
839 struct kvm_nested_guest *nested)
840{
841 cpumask_t *need_tlb_flush;
842
843 /*
844 * On POWER9, individual threads can come in here, but the
845 * TLB is shared between the 4 threads in a core, hence
846 * invalidating on one thread invalidates for all.
847 * Thus we make all 4 threads use the same bit.
848 */
849 if (cpu_has_feature(CPU_FTR_ARCH_300))
850 pcpu = cpu_first_thread_sibling(pcpu);
851
852 if (nested)
853 need_tlb_flush = &nested->need_tlb_flush;
854 else
855 need_tlb_flush = &kvm->arch.need_tlb_flush;
856
857 if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
858 flush_guest_tlb(kvm);
859
860 /* Clear the bit after the TLB flush */
861 cpumask_clear_cpu(pcpu, need_tlb_flush);
862 }
863}
864EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush);
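
kvmppc_check_need_tlb_flush() now carries the comment explaining why the flush bookkeeping is per core rather than per thread on POWER9: the TLB is shared by the four threads of a core, so every thread maps to the bit of its first sibling and the flush runs at most once per core. The rounding is the same pcpu &= ~0x3UL done by the code removed from book3s_hv.c; a tiny standalone demonstration (SMT4 assumed, matching the comment):

#include <stdio.h>

#define THREADS_PER_CORE	4	/* POWER9 SMT4, per the comment above */

int main(void)
{
	int pcpu;

	/* cpu_first_thread_sibling(): all threads of a core share one bit
	 * in the need_tlb_flush cpumask. */
	for (pcpu = 40; pcpu < 44; pcpu++)
		printf("cpu %d -> bit %d\n",
		       pcpu, pcpu & ~(THREADS_PER_CORE - 1));
	return 0;
}
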
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 3b3791ed74a6..8431ad1e8391 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -13,6 +13,7 @@
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/log2.h> 15#include <linux/log2.h>
16#include <linux/sizes.h>
16 17
17#include <asm/trace.h> 18#include <asm/trace.h>
18#include <asm/kvm_ppc.h> 19#include <asm/kvm_ppc.h>
@@ -867,6 +868,149 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
867 return ret; 868 return ret;
868} 869}
869 870
871static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
872 int writing, unsigned long *hpa,
873 struct kvm_memory_slot **memslot_p)
874{
875 struct kvm *kvm = vcpu->kvm;
876 struct kvm_memory_slot *memslot;
877 unsigned long gfn, hva, pa, psize = PAGE_SHIFT;
878 unsigned int shift;
879 pte_t *ptep, pte;
880
881 /* Find the memslot for this address */
882 gfn = gpa >> PAGE_SHIFT;
883 memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
884 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
885 return H_PARAMETER;
886
887 /* Translate to host virtual address */
888 hva = __gfn_to_hva_memslot(memslot, gfn);
889
890 /* Try to find the host pte for that virtual address */
891 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
892 if (!ptep)
893 return H_TOO_HARD;
894 pte = kvmppc_read_update_linux_pte(ptep, writing);
895 if (!pte_present(pte))
896 return H_TOO_HARD;
897
898 /* Convert to a physical address */
899 if (shift)
900 psize = 1UL << shift;
901 pa = pte_pfn(pte) << PAGE_SHIFT;
902 pa |= hva & (psize - 1);
903 pa |= gpa & ~PAGE_MASK;
904
905 if (hpa)
906 *hpa = pa;
907 if (memslot_p)
908 *memslot_p = memslot;
909
910 return H_SUCCESS;
911}
912
913static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
914 unsigned long dest)
915{
916 struct kvm_memory_slot *memslot;
917 struct kvm *kvm = vcpu->kvm;
918 unsigned long pa, mmu_seq;
919 long ret = H_SUCCESS;
920 int i;
921
922 /* Used later to detect if we might have been invalidated */
923 mmu_seq = kvm->mmu_notifier_seq;
924 smp_rmb();
925
926 ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot);
927 if (ret != H_SUCCESS)
928 return ret;
929
930 /* Check if we've been invalidated */
931 raw_spin_lock(&kvm->mmu_lock.rlock);
932 if (mmu_notifier_retry(kvm, mmu_seq)) {
933 ret = H_TOO_HARD;
934 goto out_unlock;
935 }
936
937 /* Zero the page */
938 for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
939 dcbz((void *)pa);
940 kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
941
942out_unlock:
943 raw_spin_unlock(&kvm->mmu_lock.rlock);
944 return ret;
945}
946
947static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
948 unsigned long dest, unsigned long src)
949{
950 unsigned long dest_pa, src_pa, mmu_seq;
951 struct kvm_memory_slot *dest_memslot;
952 struct kvm *kvm = vcpu->kvm;
953 long ret = H_SUCCESS;
954
955 /* Used later to detect if we might have been invalidated */
956 mmu_seq = kvm->mmu_notifier_seq;
957 smp_rmb();
958
959 ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot);
960 if (ret != H_SUCCESS)
961 return ret;
962 ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL);
963 if (ret != H_SUCCESS)
964 return ret;
965
966 /* Check if we've been invalidated */
967 raw_spin_lock(&kvm->mmu_lock.rlock);
968 if (mmu_notifier_retry(kvm, mmu_seq)) {
969 ret = H_TOO_HARD;
970 goto out_unlock;
971 }
972
973 /* Copy the page */
974 memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
975
976 kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
977
978out_unlock:
979 raw_spin_unlock(&kvm->mmu_lock.rlock);
980 return ret;
981}
982
983long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
984 unsigned long dest, unsigned long src)
985{
986 struct kvm *kvm = vcpu->kvm;
987 u64 pg_mask = SZ_4K - 1; /* 4K page size */
988 long ret = H_SUCCESS;
989
990 /* Don't handle radix mode here, go up to the virtual mode handler */
991 if (kvm_is_radix(kvm))
992 return H_TOO_HARD;
993
994 /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
995 if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
996 H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
997 return H_PARAMETER;
998
999 /* dest (and src if copy_page flag set) must be page aligned */
1000 if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
1001 return H_PARAMETER;
1002
1003 /* zero and/or copy the page as determined by the flags */
1004 if (flags & H_COPY_PAGE)
1005 ret = kvmppc_do_h_page_init_copy(vcpu, dest, src);
1006 else if (flags & H_ZERO_PAGE)
1007 ret = kvmppc_do_h_page_init_zero(vcpu, dest);
1008
1009 /* We can ignore the other flags */
1010
1011 return ret;
1012}
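Editor's note: for reference, a sketch of how a pseries guest would issue the hcall handled above, assuming the usual plpar_hcall_norets() wrapper and the H_PAGE_INIT / H_ZERO_PAGE / H_COPY_PAGE definitions from asm/hvcall.h; this guest-side code is not part of the patch.

#include <asm/hvcall.h>
#include <asm/plpar_wrappers.h>

static long guest_zero_page(unsigned long dest_ra)
{
	/* dest_ra: 4K-aligned guest real address of the page to clear */
	return plpar_hcall_norets(H_PAGE_INIT, H_ZERO_PAGE, dest_ra, 0);
}

static long guest_copy_page(unsigned long dest_ra, unsigned long src_ra)
{
	return plpar_hcall_norets(H_PAGE_INIT, H_COPY_PAGE, dest_ra, src_ra);
}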
1013
870void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, 1014void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
871 unsigned long pte_index) 1015 unsigned long pte_index)
872{ 1016{
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index dd014308f065..f9b2620fbecd 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -589,11 +589,8 @@ kvmppc_hv_entry:
5891: 5891:
590#endif 590#endif
591 591
592 /* Use cr7 as an indication of radix mode */
593 ld r5, HSTATE_KVM_VCORE(r13) 592 ld r5, HSTATE_KVM_VCORE(r13)
594 ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ 593 ld r9, VCORE_KVM(r5) /* pointer to struct kvm */
595 lbz r0, KVM_RADIX(r9)
596 cmpwi cr7, r0, 0
597 594
598 /* 595 /*
599 * POWER7/POWER8 host -> guest partition switch code. 596 * POWER7/POWER8 host -> guest partition switch code.
@@ -616,9 +613,6 @@ kvmppc_hv_entry:
616 cmpwi r6,0 613 cmpwi r6,0
617 bne 10f 614 bne 10f
618 615
619 /* Radix has already switched LPID and flushed core TLB */
620 bne cr7, 22f
621
622 lwz r7,KVM_LPID(r9) 616 lwz r7,KVM_LPID(r9)
623BEGIN_FTR_SECTION 617BEGIN_FTR_SECTION
624 ld r6,KVM_SDR1(r9) 618 ld r6,KVM_SDR1(r9)
@@ -630,41 +624,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
630 mtspr SPRN_LPID,r7 624 mtspr SPRN_LPID,r7
631 isync 625 isync
632 626
633 /* See if we need to flush the TLB. Hash has to be done in RM */ 627 /* See if we need to flush the TLB. */
634 lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ 628 mr r3, r9 /* kvm pointer */
635BEGIN_FTR_SECTION 629 lhz r4, PACAPACAINDEX(r13) /* physical cpu number */
636 /* 630 li r5, 0 /* nested vcpu pointer */
637 * On POWER9, individual threads can come in here, but the 631 bl kvmppc_check_need_tlb_flush
638 * TLB is shared between the 4 threads in a core, hence 632 nop
639 * invalidating on one thread invalidates for all. 633 ld r5, HSTATE_KVM_VCORE(r13)
640 * Thus we make all 4 threads use the same bit here.
641 */
642 clrrdi r6,r6,2
643END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
644 clrldi r7,r6,64-6 /* extract bit number (6 bits) */
645 srdi r6,r6,6 /* doubleword number */
646 sldi r6,r6,3 /* address offset */
647 add r6,r6,r9
648 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
649 li r8,1
650 sld r8,r8,r7
651 ld r7,0(r6)
652 and. r7,r7,r8
653 beq 22f
654 /* Flush the TLB of any entries for this LPID */
655 lwz r0,KVM_TLB_SETS(r9)
656 mtctr r0
657 li r7,0x800 /* IS field = 0b10 */
658 ptesync
659 li r0,0 /* RS for P9 version of tlbiel */
66028: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */
661 addi r7,r7,0x1000
662 bdnz 28b
663 ptesync
66423: ldarx r7,0,r6 /* clear the bit after TLB flushed */
665 andc r7,r7,r8
666 stdcx. r7,0,r6
667 bne 23b
668 634
669 /* Add timebase offset onto timebase */ 635 /* Add timebase offset onto timebase */
67022: ld r8,VCORE_TB_OFFSET(r5) 63622: ld r8,VCORE_TB_OFFSET(r5)
@@ -980,17 +946,27 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
980 946
981#ifdef CONFIG_KVM_XICS 947#ifdef CONFIG_KVM_XICS
982 /* We are entering the guest on that thread, push VCPU to XIVE */ 948 /* We are entering the guest on that thread, push VCPU to XIVE */
983 ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
984 cmpldi cr0, r10, 0
985 beq no_xive
986 ld r11, VCPU_XIVE_SAVED_STATE(r4) 949 ld r11, VCPU_XIVE_SAVED_STATE(r4)
987 li r9, TM_QW1_OS 950 li r9, TM_QW1_OS
951 lwz r8, VCPU_XIVE_CAM_WORD(r4)
952 li r7, TM_QW1_OS + TM_WORD2
953 mfmsr r0
954 andi. r0, r0, MSR_DR /* in real mode? */
955 beq 2f
956 ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
957 cmpldi cr1, r10, 0
958 beq cr1, no_xive
959 eieio
960 stdx r11,r9,r10
961 stwx r8,r7,r10
962 b 3f
9632: ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
964 cmpldi cr1, r10, 0
965 beq cr1, no_xive
988 eieio 966 eieio
989 stdcix r11,r9,r10 967 stdcix r11,r9,r10
990 lwz r11, VCPU_XIVE_CAM_WORD(r4) 968 stwcix r8,r7,r10
991 li r9, TM_QW1_OS + TM_WORD2 9693: li r9, 1
992 stwcix r11,r9,r10
993 li r9, 1
994 stb r9, VCPU_XIVE_PUSHED(r4) 970 stb r9, VCPU_XIVE_PUSHED(r4)
995 eieio 971 eieio
996 972
@@ -1009,12 +985,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
1009 * on, we mask it. 985 * on, we mask it.
1010 */ 986 */
1011 lbz r0, VCPU_XIVE_ESC_ON(r4) 987 lbz r0, VCPU_XIVE_ESC_ON(r4)
1012 cmpwi r0,0 988 cmpwi cr1, r0,0
1013 beq 1f 989 beq cr1, 1f
1014 ld r10, VCPU_XIVE_ESC_RADDR(r4)
1015 li r9, XIVE_ESB_SET_PQ_01 990 li r9, XIVE_ESB_SET_PQ_01
991 beq 4f /* in real mode? */
992 ld r10, VCPU_XIVE_ESC_VADDR(r4)
993 ldx r0, r10, r9
994 b 5f
9954: ld r10, VCPU_XIVE_ESC_RADDR(r4)
1016 ldcix r0, r10, r9 996 ldcix r0, r10, r9
1017 sync 9975: sync
1018 998
1019 /* We have a possible subtle race here: The escalation interrupt might 999 /* We have a possible subtle race here: The escalation interrupt might
1020 * have fired and be on its way to the host queue while we mask it, 1000 * have fired and be on its way to the host queue while we mask it,
@@ -2292,7 +2272,7 @@ hcall_real_table:
2292#endif 2272#endif
2293 .long 0 /* 0x24 - H_SET_SPRG0 */ 2273 .long 0 /* 0x24 - H_SET_SPRG0 */
2294 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table 2274 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
2295 .long 0 /* 0x2c */ 2275 .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table
2296 .long 0 /* 0x30 */ 2276 .long 0 /* 0x30 */
2297 .long 0 /* 0x34 */ 2277 .long 0 /* 0x34 */
2298 .long 0 /* 0x38 */ 2278 .long 0 /* 0x38 */
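Editor's note: the hunks above replace the open-coded need_tlb_flush test and tlbiel loop with a call to kvmppc_check_need_tlb_flush(kvm, pcpu, NULL) before guest entry. A rough, hedged C sketch of the logic that moves out of assembly follows; the need_tlb_flush cpumask is named in the removed code, while the flush helper is a hypothetical stand-in, not the series' exact implementation.

#include <linux/kvm_host.h>
#include <linux/cpumask.h>
#include <asm/cputable.h>

/* hypothetical stand-in for the tlbiel loop the removed assembly ran */
static void flush_lpid_tlb(struct kvm *kvm);

static void need_tlb_flush_sketch(struct kvm *kvm, int pcpu)
{
	/*
	 * On POWER9 the TLB is shared by the 4 threads of a core, so all
	 * threads test and clear the core's first-thread bit (the removed
	 * assembly did "clrrdi r6,r6,2" for the same reason).
	 */
	if (cpu_has_feature(CPU_FTR_ARCH_300))
		pcpu &= ~0x3;

	if (cpumask_test_cpu(pcpu, &kvm->arch.need_tlb_flush)) {
		flush_lpid_tlb(kvm);
		/* clear the bit only after the TLB has been flushed */
		cpumask_clear_cpu(pcpu, &kvm->arch.need_tlb_flush);
	}
}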
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index f78d002f0fe0..4953957333b7 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,7 +166,8 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
166 return IRQ_HANDLED; 166 return IRQ_HANDLED;
167} 167}
168 168
169static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) 169int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
170 bool single_escalation)
170{ 171{
171 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 172 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
172 struct xive_q *q = &xc->queues[prio]; 173 struct xive_q *q = &xc->queues[prio];
@@ -185,7 +186,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
185 return -EIO; 186 return -EIO;
186 } 187 }
187 188
188 if (xc->xive->single_escalation) 189 if (single_escalation)
189 name = kasprintf(GFP_KERNEL, "kvm-%d-%d", 190 name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
190 vcpu->kvm->arch.lpid, xc->server_num); 191 vcpu->kvm->arch.lpid, xc->server_num);
191 else 192 else
@@ -217,7 +218,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
217 * interrupt, thus leaving it effectively masked after 218 * interrupt, thus leaving it effectively masked after
218 * it fires once. 219 * it fires once.
219 */ 220 */
220 if (xc->xive->single_escalation) { 221 if (single_escalation) {
221 struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); 222 struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
222 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 223 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
223 224
@@ -291,7 +292,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
291 continue; 292 continue;
292 rc = xive_provision_queue(vcpu, prio); 293 rc = xive_provision_queue(vcpu, prio);
293 if (rc == 0 && !xive->single_escalation) 294 if (rc == 0 && !xive->single_escalation)
294 xive_attach_escalation(vcpu, prio); 295 kvmppc_xive_attach_escalation(vcpu, prio,
296 xive->single_escalation);
295 if (rc) 297 if (rc)
296 return rc; 298 return rc;
297 } 299 }
@@ -342,7 +344,7 @@ static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
342 return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; 344 return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
343} 345}
344 346
345static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) 347int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
346{ 348{
347 struct kvm_vcpu *vcpu; 349 struct kvm_vcpu *vcpu;
348 int i, rc; 350 int i, rc;
@@ -380,11 +382,6 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
380 return -EBUSY; 382 return -EBUSY;
381} 383}
382 384
383static u32 xive_vp(struct kvmppc_xive *xive, u32 server)
384{
385 return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
386}
387
388static u8 xive_lock_and_mask(struct kvmppc_xive *xive, 385static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
389 struct kvmppc_xive_src_block *sb, 386 struct kvmppc_xive_src_block *sb,
390 struct kvmppc_xive_irq_state *state) 387 struct kvmppc_xive_irq_state *state)
@@ -430,8 +427,8 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
430 */ 427 */
431 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 428 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
432 xive_native_configure_irq(hw_num, 429 xive_native_configure_irq(hw_num,
433 xive_vp(xive, state->act_server), 430 kvmppc_xive_vp(xive, state->act_server),
434 MASKED, state->number); 431 MASKED, state->number);
435 /* set old_p so we can track if an H_EOI was done */ 432 /* set old_p so we can track if an H_EOI was done */
436 state->old_p = true; 433 state->old_p = true;
437 state->old_q = false; 434 state->old_q = false;
@@ -486,8 +483,8 @@ static void xive_finish_unmask(struct kvmppc_xive *xive,
486 */ 483 */
487 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { 484 if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
488 xive_native_configure_irq(hw_num, 485 xive_native_configure_irq(hw_num,
489 xive_vp(xive, state->act_server), 486 kvmppc_xive_vp(xive, state->act_server),
490 state->act_priority, state->number); 487 state->act_priority, state->number);
491 /* If an EOI is needed, do it here */ 488 /* If an EOI is needed, do it here */
492 if (!state->old_p) 489 if (!state->old_p)
493 xive_vm_source_eoi(hw_num, xd); 490 xive_vm_source_eoi(hw_num, xd);
@@ -535,7 +532,7 @@ static int xive_target_interrupt(struct kvm *kvm,
535 * priority. The count for that new target will have 532 * priority. The count for that new target will have
536 * already been incremented. 533 * already been incremented.
537 */ 534 */
538 rc = xive_select_target(kvm, &server, prio); 535 rc = kvmppc_xive_select_target(kvm, &server, prio);
539 536
540 /* 537 /*
541 * We failed to find a target ? Not much we can do 538 * We failed to find a target ? Not much we can do
@@ -563,7 +560,7 @@ static int xive_target_interrupt(struct kvm *kvm,
563 kvmppc_xive_select_irq(state, &hw_num, NULL); 560 kvmppc_xive_select_irq(state, &hw_num, NULL);
564 561
565 return xive_native_configure_irq(hw_num, 562 return xive_native_configure_irq(hw_num,
566 xive_vp(xive, server), 563 kvmppc_xive_vp(xive, server),
567 prio, state->number); 564 prio, state->number);
568} 565}
569 566
@@ -849,7 +846,8 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
849 846
850 /* 847 /*
851 * We can't update the state of a "pushed" VCPU, but that 848 * We can't update the state of a "pushed" VCPU, but that
852 * shouldn't happen. 849 * shouldn't happen because the vcpu->mutex makes running a
850 * vcpu mutually exclusive with doing one_reg get/set on it.
853 */ 851 */
854 if (WARN_ON(vcpu->arch.xive_pushed)) 852 if (WARN_ON(vcpu->arch.xive_pushed))
855 return -EIO; 853 return -EIO;
@@ -940,6 +938,13 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
940 /* Turn the IPI hard off */ 938 /* Turn the IPI hard off */
941 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 939 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
942 940
941 /*
942 * Reset ESB guest mapping. Needed when ESB pages are exposed
943 * to the guest in XIVE native mode
944 */
945 if (xive->ops && xive->ops->reset_mapped)
946 xive->ops->reset_mapped(kvm, guest_irq);
947
943 /* Grab info about irq */ 948 /* Grab info about irq */
944 state->pt_number = hw_irq; 949 state->pt_number = hw_irq;
945 state->pt_data = irq_data_get_irq_handler_data(host_data); 950 state->pt_data = irq_data_get_irq_handler_data(host_data);
@@ -951,7 +956,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
951 * which is fine for a never started interrupt. 956 * which is fine for a never started interrupt.
952 */ 957 */
953 xive_native_configure_irq(hw_irq, 958 xive_native_configure_irq(hw_irq,
954 xive_vp(xive, state->act_server), 959 kvmppc_xive_vp(xive, state->act_server),
955 state->act_priority, state->number); 960 state->act_priority, state->number);
956 961
957 /* 962 /*
@@ -1025,9 +1030,17 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
1025 state->pt_number = 0; 1030 state->pt_number = 0;
1026 state->pt_data = NULL; 1031 state->pt_data = NULL;
1027 1032
1033 /*
1034 * Reset ESB guest mapping. Needed when ESB pages are exposed
1035 * to the guest in XIVE native mode
1036 */
1037 if (xive->ops && xive->ops->reset_mapped) {
1038 xive->ops->reset_mapped(kvm, guest_irq);
1039 }
1040
1028 /* Reconfigure the IPI */ 1041 /* Reconfigure the IPI */
1029 xive_native_configure_irq(state->ipi_number, 1042 xive_native_configure_irq(state->ipi_number,
1030 xive_vp(xive, state->act_server), 1043 kvmppc_xive_vp(xive, state->act_server),
1031 state->act_priority, state->number); 1044 state->act_priority, state->number);
1032 1045
1033 /* 1046 /*
@@ -1049,7 +1062,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
1049} 1062}
1050EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); 1063EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
1051 1064
1052static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) 1065void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1053{ 1066{
1054 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1067 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1055 struct kvm *kvm = vcpu->kvm; 1068 struct kvm *kvm = vcpu->kvm;
@@ -1083,14 +1096,35 @@ static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
1083 arch_spin_unlock(&sb->lock); 1096 arch_spin_unlock(&sb->lock);
1084 } 1097 }
1085 } 1098 }
1099
1100 /* Disable vcpu's escalation interrupt */
1101 if (vcpu->arch.xive_esc_on) {
1102 __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
1103 XIVE_ESB_SET_PQ_01));
1104 vcpu->arch.xive_esc_on = false;
1105 }
1106
1107 /*
1108 * Clear pointers to escalation interrupt ESB.
1109 * This is safe because the vcpu->mutex is held, preventing
1110 * any other CPU from concurrently executing a KVM_RUN ioctl.
1111 */
1112 vcpu->arch.xive_esc_vaddr = 0;
1113 vcpu->arch.xive_esc_raddr = 0;
1086} 1114}
1087 1115
1088void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) 1116void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1089{ 1117{
1090 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1118 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1091 struct kvmppc_xive *xive = xc->xive; 1119 struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1092 int i; 1120 int i;
1093 1121
1122 if (!kvmppc_xics_enabled(vcpu))
1123 return;
1124
1125 if (!xc)
1126 return;
1127
1094 pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); 1128 pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
1095 1129
1096 /* Ensure no interrupt is still routed to that VP */ 1130 /* Ensure no interrupt is still routed to that VP */
@@ -1129,6 +1163,10 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
1129 } 1163 }
1130 /* Free the VP */ 1164 /* Free the VP */
1131 kfree(xc); 1165 kfree(xc);
1166
1167 /* Cleanup the vcpu */
1168 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
1169 vcpu->arch.xive_vcpu = NULL;
1132} 1170}
1133 1171
1134int kvmppc_xive_connect_vcpu(struct kvm_device *dev, 1172int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
@@ -1146,7 +1184,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1146 } 1184 }
1147 if (xive->kvm != vcpu->kvm) 1185 if (xive->kvm != vcpu->kvm)
1148 return -EPERM; 1186 return -EPERM;
1149 if (vcpu->arch.irq_type) 1187 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
1150 return -EBUSY; 1188 return -EBUSY;
1151 if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { 1189 if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
1152 pr_devel("Duplicate !\n"); 1190 pr_devel("Duplicate !\n");
@@ -1166,7 +1204,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1166 xc->xive = xive; 1204 xc->xive = xive;
1167 xc->vcpu = vcpu; 1205 xc->vcpu = vcpu;
1168 xc->server_num = cpu; 1206 xc->server_num = cpu;
1169 xc->vp_id = xive_vp(xive, cpu); 1207 xc->vp_id = kvmppc_xive_vp(xive, cpu);
1170 xc->mfrr = 0xff; 1208 xc->mfrr = 0xff;
1171 xc->valid = true; 1209 xc->valid = true;
1172 1210
@@ -1219,7 +1257,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1219 if (xive->qmap & (1 << i)) { 1257 if (xive->qmap & (1 << i)) {
1220 r = xive_provision_queue(vcpu, i); 1258 r = xive_provision_queue(vcpu, i);
1221 if (r == 0 && !xive->single_escalation) 1259 if (r == 0 && !xive->single_escalation)
1222 xive_attach_escalation(vcpu, i); 1260 kvmppc_xive_attach_escalation(
1261 vcpu, i, xive->single_escalation);
1223 if (r) 1262 if (r)
1224 goto bail; 1263 goto bail;
1225 } else { 1264 } else {
@@ -1234,7 +1273,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
1234 } 1273 }
1235 1274
1236 /* If not done above, attach priority 0 escalation */ 1275 /* If not done above, attach priority 0 escalation */
1237 r = xive_attach_escalation(vcpu, 0); 1276 r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
1238 if (r) 1277 if (r)
1239 goto bail; 1278 goto bail;
1240 1279
@@ -1485,8 +1524,8 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
1485 return 0; 1524 return 0;
1486} 1525}
1487 1526
1488static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive, 1527struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
1489 int irq) 1528 struct kvmppc_xive *xive, int irq)
1490{ 1529{
1491 struct kvm *kvm = xive->kvm; 1530 struct kvm *kvm = xive->kvm;
1492 struct kvmppc_xive_src_block *sb; 1531 struct kvmppc_xive_src_block *sb;
@@ -1509,6 +1548,7 @@ static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *x
1509 1548
1510 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 1549 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
1511 sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i; 1550 sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
1551 sb->irq_state[i].eisn = 0;
1512 sb->irq_state[i].guest_priority = MASKED; 1552 sb->irq_state[i].guest_priority = MASKED;
1513 sb->irq_state[i].saved_priority = MASKED; 1553 sb->irq_state[i].saved_priority = MASKED;
1514 sb->irq_state[i].act_priority = MASKED; 1554 sb->irq_state[i].act_priority = MASKED;
@@ -1565,7 +1605,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
1565 sb = kvmppc_xive_find_source(xive, irq, &idx); 1605 sb = kvmppc_xive_find_source(xive, irq, &idx);
1566 if (!sb) { 1606 if (!sb) {
1567 pr_devel("No source, creating source block...\n"); 1607 pr_devel("No source, creating source block...\n");
1568 sb = xive_create_src_block(xive, irq); 1608 sb = kvmppc_xive_create_src_block(xive, irq);
1569 if (!sb) { 1609 if (!sb) {
1570 pr_devel("Failed to create block...\n"); 1610 pr_devel("Failed to create block...\n");
1571 return -ENOMEM; 1611 return -ENOMEM;
@@ -1789,7 +1829,7 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
1789 xive_cleanup_irq_data(xd); 1829 xive_cleanup_irq_data(xd);
1790} 1830}
1791 1831
1792static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) 1832void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
1793{ 1833{
1794 int i; 1834 int i;
1795 1835
@@ -1810,16 +1850,55 @@ static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
1810 } 1850 }
1811} 1851}
1812 1852
1813static void kvmppc_xive_free(struct kvm_device *dev) 1853/*
1854 * Called when device fd is closed. kvm->lock is held.
1855 */
1856static void kvmppc_xive_release(struct kvm_device *dev)
1814{ 1857{
1815 struct kvmppc_xive *xive = dev->private; 1858 struct kvmppc_xive *xive = dev->private;
1816 struct kvm *kvm = xive->kvm; 1859 struct kvm *kvm = xive->kvm;
1860 struct kvm_vcpu *vcpu;
1817 int i; 1861 int i;
1862 int was_ready;
1863
1864 pr_devel("Releasing xive device\n");
1818 1865
1819 debugfs_remove(xive->dentry); 1866 debugfs_remove(xive->dentry);
1820 1867
1821 if (kvm) 1868 /*
1822 kvm->arch.xive = NULL; 1869 * Clearing mmu_ready temporarily while holding kvm->lock
1870 * is a way of ensuring that no vcpus can enter the guest
1871 * until we drop kvm->lock. Doing kick_all_cpus_sync()
1872 * ensures that any vcpu executing inside the guest has
1873 * exited the guest. Once kick_all_cpus_sync() has finished,
1874 * we know that no vcpu can be executing the XIVE push or
1875 * pull code, or executing a XICS hcall.
1876 *
1877 * Since this is the device release function, we know that
1878 * userspace does not have any open fd referring to the
1879 * device. Therefore there can not be any of the device
1880 * attribute set/get functions being executed concurrently,
1881 * and similarly, the connect_vcpu and set/clr_mapped
1882 * functions also cannot be running concurrently.
1883 */
1884 was_ready = kvm->arch.mmu_ready;
1885 kvm->arch.mmu_ready = 0;
1886 kick_all_cpus_sync();
1887
1888 /*
1889 * We should clean up the vCPU interrupt presenters first.
1890 */
1891 kvm_for_each_vcpu(i, vcpu, kvm) {
1892 /*
1893 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1894 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
1895 */
1896 mutex_lock(&vcpu->mutex);
1897 kvmppc_xive_cleanup_vcpu(vcpu);
1898 mutex_unlock(&vcpu->mutex);
1899 }
1900
1901 kvm->arch.xive = NULL;
1823 1902
1824 /* Mask and free interrupts */ 1903 /* Mask and free interrupts */
1825 for (i = 0; i <= xive->max_sbid; i++) { 1904 for (i = 0; i <= xive->max_sbid; i++) {
@@ -1832,11 +1911,47 @@ static void kvmppc_xive_free(struct kvm_device *dev)
1832 if (xive->vp_base != XIVE_INVALID_VP) 1911 if (xive->vp_base != XIVE_INVALID_VP)
1833 xive_native_free_vp_block(xive->vp_base); 1912 xive_native_free_vp_block(xive->vp_base);
1834 1913
1914 kvm->arch.mmu_ready = was_ready;
1915
1916 /*
1917 * A reference of the kvmppc_xive pointer is now kept under
1918 * the xive_devices struct of the machine for reuse. It is
1919 * freed when the VM is destroyed for now until we fix all the
1920 * execution paths.
1921 */
1835 1922
1836 kfree(xive);
1837 kfree(dev); 1923 kfree(dev);
1838} 1924}
1839 1925
1926/*
1927 * When the guest chooses the interrupt mode (XICS legacy or XIVE
1928 * native), the VM will switch KVM devices. The previous device will
1929 * be "released" before the new one is created.
1930 *
1931 * Until we are sure all execution paths are well protected, provide a
1932 * fail safe (transitional) method for device destruction, in which
1933 * the XIVE device pointer is recycled and not directly freed.
1934 */
1935struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
1936{
1937 struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
1938 &kvm->arch.xive_devices.native :
1939 &kvm->arch.xive_devices.xics_on_xive;
1940 struct kvmppc_xive *xive = *kvm_xive_device;
1941
1942 if (!xive) {
1943 xive = kzalloc(sizeof(*xive), GFP_KERNEL);
1944 *kvm_xive_device = xive;
1945 } else {
1946 memset(xive, 0, sizeof(*xive));
1947 }
1948
1949 return xive;
1950}
1951
1952/*
1953 * Create a XICS device with XIVE backend. kvm->lock is held.
1954 */
1840static int kvmppc_xive_create(struct kvm_device *dev, u32 type) 1955static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1841{ 1956{
1842 struct kvmppc_xive *xive; 1957 struct kvmppc_xive *xive;
@@ -1845,7 +1960,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1845 1960
1846 pr_devel("Creating xive for partition\n"); 1961 pr_devel("Creating xive for partition\n");
1847 1962
1848 xive = kzalloc(sizeof(*xive), GFP_KERNEL); 1963 xive = kvmppc_xive_get_device(kvm, type);
1849 if (!xive) 1964 if (!xive)
1850 return -ENOMEM; 1965 return -ENOMEM;
1851 1966
@@ -1883,6 +1998,43 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
1883 return 0; 1998 return 0;
1884} 1999}
1885 2000
2001int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
2002{
2003 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
2004 unsigned int i;
2005
2006 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
2007 struct xive_q *q = &xc->queues[i];
2008 u32 i0, i1, idx;
2009
2010 if (!q->qpage && !xc->esc_virq[i])
2011 continue;
2012
2013 seq_printf(m, " [q%d]: ", i);
2014
2015 if (q->qpage) {
2016 idx = q->idx;
2017 i0 = be32_to_cpup(q->qpage + idx);
2018 idx = (idx + 1) & q->msk;
2019 i1 = be32_to_cpup(q->qpage + idx);
2020 seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
2021 i0, i1);
2022 }
2023 if (xc->esc_virq[i]) {
2024 struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
2025 struct xive_irq_data *xd =
2026 irq_data_get_irq_handler_data(d);
2027 u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
2028
2029 seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
2030 (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
2031 (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
2032 xc->esc_virq[i], pq, xd->eoi_page);
2033 seq_puts(m, "\n");
2034 }
2035 }
2036 return 0;
2037}
1886 2038
1887static int xive_debug_show(struct seq_file *m, void *private) 2039static int xive_debug_show(struct seq_file *m, void *private)
1888{ 2040{
@@ -1908,7 +2060,6 @@ static int xive_debug_show(struct seq_file *m, void *private)
1908 2060
1909 kvm_for_each_vcpu(i, vcpu, kvm) { 2061 kvm_for_each_vcpu(i, vcpu, kvm) {
1910 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 2062 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1911 unsigned int i;
1912 2063
1913 if (!xc) 2064 if (!xc)
1914 continue; 2065 continue;
@@ -1918,33 +2069,8 @@ static int xive_debug_show(struct seq_file *m, void *private)
1918 xc->server_num, xc->cppr, xc->hw_cppr, 2069 xc->server_num, xc->cppr, xc->hw_cppr,
1919 xc->mfrr, xc->pending, 2070 xc->mfrr, xc->pending,
1920 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); 2071 xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
1921 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
1922 struct xive_q *q = &xc->queues[i];
1923 u32 i0, i1, idx;
1924
1925 if (!q->qpage && !xc->esc_virq[i])
1926 continue;
1927 2072
1928 seq_printf(m, " [q%d]: ", i); 2073 kvmppc_xive_debug_show_queues(m, vcpu);
1929
1930 if (q->qpage) {
1931 idx = q->idx;
1932 i0 = be32_to_cpup(q->qpage + idx);
1933 idx = (idx + 1) & q->msk;
1934 i1 = be32_to_cpup(q->qpage + idx);
1935 seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
1936 }
1937 if (xc->esc_virq[i]) {
1938 struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
1939 struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
1940 u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
1941 seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
1942 (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
1943 (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
1944 xc->esc_virq[i], pq, xd->eoi_page);
1945 seq_printf(m, "\n");
1946 }
1947 }
1948 2074
1949 t_rm_h_xirr += xc->stat_rm_h_xirr; 2075 t_rm_h_xirr += xc->stat_rm_h_xirr;
1950 t_rm_h_ipoll += xc->stat_rm_h_ipoll; 2076 t_rm_h_ipoll += xc->stat_rm_h_ipoll;
@@ -1999,7 +2125,7 @@ struct kvm_device_ops kvm_xive_ops = {
1999 .name = "kvm-xive", 2125 .name = "kvm-xive",
2000 .create = kvmppc_xive_create, 2126 .create = kvmppc_xive_create,
2001 .init = kvmppc_xive_init, 2127 .init = kvmppc_xive_init,
2002 .destroy = kvmppc_xive_free, 2128 .release = kvmppc_xive_release,
2003 .set_attr = xive_set_attr, 2129 .set_attr = xive_set_attr,
2004 .get_attr = xive_get_attr, 2130 .get_attr = xive_get_attr,
2005 .has_attr = xive_has_attr, 2131 .has_attr = xive_has_attr,
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index a08ae6fd4c51..426146332984 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -13,6 +13,13 @@
13#include "book3s_xics.h" 13#include "book3s_xics.h"
14 14
15/* 15/*
16 * The XIVE Interrupt source numbers are within the range 0 to
17 * KVMPPC_XICS_NR_IRQS.
18 */
19#define KVMPPC_XIVE_FIRST_IRQ 0
20#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS
21
22/*
16 * State for one guest irq source. 23 * State for one guest irq source.
17 * 24 *
18 * For each guest source we allocate a HW interrupt in the XIVE 25 * For each guest source we allocate a HW interrupt in the XIVE
@@ -54,6 +61,9 @@ struct kvmppc_xive_irq_state {
54 bool saved_p; 61 bool saved_p;
55 bool saved_q; 62 bool saved_q;
56 u8 saved_scan_prio; 63 u8 saved_scan_prio;
64
65 /* Xive native */
66 u32 eisn; /* Guest Effective IRQ number */
57}; 67};
58 68
59/* Select the "right" interrupt (IPI vs. passthrough) */ 69/* Select the "right" interrupt (IPI vs. passthrough) */
@@ -84,6 +94,11 @@ struct kvmppc_xive_src_block {
84 struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; 94 struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
85}; 95};
86 96
97struct kvmppc_xive;
98
99struct kvmppc_xive_ops {
100 int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
101};
87 102
88struct kvmppc_xive { 103struct kvmppc_xive {
89 struct kvm *kvm; 104 struct kvm *kvm;
@@ -122,6 +137,10 @@ struct kvmppc_xive {
122 137
123 /* Flags */ 138 /* Flags */
124 u8 single_escalation; 139 u8 single_escalation;
140
141 struct kvmppc_xive_ops *ops;
142 struct address_space *mapping;
143 struct mutex mapping_lock;
125}; 144};
126 145
127#define KVMPPC_XIVE_Q_COUNT 8 146#define KVMPPC_XIVE_Q_COUNT 8
@@ -198,6 +217,11 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp
198 return xive->src_blocks[bid]; 217 return xive->src_blocks[bid];
199} 218}
200 219
220static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server)
221{
222 return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
223}
224
201/* 225/*
202 * Mapping between guest priorities and host priorities 226 * Mapping between guest priorities and host priorities
203 * is as follow. 227 * is as follow.
@@ -248,5 +272,18 @@ extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
248extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); 272extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
249extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); 273extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
250 274
275/*
276 * Common Xive routines for XICS-over-XIVE and XIVE native
277 */
278void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
279int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
280struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
281 struct kvmppc_xive *xive, int irq);
282void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
283int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
284int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
285 bool single_escalation);
286struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
287
251#endif /* CONFIG_KVM_XICS */ 288#endif /* CONFIG_KVM_XICS */
252#endif /* _KVM_PPC_BOOK3S_XICS_H */ 289#endif /* _KVM_PPC_BOOK3S_XICS_H */
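Editor's note: the header now exports the VP-number mapping as kvmppc_xive_vp() so both the XICS-on-XIVE and XIVE-native backends share it. A tiny standalone illustration of that mapping, with made-up values and an identity stand-in for kvmppc_pack_vcpu_id():

#include <stdio.h>
#include <stdint.h>

/* stand-in for kvmppc_pack_vcpu_id(): identity when vCPU ids are dense */
static uint32_t pack_vcpu_id(uint32_t server)
{
	return server;
}

static uint32_t xive_vp(uint32_t vp_base, uint32_t server)
{
	return vp_base + pack_vcpu_id(server);
}

int main(void)
{
	/* made-up VP block base */
	printf("server 3 -> VP 0x%x\n", xive_vp(0x80, 3));	/* prints 0x83 */
	return 0;
}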
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
new file mode 100644
index 000000000000..6a8e698c4b6e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -0,0 +1,1249 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2017-2019, IBM Corporation.
4 */
5
6#define pr_fmt(fmt) "xive-kvm: " fmt
7
8#include <linux/kernel.h>
9#include <linux/kvm_host.h>
10#include <linux/err.h>
11#include <linux/gfp.h>
12#include <linux/spinlock.h>
13#include <linux/delay.h>
14#include <linux/file.h>
15#include <asm/uaccess.h>
16#include <asm/kvm_book3s.h>
17#include <asm/kvm_ppc.h>
18#include <asm/hvcall.h>
19#include <asm/xive.h>
20#include <asm/xive-regs.h>
21#include <asm/debug.h>
22#include <asm/debugfs.h>
23#include <asm/opal.h>
24
25#include <linux/debugfs.h>
26#include <linux/seq_file.h>
27
28#include "book3s_xive.h"
29
30static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
31{
32 u64 val;
33
34 if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
35 offset |= offset << 4;
36
37 val = in_be64(xd->eoi_mmio + offset);
38 return (u8)val;
39}
40
41static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
42{
43 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
44 struct xive_q *q = &xc->queues[prio];
45
46 xive_native_disable_queue(xc->vp_id, q, prio);
47 if (q->qpage) {
48 put_page(virt_to_page(q->qpage));
49 q->qpage = NULL;
50 }
51}
52
53void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
54{
55 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
56 int i;
57
58 if (!kvmppc_xive_enabled(vcpu))
59 return;
60
61 if (!xc)
62 return;
63
64 pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
65
66 /* Ensure no interrupt is still routed to that VP */
67 xc->valid = false;
68 kvmppc_xive_disable_vcpu_interrupts(vcpu);
69
70 /* Disable the VP */
71 xive_native_disable_vp(xc->vp_id);
72
73 /* Free the queues & associated interrupts */
74 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
75 /* Free the escalation irq */
76 if (xc->esc_virq[i]) {
77 free_irq(xc->esc_virq[i], vcpu);
78 irq_dispose_mapping(xc->esc_virq[i]);
79 kfree(xc->esc_virq_names[i]);
80 xc->esc_virq[i] = 0;
81 }
82
83 /* Free the queue */
84 kvmppc_xive_native_cleanup_queue(vcpu, i);
85 }
86
87 /* Free the VP */
88 kfree(xc);
89
90 /* Cleanup the vcpu */
91 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
92 vcpu->arch.xive_vcpu = NULL;
93}
94
95int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
96 struct kvm_vcpu *vcpu, u32 server_num)
97{
98 struct kvmppc_xive *xive = dev->private;
99 struct kvmppc_xive_vcpu *xc = NULL;
100 int rc;
101
102 pr_devel("native_connect_vcpu(server=%d)\n", server_num);
103
104 if (dev->ops != &kvm_xive_native_ops) {
105 pr_devel("Wrong ops !\n");
106 return -EPERM;
107 }
108 if (xive->kvm != vcpu->kvm)
109 return -EPERM;
110 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
111 return -EBUSY;
112 if (server_num >= KVM_MAX_VCPUS) {
113 pr_devel("Out of bounds !\n");
114 return -EINVAL;
115 }
116
117 mutex_lock(&vcpu->kvm->lock);
118
119 if (kvmppc_xive_find_server(vcpu->kvm, server_num)) {
120 pr_devel("Duplicate !\n");
121 rc = -EEXIST;
122 goto bail;
123 }
124
125 xc = kzalloc(sizeof(*xc), GFP_KERNEL);
126 if (!xc) {
127 rc = -ENOMEM;
128 goto bail;
129 }
130
131 vcpu->arch.xive_vcpu = xc;
132 xc->xive = xive;
133 xc->vcpu = vcpu;
134 xc->server_num = server_num;
135
136 xc->vp_id = kvmppc_xive_vp(xive, server_num);
137 xc->valid = true;
138 vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
139
140 rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
141 if (rc) {
142 pr_err("Failed to get VP info from OPAL: %d\n", rc);
143 goto bail;
144 }
145
146 /*
147 * Enable the VP first as the single escalation mode will
 148	 * affect the escalation interrupt numbering
149 */
150 rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
151 if (rc) {
152 pr_err("Failed to enable VP in OPAL: %d\n", rc);
153 goto bail;
154 }
155
156 /* Configure VCPU fields for use by assembly push/pull */
157 vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
158 vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
159
160 /* TODO: reset all queues to a clean state ? */
161bail:
162 mutex_unlock(&vcpu->kvm->lock);
163 if (rc)
164 kvmppc_xive_native_cleanup_vcpu(vcpu);
165
166 return rc;
167}
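Editor's note: connect_vcpu above is reached from userspace by enabling a vcpu capability. A hedged sketch of that call, assuming the KVM_CAP_PPC_IRQ_XIVE capability added by this series with args[0] = XIVE device fd and args[1] = server number; treat the argument layout as an assumption, not a verified ABI reference.

#include <sys/ioctl.h>
#include <string.h>
#include <linux/kvm.h>

static int connect_vcpu_to_xive(int vcpu_fd, int xive_dev_fd, unsigned long server)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_IRQ_XIVE;	/* capability added by this series */
	cap.args[0] = xive_dev_fd;	/* fd from KVM_CREATE_DEVICE */
	cap.args[1] = server;		/* guest server (vCPU) number */
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}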
168
169/*
170 * Device passthrough support
171 */
172static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
173{
174 struct kvmppc_xive *xive = kvm->arch.xive;
175
176 if (irq >= KVMPPC_XIVE_NR_IRQS)
177 return -EINVAL;
178
179 /*
180 * Clear the ESB pages of the IRQ number being mapped (or
 181	 * unmapped) into the guest and let the VM fault handler
182 * repopulate with the appropriate ESB pages (device or IC)
183 */
184 pr_debug("clearing esb pages for girq 0x%lx\n", irq);
185 mutex_lock(&xive->mapping_lock);
186 if (xive->mapping)
187 unmap_mapping_range(xive->mapping,
188 irq * (2ull << PAGE_SHIFT),
189 2ull << PAGE_SHIFT, 1);
190 mutex_unlock(&xive->mapping_lock);
191 return 0;
192}
193
194static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
195 .reset_mapped = kvmppc_xive_native_reset_mapped,
196};
197
198static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
199{
200 struct vm_area_struct *vma = vmf->vma;
201 struct kvm_device *dev = vma->vm_file->private_data;
202 struct kvmppc_xive *xive = dev->private;
203 struct kvmppc_xive_src_block *sb;
204 struct kvmppc_xive_irq_state *state;
205 struct xive_irq_data *xd;
206 u32 hw_num;
207 u16 src;
208 u64 page;
209 unsigned long irq;
210 u64 page_offset;
211
212 /*
 213	 * Linux/KVM uses a two-page ESB setting, one for trigger and
214 * one for EOI
215 */
216 page_offset = vmf->pgoff - vma->vm_pgoff;
217 irq = page_offset / 2;
218
219 sb = kvmppc_xive_find_source(xive, irq, &src);
220 if (!sb) {
221 pr_devel("%s: source %lx not found !\n", __func__, irq);
222 return VM_FAULT_SIGBUS;
223 }
224
225 state = &sb->irq_state[src];
226 kvmppc_xive_select_irq(state, &hw_num, &xd);
227
228 arch_spin_lock(&sb->lock);
229
230 /*
231 * first/even page is for trigger
232 * second/odd page is for EOI and management.
233 */
234 page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
235 arch_spin_unlock(&sb->lock);
236
237 if (WARN_ON(!page)) {
238 pr_err("%s: accessing invalid ESB page for source %lx !\n",
239 __func__, irq);
240 return VM_FAULT_SIGBUS;
241 }
242
243 vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
244 return VM_FAULT_NOPAGE;
245}
246
247static const struct vm_operations_struct xive_native_esb_vmops = {
248 .fault = xive_native_esb_fault,
249};
250
251static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
252{
253 struct vm_area_struct *vma = vmf->vma;
254
255 switch (vmf->pgoff - vma->vm_pgoff) {
256 case 0: /* HW - forbid access */
257 case 1: /* HV - forbid access */
258 return VM_FAULT_SIGBUS;
259 case 2: /* OS */
260 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
261 return VM_FAULT_NOPAGE;
262 case 3: /* USER - TODO */
263 default:
264 return VM_FAULT_SIGBUS;
265 }
266}
267
268static const struct vm_operations_struct xive_native_tima_vmops = {
269 .fault = xive_native_tima_fault,
270};
271
272static int kvmppc_xive_native_mmap(struct kvm_device *dev,
273 struct vm_area_struct *vma)
274{
275 struct kvmppc_xive *xive = dev->private;
276
277 /* We only allow mappings at fixed offset for now */
278 if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
279 if (vma_pages(vma) > 4)
280 return -EINVAL;
281 vma->vm_ops = &xive_native_tima_vmops;
282 } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
283 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
284 return -EINVAL;
285 vma->vm_ops = &xive_native_esb_vmops;
286 } else {
287 return -EINVAL;
288 }
289
290 vma->vm_flags |= VM_IO | VM_PFNMAP;
291 vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
292
293 /*
294 * Grab the KVM device file address_space to be able to clear
295 * the ESB pages mapping when a device is passed-through into
296 * the guest.
297 */
298 xive->mapping = vma->vm_file->f_mapping;
299 return 0;
300}
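Editor's note: a userspace sketch of the two mmap windows the handler above accepts on the XIVE device fd. The handler only allows mappings that start exactly at the fixed page offsets, so whole windows are mapped and indexed afterwards; the offset constants below are placeholders for KVM_XIVE_TIMA_PAGE_OFFSET / KVM_XIVE_ESB_PAGE_OFFSET from <asm/kvm.h>, and 64K pages are assumed.

#include <sys/mman.h>
#include <sys/types.h>
#include <stdint.h>

#define PAGE_SZ		0x10000UL	/* 64K pages assumed */
#define TIMA_PGOFF	0ULL		/* placeholder for KVM_XIVE_TIMA_PAGE_OFFSET */
#define ESB_PGOFF	4ULL		/* placeholder for KVM_XIVE_ESB_PAGE_OFFSET */

/* Map the 4-page TIMA window; only page 2 (the OS view) is accessible */
static void *map_tima_os(int xive_fd)
{
	uint8_t *tima = mmap(NULL, 4 * PAGE_SZ, PROT_READ | PROT_WRITE,
			     MAP_SHARED, xive_fd,
			     (off_t)(TIMA_PGOFF * PAGE_SZ));
	return tima == MAP_FAILED ? NULL : tima + 2 * PAGE_SZ;
}

/* Map the first nr_irqs source pairs of the ESB window */
static void *map_esb_window(int xive_fd, unsigned long nr_irqs)
{
	void *esb = mmap(NULL, nr_irqs * 2 * PAGE_SZ, PROT_READ | PROT_WRITE,
			 MAP_SHARED, xive_fd, (off_t)(ESB_PGOFF * PAGE_SZ));
	return esb == MAP_FAILED ? NULL : esb;
}

/* Even page of each pair is the trigger page, odd page is EOI/management */
static void *irq_trigger_page(uint8_t *esb_base, unsigned long girq)
{
	return esb_base + girq * 2 * PAGE_SZ;
}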
301
302static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
303 u64 addr)
304{
305 struct kvmppc_xive_src_block *sb;
306 struct kvmppc_xive_irq_state *state;
307 u64 __user *ubufp = (u64 __user *) addr;
308 u64 val;
309 u16 idx;
310 int rc;
311
312 pr_devel("%s irq=0x%lx\n", __func__, irq);
313
314 if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
315 return -E2BIG;
316
317 sb = kvmppc_xive_find_source(xive, irq, &idx);
318 if (!sb) {
319 pr_debug("No source, creating source block...\n");
320 sb = kvmppc_xive_create_src_block(xive, irq);
321 if (!sb) {
322 pr_err("Failed to create block...\n");
323 return -ENOMEM;
324 }
325 }
326 state = &sb->irq_state[idx];
327
328 if (get_user(val, ubufp)) {
329 pr_err("fault getting user info !\n");
330 return -EFAULT;
331 }
332
333 arch_spin_lock(&sb->lock);
334
335 /*
336 * If the source doesn't already have an IPI, allocate
337 * one and get the corresponding data
338 */
339 if (!state->ipi_number) {
340 state->ipi_number = xive_native_alloc_irq();
341 if (state->ipi_number == 0) {
342 pr_err("Failed to allocate IRQ !\n");
343 rc = -ENXIO;
344 goto unlock;
345 }
346 xive_native_populate_irq_data(state->ipi_number,
347 &state->ipi_data);
348 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
349 state->ipi_number, irq);
350 }
351
352 /* Restore LSI state */
353 if (val & KVM_XIVE_LEVEL_SENSITIVE) {
354 state->lsi = true;
355 if (val & KVM_XIVE_LEVEL_ASSERTED)
356 state->asserted = true;
357 pr_devel(" LSI ! Asserted=%d\n", state->asserted);
358 }
359
360 /* Mask IRQ to start with */
361 state->act_server = 0;
362 state->act_priority = MASKED;
363 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
364 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
365
366 /* Increment the number of valid sources and mark this one valid */
367 if (!state->valid)
368 xive->src_count++;
369 state->valid = true;
370
371 rc = 0;
372
373unlock:
374 arch_spin_unlock(&sb->lock);
375
376 return rc;
377}
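Editor's note: a sketch of creating a guest source through the attribute handled above, i.e. KVM_SET_DEVICE_ATTR with group KVM_DEV_XIVE_GRP_SOURCE, attr = guest IRQ number, and a u64 carrying KVM_XIVE_LEVEL_SENSITIVE for LSIs. It assumes the series' <asm/kvm.h> definitions; error handling is omitted.

#include <sys/ioctl.h>
#include <stdint.h>
#include <string.h>
#include <linux/kvm.h>
#include <asm/kvm.h>	/* KVM_DEV_XIVE_GRP_SOURCE, KVM_XIVE_LEVEL_SENSITIVE (this series) */

static int xive_create_source(int xive_fd, uint64_t girq, int lsi)
{
	uint64_t val = lsi ? KVM_XIVE_LEVEL_SENSITIVE : 0;
	struct kvm_device_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_XIVE_GRP_SOURCE;
	attr.attr  = girq;			/* guest IRQ number */
	attr.addr  = (uint64_t)(uintptr_t)&val;	/* u64 describing the source */

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}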
378
379static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
380 struct kvmppc_xive_src_block *sb,
381 struct kvmppc_xive_irq_state *state,
382 u32 server, u8 priority, bool masked,
383 u32 eisn)
384{
385 struct kvm *kvm = xive->kvm;
386 u32 hw_num;
387 int rc = 0;
388
389 arch_spin_lock(&sb->lock);
390
391 if (state->act_server == server && state->act_priority == priority &&
392 state->eisn == eisn)
393 goto unlock;
394
395 pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
396 priority, server, masked, state->act_server,
397 state->act_priority);
398
399 kvmppc_xive_select_irq(state, &hw_num, NULL);
400
401 if (priority != MASKED && !masked) {
402 rc = kvmppc_xive_select_target(kvm, &server, priority);
403 if (rc)
404 goto unlock;
405
406 state->act_priority = priority;
407 state->act_server = server;
408 state->eisn = eisn;
409
410 rc = xive_native_configure_irq(hw_num,
411 kvmppc_xive_vp(xive, server),
412 priority, eisn);
413 } else {
414 state->act_priority = MASKED;
415 state->act_server = 0;
416 state->eisn = 0;
417
418 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
419 }
420
421unlock:
422 arch_spin_unlock(&sb->lock);
423 return rc;
424}
425
426static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
427 long irq, u64 addr)
428{
429 struct kvmppc_xive_src_block *sb;
430 struct kvmppc_xive_irq_state *state;
431 u64 __user *ubufp = (u64 __user *) addr;
432 u16 src;
433 u64 kvm_cfg;
434 u32 server;
435 u8 priority;
436 bool masked;
437 u32 eisn;
438
439 sb = kvmppc_xive_find_source(xive, irq, &src);
440 if (!sb)
441 return -ENOENT;
442
443 state = &sb->irq_state[src];
444
445 if (!state->valid)
446 return -EINVAL;
447
448 if (get_user(kvm_cfg, ubufp))
449 return -EFAULT;
450
451 pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
452
453 priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
454 KVM_XIVE_SOURCE_PRIORITY_SHIFT;
455 server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
456 KVM_XIVE_SOURCE_SERVER_SHIFT;
457 masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
458 KVM_XIVE_SOURCE_MASKED_SHIFT;
459 eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
460 KVM_XIVE_SOURCE_EISN_SHIFT;
461
462 if (priority != xive_prio_from_guest(priority)) {
463 pr_err("invalid priority for queue %d for VCPU %d\n",
464 priority, server);
465 return -EINVAL;
466 }
467
468 return kvmppc_xive_native_update_source_config(xive, sb, state, server,
469 priority, masked, eisn);
470}
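Editor's note: the handler above demangles a 64-bit configuration word; here is the corresponding packing on the userspace side, mirroring the KVM_XIVE_SOURCE_* masks and shifts used in the code (assuming the series' <asm/kvm.h>). The resulting word is passed at addr with group KVM_DEV_XIVE_GRP_SOURCE_CONFIG and attr = guest IRQ number, the same way as in the previous sketch.

#include <stdint.h>
#include <asm/kvm.h>	/* KVM_XIVE_SOURCE_* masks and shifts (this series) */

static uint64_t pack_source_config(uint8_t priority, uint32_t server,
				   int masked, uint32_t eisn)
{
	uint64_t cfg = 0;

	cfg |= ((uint64_t)priority << KVM_XIVE_SOURCE_PRIORITY_SHIFT) &
	       KVM_XIVE_SOURCE_PRIORITY_MASK;
	cfg |= ((uint64_t)server << KVM_XIVE_SOURCE_SERVER_SHIFT) &
	       KVM_XIVE_SOURCE_SERVER_MASK;
	cfg |= ((uint64_t)!!masked << KVM_XIVE_SOURCE_MASKED_SHIFT) &
	       KVM_XIVE_SOURCE_MASKED_MASK;
	cfg |= ((uint64_t)eisn << KVM_XIVE_SOURCE_EISN_SHIFT) &
	       KVM_XIVE_SOURCE_EISN_MASK;
	return cfg;
}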
471
472static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
473 long irq, u64 addr)
474{
475 struct kvmppc_xive_src_block *sb;
476 struct kvmppc_xive_irq_state *state;
477 struct xive_irq_data *xd;
478 u32 hw_num;
479 u16 src;
480 int rc = 0;
481
482 pr_devel("%s irq=0x%lx", __func__, irq);
483
484 sb = kvmppc_xive_find_source(xive, irq, &src);
485 if (!sb)
486 return -ENOENT;
487
488 state = &sb->irq_state[src];
489
490 rc = -EINVAL;
491
492 arch_spin_lock(&sb->lock);
493
494 if (state->valid) {
495 kvmppc_xive_select_irq(state, &hw_num, &xd);
496 xive_native_sync_source(hw_num);
497 rc = 0;
498 }
499
500 arch_spin_unlock(&sb->lock);
501 return rc;
502}
503
504static int xive_native_validate_queue_size(u32 qshift)
505{
506 /*
507 * We only support 64K pages for the moment. This is also
508 * advertised in the DT property "ibm,xive-eq-sizes"
509 */
510 switch (qshift) {
511 case 0: /* EQ reset */
512 case 16:
513 return 0;
514 case 12:
515 case 21:
516 case 24:
517 default:
518 return -EINVAL;
519 }
520}
521
522static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
523 long eq_idx, u64 addr)
524{
525 struct kvm *kvm = xive->kvm;
526 struct kvm_vcpu *vcpu;
527 struct kvmppc_xive_vcpu *xc;
528 void __user *ubufp = (void __user *) addr;
529 u32 server;
530 u8 priority;
531 struct kvm_ppc_xive_eq kvm_eq;
532 int rc;
533 __be32 *qaddr = 0;
534 struct page *page;
535 struct xive_q *q;
536 gfn_t gfn;
537 unsigned long page_size;
538
539 /*
540 * Demangle priority/server tuple from the EQ identifier
541 */
542 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
543 KVM_XIVE_EQ_PRIORITY_SHIFT;
544 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
545 KVM_XIVE_EQ_SERVER_SHIFT;
546
547 if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
548 return -EFAULT;
549
550 vcpu = kvmppc_xive_find_server(kvm, server);
551 if (!vcpu) {
552 pr_err("Can't find server %d\n", server);
553 return -ENOENT;
554 }
555 xc = vcpu->arch.xive_vcpu;
556
557 if (priority != xive_prio_from_guest(priority)) {
558 pr_err("Trying to restore invalid queue %d for VCPU %d\n",
559 priority, server);
560 return -EINVAL;
561 }
562 q = &xc->queues[priority];
563
564 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
565 __func__, server, priority, kvm_eq.flags,
566 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
567
568 /*
 569	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
570 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
571 * without using the coalescing mechanisms provided by the
572 * XIVE END ESBs. This is required on KVM as notification
573 * using the END ESBs is not supported.
574 */
575 if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
576 pr_err("invalid flags %d\n", kvm_eq.flags);
577 return -EINVAL;
578 }
579
580 rc = xive_native_validate_queue_size(kvm_eq.qshift);
581 if (rc) {
582 pr_err("invalid queue size %d\n", kvm_eq.qshift);
583 return rc;
584 }
585
586 /* reset queue and disable queueing */
587 if (!kvm_eq.qshift) {
588 q->guest_qaddr = 0;
589 q->guest_qshift = 0;
590
591 rc = xive_native_configure_queue(xc->vp_id, q, priority,
592 NULL, 0, true);
593 if (rc) {
594 pr_err("Failed to reset queue %d for VCPU %d: %d\n",
595 priority, xc->server_num, rc);
596 return rc;
597 }
598
599 if (q->qpage) {
600 put_page(virt_to_page(q->qpage));
601 q->qpage = NULL;
602 }
603
604 return 0;
605 }
606
607 if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
608 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
609 1ull << kvm_eq.qshift);
610 return -EINVAL;
611 }
612
613 gfn = gpa_to_gfn(kvm_eq.qaddr);
614 page = gfn_to_page(kvm, gfn);
615 if (is_error_page(page)) {
616 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
617 return -EINVAL;
618 }
619
620 page_size = kvm_host_page_size(kvm, gfn);
621 if (1ull << kvm_eq.qshift > page_size) {
622 pr_warn("Incompatible host page size %lx!\n", page_size);
623 return -EINVAL;
624 }
625
626 qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
627
628 /*
 629	 * Back up the queue page guest address so that we can mark the
 630	 * EQ page dirty for migration.
631 */
632 q->guest_qaddr = kvm_eq.qaddr;
633 q->guest_qshift = kvm_eq.qshift;
634
635 /*
636 * Unconditional Notification is forced by default at the
637 * OPAL level because the use of END ESBs is not supported by
638 * Linux.
639 */
640 rc = xive_native_configure_queue(xc->vp_id, q, priority,
641 (__be32 *) qaddr, kvm_eq.qshift, true);
642 if (rc) {
643 pr_err("Failed to configure queue %d for VCPU %d: %d\n",
644 priority, xc->server_num, rc);
645 put_page(page);
646 return rc;
647 }
648
649 /*
650 * Only restore the queue state when needed. When doing the
651 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
652 */
653 if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
654 rc = xive_native_set_queue_state(xc->vp_id, priority,
655 kvm_eq.qtoggle,
656 kvm_eq.qindex);
657 if (rc)
658 goto error;
659 }
660
661 rc = kvmppc_xive_attach_escalation(vcpu, priority,
662 xive->single_escalation);
663error:
664 if (rc)
665 kvmppc_xive_native_cleanup_queue(vcpu, priority);
666 return rc;
667}
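Editor's note: a userspace sketch of configuring one event queue through KVM_DEV_XIVE_GRP_EQ_CONFIG as handled above: a 64K queue, the mandatory KVM_XIVE_EQ_ALWAYS_NOTIFY flag, and qtoggle/qindex left at zero for a fresh queue. Struct and macro names are the ones used in the code; the surrounding flow is an assumption.

#include <sys/ioctl.h>
#include <stdint.h>
#include <string.h>
#include <linux/kvm.h>
#include <asm/kvm.h>	/* kvm_ppc_xive_eq, KVM_DEV_XIVE_GRP_EQ_CONFIG, KVM_XIVE_EQ_* */

static int xive_set_eq(int xive_fd, uint32_t server, uint8_t prio,
		       uint64_t guest_qaddr)
{
	struct kvm_ppc_xive_eq eq;
	struct kvm_device_attr attr;
	uint64_t eq_idx;

	memset(&eq, 0, sizeof(eq));
	eq.flags  = KVM_XIVE_EQ_ALWAYS_NOTIFY;	/* the only accepted flag, see above */
	eq.qshift = 16;				/* only 64K queues pass validation */
	eq.qaddr  = guest_qaddr;		/* 64K-aligned guest real address */
	/* qtoggle/qindex stay 0: new queue, no state to restore */

	eq_idx  = ((uint64_t)server << KVM_XIVE_EQ_SERVER_SHIFT) &
		  KVM_XIVE_EQ_SERVER_MASK;
	eq_idx |= ((uint64_t)prio << KVM_XIVE_EQ_PRIORITY_SHIFT) &
		  KVM_XIVE_EQ_PRIORITY_MASK;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_XIVE_GRP_EQ_CONFIG;
	attr.attr  = eq_idx;
	attr.addr  = (uint64_t)(uintptr_t)&eq;

	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}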
668
669static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
670 long eq_idx, u64 addr)
671{
672 struct kvm *kvm = xive->kvm;
673 struct kvm_vcpu *vcpu;
674 struct kvmppc_xive_vcpu *xc;
675 struct xive_q *q;
676 void __user *ubufp = (u64 __user *) addr;
677 u32 server;
678 u8 priority;
679 struct kvm_ppc_xive_eq kvm_eq;
680 u64 qaddr;
681 u64 qshift;
682 u64 qeoi_page;
683 u32 escalate_irq;
684 u64 qflags;
685 int rc;
686
687 /*
688 * Demangle priority/server tuple from the EQ identifier
689 */
690 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
691 KVM_XIVE_EQ_PRIORITY_SHIFT;
692 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
693 KVM_XIVE_EQ_SERVER_SHIFT;
694
695 vcpu = kvmppc_xive_find_server(kvm, server);
696 if (!vcpu) {
697 pr_err("Can't find server %d\n", server);
698 return -ENOENT;
699 }
700 xc = vcpu->arch.xive_vcpu;
701
702 if (priority != xive_prio_from_guest(priority)) {
703 pr_err("invalid priority for queue %d for VCPU %d\n",
704 priority, server);
705 return -EINVAL;
706 }
707 q = &xc->queues[priority];
708
709 memset(&kvm_eq, 0, sizeof(kvm_eq));
710
711 if (!q->qpage)
712 return 0;
713
714 rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
715 &qeoi_page, &escalate_irq, &qflags);
716 if (rc)
717 return rc;
718
719 kvm_eq.flags = 0;
720 if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
721 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
722
723 kvm_eq.qshift = q->guest_qshift;
724 kvm_eq.qaddr = q->guest_qaddr;
725
726 rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
727 &kvm_eq.qindex);
728 if (rc)
729 return rc;
730
731 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
732 __func__, server, priority, kvm_eq.flags,
733 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
734
735 if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
736 return -EFAULT;
737
738 return 0;
739}
740
741static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
742{
743 int i;
744
745 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
746 struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
747
748 if (!state->valid)
749 continue;
750
751 if (state->act_priority == MASKED)
752 continue;
753
754 state->eisn = 0;
755 state->act_server = 0;
756 state->act_priority = MASKED;
757 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
758 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
759 if (state->pt_number) {
760 xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
761 xive_native_configure_irq(state->pt_number,
762 0, MASKED, 0);
763 }
764 }
765}
766
767static int kvmppc_xive_reset(struct kvmppc_xive *xive)
768{
769 struct kvm *kvm = xive->kvm;
770 struct kvm_vcpu *vcpu;
771 unsigned int i;
772
773 pr_devel("%s\n", __func__);
774
775 mutex_lock(&kvm->lock);
776
777 kvm_for_each_vcpu(i, vcpu, kvm) {
778 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
779 unsigned int prio;
780
781 if (!xc)
782 continue;
783
784 kvmppc_xive_disable_vcpu_interrupts(vcpu);
785
786 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
787
788 /* Single escalation, no queue 7 */
789 if (prio == 7 && xive->single_escalation)
790 break;
791
792 if (xc->esc_virq[prio]) {
793 free_irq(xc->esc_virq[prio], vcpu);
794 irq_dispose_mapping(xc->esc_virq[prio]);
795 kfree(xc->esc_virq_names[prio]);
796 xc->esc_virq[prio] = 0;
797 }
798
799 kvmppc_xive_native_cleanup_queue(vcpu, prio);
800 }
801 }
802
803 for (i = 0; i <= xive->max_sbid; i++) {
804 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
805
806 if (sb) {
807 arch_spin_lock(&sb->lock);
808 kvmppc_xive_reset_sources(sb);
809 arch_spin_unlock(&sb->lock);
810 }
811 }
812
813 mutex_unlock(&kvm->lock);
814
815 return 0;
816}
817
818static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
819{
820 int j;
821
822 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
823 struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
824 struct xive_irq_data *xd;
825 u32 hw_num;
826
827 if (!state->valid)
828 continue;
829
830 /*
831 * The struct kvmppc_xive_irq_state reflects the state
832 * of the EAS configuration and not the state of the
 833	 * source. The source is masked by setting the PQ bits to
834 * '-Q', which is what is being done before calling
835 * the KVM_DEV_XIVE_EQ_SYNC control.
836 *
837 * If a source EAS is configured, OPAL syncs the XIVE
838 * IC of the source and the XIVE IC of the previous
839 * target if any.
840 *
841 * So it should be fine ignoring MASKED sources as
842 * they have been synced already.
843 */
844 if (state->act_priority == MASKED)
845 continue;
846
847 kvmppc_xive_select_irq(state, &hw_num, &xd);
848 xive_native_sync_source(hw_num);
849 xive_native_sync_queue(hw_num);
850 }
851}
852
853static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
854{
855 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
856 unsigned int prio;
857
858 if (!xc)
859 return -ENOENT;
860
861 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
862 struct xive_q *q = &xc->queues[prio];
863
864 if (!q->qpage)
865 continue;
866
867 /* Mark EQ page dirty for migration */
868 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
869 }
870 return 0;
871}
872
873static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
874{
875 struct kvm *kvm = xive->kvm;
876 struct kvm_vcpu *vcpu;
877 unsigned int i;
878
879 pr_devel("%s\n", __func__);
880
881 mutex_lock(&kvm->lock);
882 for (i = 0; i <= xive->max_sbid; i++) {
883 struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
884
885 if (sb) {
886 arch_spin_lock(&sb->lock);
887 kvmppc_xive_native_sync_sources(sb);
888 arch_spin_unlock(&sb->lock);
889 }
890 }
891
892 kvm_for_each_vcpu(i, vcpu, kvm) {
893 kvmppc_xive_native_vcpu_eq_sync(vcpu);
894 }
895 mutex_unlock(&kvm->lock);
896
897 return 0;
898}
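Editor's note: the EQ_SYNC control above exists so a VMM can force the EQ pages into the dirty log, for example right before the final memory pass of a migration; RESET is its counterpart for machine reset. A hedged sketch of driving the KVM_DEV_XIVE_GRP_CTRL group from userspace, assuming the series' <asm/kvm.h>:

#include <sys/ioctl.h>
#include <stdint.h>
#include <string.h>
#include <linux/kvm.h>
#include <asm/kvm.h>	/* KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_RESET, KVM_DEV_XIVE_EQ_SYNC */

static int xive_ctrl(int xive_fd, uint64_t what)
{
	struct kvm_device_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.group = KVM_DEV_XIVE_GRP_CTRL;
	attr.attr  = what;	/* KVM_DEV_XIVE_RESET or KVM_DEV_XIVE_EQ_SYNC */
	return ioctl(xive_fd, KVM_SET_DEVICE_ATTR, &attr);
}

/* e.g. xive_ctrl(xive_fd, KVM_DEV_XIVE_EQ_SYNC) before the last dirty-page pass */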
899
900static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
901 struct kvm_device_attr *attr)
902{
903 struct kvmppc_xive *xive = dev->private;
904
905 switch (attr->group) {
906 case KVM_DEV_XIVE_GRP_CTRL:
907 switch (attr->attr) {
908 case KVM_DEV_XIVE_RESET:
909 return kvmppc_xive_reset(xive);
910 case KVM_DEV_XIVE_EQ_SYNC:
911 return kvmppc_xive_native_eq_sync(xive);
912 }
913 break;
914 case KVM_DEV_XIVE_GRP_SOURCE:
915 return kvmppc_xive_native_set_source(xive, attr->attr,
916 attr->addr);
917 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
918 return kvmppc_xive_native_set_source_config(xive, attr->attr,
919 attr->addr);
920 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
921 return kvmppc_xive_native_set_queue_config(xive, attr->attr,
922 attr->addr);
923 case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
924 return kvmppc_xive_native_sync_source(xive, attr->attr,
925 attr->addr);
926 }
927 return -ENXIO;
928}
929
930static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
931 struct kvm_device_attr *attr)
932{
933 struct kvmppc_xive *xive = dev->private;
934
935 switch (attr->group) {
936 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
937 return kvmppc_xive_native_get_queue_config(xive, attr->attr,
938 attr->addr);
939 }
940 return -ENXIO;
941}
942
943static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
944 struct kvm_device_attr *attr)
945{
946 switch (attr->group) {
947 case KVM_DEV_XIVE_GRP_CTRL:
948 switch (attr->attr) {
949 case KVM_DEV_XIVE_RESET:
950 case KVM_DEV_XIVE_EQ_SYNC:
951 return 0;
952 }
953 break;
954 case KVM_DEV_XIVE_GRP_SOURCE:
955 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
956 case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
957 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
958 attr->attr < KVMPPC_XIVE_NR_IRQS)
959 return 0;
960 break;
961 case KVM_DEV_XIVE_GRP_EQ_CONFIG:
962 return 0;
963 }
964 return -ENXIO;
965}
966
967/*
968 * Called when device fd is closed
969 */
970static void kvmppc_xive_native_release(struct kvm_device *dev)
971{
972 struct kvmppc_xive *xive = dev->private;
973 struct kvm *kvm = xive->kvm;
974 struct kvm_vcpu *vcpu;
975 int i;
976 int was_ready;
977
978 debugfs_remove(xive->dentry);
979
980 pr_devel("Releasing xive native device\n");
981
982 /*
983 * Clearing mmu_ready temporarily while holding kvm->lock
984 * is a way of ensuring that no vcpus can enter the guest
985 * until we drop kvm->lock. Doing kick_all_cpus_sync()
986 * ensures that any vcpu executing inside the guest has
987 * exited the guest. Once kick_all_cpus_sync() has finished,
988 * we know that no vcpu can be executing the XIVE push or
989 * pull code or accessing the XIVE MMIO regions.
990 *
991 * Since this is the device release function, we know that
992 * userspace does not have any open fd or mmap referring to
993	 * the device. Therefore none of the device attribute
994	 * set/get, mmap, or page fault functions can be executing
995	 * concurrently, and similarly, neither can the
996	 * connect_vcpu and set/clr_mapped functions be running
997	 * at this point.
998 */
999 was_ready = kvm->arch.mmu_ready;
1000 kvm->arch.mmu_ready = 0;
1001 kick_all_cpus_sync();
1002
1003 /*
1004 * We should clean up the vCPU interrupt presenters first.
1005 */
1006 kvm_for_each_vcpu(i, vcpu, kvm) {
1007 /*
1008 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1009	 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
1010 */
1011 mutex_lock(&vcpu->mutex);
1012 kvmppc_xive_native_cleanup_vcpu(vcpu);
1013 mutex_unlock(&vcpu->mutex);
1014 }
1015
1016 kvm->arch.xive = NULL;
1017
1018 for (i = 0; i <= xive->max_sbid; i++) {
1019 if (xive->src_blocks[i])
1020 kvmppc_xive_free_sources(xive->src_blocks[i]);
1021 kfree(xive->src_blocks[i]);
1022 xive->src_blocks[i] = NULL;
1023 }
1024
1025 if (xive->vp_base != XIVE_INVALID_VP)
1026 xive_native_free_vp_block(xive->vp_base);
1027
1028 kvm->arch.mmu_ready = was_ready;
1029
1030 /*
1031	 * A reference to the kvmppc_xive pointer is kept under the
1032	 * xive_devices struct of the machine for reuse. For now it
1033	 * is only freed when the VM is destroyed, until all the
1034	 * execution paths have been fixed.
1035 */
1036
1037 kfree(dev);
1038}
1039
1040/*
1041 * Create a XIVE device. kvm->lock is held.
1042 */
1043static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1044{
1045 struct kvmppc_xive *xive;
1046 struct kvm *kvm = dev->kvm;
1047 int ret = 0;
1048
1049 pr_devel("Creating xive native device\n");
1050
1051 if (kvm->arch.xive)
1052 return -EEXIST;
1053
1054 xive = kvmppc_xive_get_device(kvm, type);
1055 if (!xive)
1056 return -ENOMEM;
1057
1058 dev->private = xive;
1059 xive->dev = dev;
1060 xive->kvm = kvm;
1061 kvm->arch.xive = xive;
1062 mutex_init(&xive->mapping_lock);
1063
1064 /*
1065	 * Allocate a block of VPs. KVM_MAX_VCPUS is a generous
1066	 * default; using the max number of CPUs the VM was actually
1067	 * configured with would make better use of the XIVE VP space.
1068 */
1069 xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
1070 pr_devel("VP_Base=%x\n", xive->vp_base);
1071
1072 if (xive->vp_base == XIVE_INVALID_VP)
1073 ret = -ENXIO;
1074
1075 xive->single_escalation = xive_native_has_single_escalation();
1076 xive->ops = &kvmppc_xive_native_ops;
1077
1078 if (ret)
1079 kfree(xive);
1080
1081 return ret;
1082}
1083
1084/*
1085 * Interrupt Pending Buffer (IPB) offset
1086 */
1087#define TM_IPB_SHIFT 40
1088#define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT)
1089
1090int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1091{
1092 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1093 u64 opal_state;
1094 int rc;
1095
1096 if (!kvmppc_xive_enabled(vcpu))
1097 return -EPERM;
1098
1099 if (!xc)
1100 return -ENOENT;
1101
1102 /* Thread context registers. We only care about IPB and CPPR */
1103 val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1104
1105 /* Get the VP state from OPAL */
1106 rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1107 if (rc)
1108 return rc;
1109
1110 /*
1111	 * Capture the backup of the IPB register held in the NVT
1112	 * structure and merge it into our KVM VP state.
1113 */
1114 val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1115
1116	pr_devel("%s NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1117 __func__,
1118 vcpu->arch.xive_saved_state.nsr,
1119 vcpu->arch.xive_saved_state.cppr,
1120 vcpu->arch.xive_saved_state.ipb,
1121 vcpu->arch.xive_saved_state.pipr,
1122 vcpu->arch.xive_saved_state.w01,
1123 (u32) vcpu->arch.xive_cam_word, opal_state);
1124
1125 return 0;
1126}
1127
1128int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1129{
1130 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1131 struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1132
1133 pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1134 val->xive_timaval[0], val->xive_timaval[1]);
1135
1136 if (!kvmppc_xive_enabled(vcpu))
1137 return -EPERM;
1138
1139 if (!xc || !xive)
1140 return -ENOENT;
1141
1142 /* We can't update the state of a "pushed" VCPU */
1143 if (WARN_ON(vcpu->arch.xive_pushed))
1144 return -EBUSY;
1145
1146 /*
1147 * Restore the thread context registers. IPB and CPPR should
1148 * be the only ones that matter.
1149 */
1150 vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1151
1152 /*
1153 * There is no need to restore the XIVE internal state (IPB
1154	 * stored in the NVT) as the IPB register was merged into the
1155	 * KVM VP state when it was captured.
1156 */
1157 return 0;
1158}
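
The IPB handling in kvmppc_xive_native_get_vp() above is plain bit arithmetic: OPAL returns a VP word whose bits 40-47 hold a backup of the 8-bit IPB, and only that byte is folded into the saved w01 thread-context word. A standalone sketch of the masking, with the cpu_to_be64() conversion left out and made-up values:

#include <stdint.h>
#include <stdio.h>

#define TM_IPB_SHIFT	40
#define TM_IPB_MASK	(((uint64_t)0xFF) << TM_IPB_SHIFT)

int main(void)
{
	uint64_t w01        = 0x00000000000000ffULL; /* saved thread context (illustrative) */
	uint64_t opal_state = 0x0000a50000000000ULL; /* VP state from OPAL (illustrative)   */

	/* Keep only the IPB byte of the OPAL word and merge it into w01. */
	uint64_t merged = w01 | (opal_state & TM_IPB_MASK);

	printf("IPB=%02llx merged w01=%016llx\n",
	       (unsigned long long)((merged & TM_IPB_MASK) >> TM_IPB_SHIFT),
	       (unsigned long long)merged);
	return 0;
}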
1159
1160static int xive_native_debug_show(struct seq_file *m, void *private)
1161{
1162 struct kvmppc_xive *xive = m->private;
1163 struct kvm *kvm = xive->kvm;
1164 struct kvm_vcpu *vcpu;
1165 unsigned int i;
1166
1167 if (!kvm)
1168 return 0;
1169
1170 seq_puts(m, "=========\nVCPU state\n=========\n");
1171
1172 kvm_for_each_vcpu(i, vcpu, kvm) {
1173 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1174
1175 if (!xc)
1176 continue;
1177
1178	seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1179 xc->server_num,
1180 vcpu->arch.xive_saved_state.nsr,
1181 vcpu->arch.xive_saved_state.cppr,
1182 vcpu->arch.xive_saved_state.ipb,
1183 vcpu->arch.xive_saved_state.pipr,
1184 vcpu->arch.xive_saved_state.w01,
1185 (u32) vcpu->arch.xive_cam_word);
1186
1187 kvmppc_xive_debug_show_queues(m, vcpu);
1188 }
1189
1190 return 0;
1191}
1192
1193static int xive_native_debug_open(struct inode *inode, struct file *file)
1194{
1195 return single_open(file, xive_native_debug_show, inode->i_private);
1196}
1197
1198static const struct file_operations xive_native_debug_fops = {
1199 .open = xive_native_debug_open,
1200 .read = seq_read,
1201 .llseek = seq_lseek,
1202 .release = single_release,
1203};
1204
1205static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1206{
1207 char *name;
1208
1209 name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
1210 if (!name) {
1211 pr_err("%s: no memory for name\n", __func__);
1212 return;
1213 }
1214
1215 xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
1216 xive, &xive_native_debug_fops);
1217
1218 pr_debug("%s: created %s\n", __func__, name);
1219 kfree(name);
1220}
1221
1222static void kvmppc_xive_native_init(struct kvm_device *dev)
1223{
1224 struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
1225
1226 /* Register some debug interfaces */
1227 xive_native_debugfs_init(xive);
1228}
1229
1230struct kvm_device_ops kvm_xive_native_ops = {
1231 .name = "kvm-xive-native",
1232 .create = kvmppc_xive_native_create,
1233 .init = kvmppc_xive_native_init,
1234 .release = kvmppc_xive_native_release,
1235 .set_attr = kvmppc_xive_native_set_attr,
1236 .get_attr = kvmppc_xive_native_get_attr,
1237 .has_attr = kvmppc_xive_native_has_attr,
1238 .mmap = kvmppc_xive_native_mmap,
1239};
1240
1241void kvmppc_xive_native_init_module(void)
1242{
1243 ;
1244}
1245
1246void kvmppc_xive_native_exit_module(void)
1247{
1248 ;
1249}
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 033363d6e764..0737acfd17f1 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -130,24 +130,14 @@ static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
130 */ 130 */
131 prio = ffs(pending) - 1; 131 prio = ffs(pending) - 1;
132 132
133 /*
134 * If the most favoured prio we found pending is less
135 * favored (or equal) than a pending IPI, we return
136 * the IPI instead.
137 *
138 * Note: If pending was 0 and mfrr is 0xff, we will
139 * not spurriously take an IPI because mfrr cannot
140 * then be smaller than cppr.
141 */
142 if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
143 prio = xc->mfrr;
144 hirq = XICS_IPI;
145 break;
146 }
147
148 /* Don't scan past the guest cppr */ 133 /* Don't scan past the guest cppr */
149 if (prio >= xc->cppr || prio > 7) 134 if (prio >= xc->cppr || prio > 7) {
135 if (xc->mfrr < xc->cppr) {
136 prio = xc->mfrr;
137 hirq = XICS_IPI;
138 }
150 break; 139 break;
140 }
151 141
152 /* Grab queue and pointers */ 142 /* Grab queue and pointers */
153 q = &xc->queues[prio]; 143 q = &xc->queues[prio];
@@ -184,9 +174,12 @@ skip_ipi:
184 * been set and another occurrence of the IPI will trigger. 174 * been set and another occurrence of the IPI will trigger.
185 */ 175 */
186 if (hirq == XICS_IPI || (prio == 0 && !qpage)) { 176 if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
187 if (scan_type == scan_fetch) 177 if (scan_type == scan_fetch) {
188 GLUE(X_PFX,source_eoi)(xc->vp_ipi, 178 GLUE(X_PFX,source_eoi)(xc->vp_ipi,
189 &xc->vp_ipi_data); 179 &xc->vp_ipi_data);
180 q->idx = idx;
181 q->toggle = toggle;
182 }
190 /* Loop back on same queue with updated idx/toggle */ 183 /* Loop back on same queue with updated idx/toggle */
191#ifdef XIVE_RUNTIME_CHECKS 184#ifdef XIVE_RUNTIME_CHECKS
192 WARN_ON(hirq && hirq != XICS_IPI); 185 WARN_ON(hirq && hirq != XICS_IPI);
@@ -199,32 +192,41 @@ skip_ipi:
199 if (hirq == XICS_DUMMY) 192 if (hirq == XICS_DUMMY)
200 goto skip_ipi; 193 goto skip_ipi;
201 194
202 /* If fetching, update queue pointers */ 195 /* Clear the pending bit if the queue is now empty */
203 if (scan_type == scan_fetch) { 196 if (!hirq) {
204 q->idx = idx; 197 pending &= ~(1 << prio);
205 q->toggle = toggle;
206 }
207
208 /* Something found, stop searching */
209 if (hirq)
210 break;
211
212 /* Clear the pending bit on the now empty queue */
213 pending &= ~(1 << prio);
214 198
215 /* 199 /*
216 * Check if the queue count needs adjusting due to 200 * Check if the queue count needs adjusting due to
217 * interrupts being moved away. 201 * interrupts being moved away.
218 */ 202 */
219 if (atomic_read(&q->pending_count)) { 203 if (atomic_read(&q->pending_count)) {
220 int p = atomic_xchg(&q->pending_count, 0); 204 int p = atomic_xchg(&q->pending_count, 0);
221 if (p) { 205 if (p) {
222#ifdef XIVE_RUNTIME_CHECKS 206#ifdef XIVE_RUNTIME_CHECKS
223 WARN_ON(p > atomic_read(&q->count)); 207 WARN_ON(p > atomic_read(&q->count));
224#endif 208#endif
225 atomic_sub(p, &q->count); 209 atomic_sub(p, &q->count);
210 }
226 } 211 }
227 } 212 }
213
214 /*
215 * If the most favoured prio we found pending is less
216 * favored (or equal) than a pending IPI, we return
217 * the IPI instead.
218 */
219 if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
220 prio = xc->mfrr;
221 hirq = XICS_IPI;
222 break;
223 }
224
225 /* If fetching, update queue pointers */
226 if (scan_type == scan_fetch) {
227 q->idx = idx;
228 q->toggle = toggle;
229 }
228 } 230 }
229 231
230 /* If we are just taking a "peek", do nothing else */ 232 /* If we are just taking a "peek", do nothing else */
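
The book3s_xive_template.c hunk above moves the IPI arbitration after the queue bookkeeping, but the decision itself stays a simple priority comparison in which lower values are more favoured. A simplified, self-contained model of that decision, with the queues, the toggle handling and the XICS_IPI value treated as placeholders:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define XICS_IPI	2	/* stand-in value, for illustration only */

/*
 * Pick the interrupt to present: the most favoured pending queue priority,
 * unless a pending IPI (mfrr) is at least as favoured and beats the CPPR.
 * Lower numbers are more favoured; 0xff means "no IPI pending".
 */
static uint32_t pick_irq(uint8_t pending, uint8_t cppr, uint8_t mfrr, int *out_prio)
{
	int prio = pending ? ffs(pending) - 1 : 0xff;

	/* Nothing in the queues that beats the CPPR: maybe take the IPI. */
	if (prio >= cppr || prio > 7) {
		if (mfrr < cppr) {
			*out_prio = mfrr;
			return XICS_IPI;
		}
		*out_prio = 0xff;
		return 0;		/* nothing to present */
	}

	/* A queue interrupt is pending, but an equally or more favoured IPI wins. */
	if (prio >= mfrr && mfrr < cppr) {
		*out_prio = mfrr;
		return XICS_IPI;
	}

	*out_prio = prio;
	return 1;			/* "some queue interrupt" placeholder */
}

int main(void)
{
	int prio;
	uint32_t hirq = pick_irq(0x10 /* prio 4 pending */, 6 /* cppr */, 3 /* mfrr */, &prio);

	printf("hirq=%u prio=%d\n", hirq, prio);	/* IPI wins: prio 3 < 4 */
	return 0;
}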
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8885377ec3e0..3393b166817a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -570,6 +570,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
570 case KVM_CAP_PPC_GET_CPU_CHAR: 570 case KVM_CAP_PPC_GET_CPU_CHAR:
571 r = 1; 571 r = 1;
572 break; 572 break;
573#ifdef CONFIG_KVM_XIVE
574 case KVM_CAP_PPC_IRQ_XIVE:
575 /*
576	 * We need XIVE to be enabled on the platform (which implies
577	 * a POWER9 processor) and to be running on the PowerNV
578	 * platform, as nested virtualization is not yet supported.
579 */
580 r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE);
581 break;
582#endif
573 583
574 case KVM_CAP_PPC_ALLOC_HTAB: 584 case KVM_CAP_PPC_ALLOC_HTAB:
575 r = hv_enabled; 585 r = hv_enabled;
@@ -644,9 +654,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
644 else 654 else
645 r = num_online_cpus(); 655 r = num_online_cpus();
646 break; 656 break;
647 case KVM_CAP_NR_MEMSLOTS:
648 r = KVM_USER_MEM_SLOTS;
649 break;
650 case KVM_CAP_MAX_VCPUS: 657 case KVM_CAP_MAX_VCPUS:
651 r = KVM_MAX_VCPUS; 658 r = KVM_MAX_VCPUS;
652 break; 659 break;
@@ -753,6 +760,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
753 else 760 else
754 kvmppc_xics_free_icp(vcpu); 761 kvmppc_xics_free_icp(vcpu);
755 break; 762 break;
763 case KVMPPC_IRQ_XIVE:
764 kvmppc_xive_native_cleanup_vcpu(vcpu);
765 break;
756 } 766 }
757 767
758 kvmppc_core_vcpu_free(vcpu); 768 kvmppc_core_vcpu_free(vcpu);
@@ -1941,6 +1951,30 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
1941 break; 1951 break;
1942 } 1952 }
1943#endif /* CONFIG_KVM_XICS */ 1953#endif /* CONFIG_KVM_XICS */
1954#ifdef CONFIG_KVM_XIVE
1955 case KVM_CAP_PPC_IRQ_XIVE: {
1956 struct fd f;
1957 struct kvm_device *dev;
1958
1959 r = -EBADF;
1960 f = fdget(cap->args[0]);
1961 if (!f.file)
1962 break;
1963
1964 r = -ENXIO;
1965 if (!xive_enabled())
1966 break;
1967
1968 r = -EPERM;
1969 dev = kvm_device_from_filp(f.file);
1970 if (dev)
1971 r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
1972 cap->args[1]);
1973
1974 fdput(f);
1975 break;
1976 }
1977#endif /* CONFIG_KVM_XIVE */
1944#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 1978#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
1945 case KVM_CAP_PPC_FWNMI: 1979 case KVM_CAP_PPC_FWNMI:
1946 r = -EINVAL; 1980 r = -EINVAL;
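
The new KVM_CAP_PPC_IRQ_XIVE enable-cap path above takes the XIVE device fd in args[0] and forwards args[1] to kvmppc_xive_native_connect_vcpu(), treated here as the vCPU's server number. A hypothetical userspace sketch of connecting one vCPU (fd handling and error paths trimmed):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Attach a vCPU to the XIVE native device created with KVM_CREATE_DEVICE. */
static int xive_connect_vcpu(int vcpu_fd, int xive_fd, unsigned long server)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap     = KVM_CAP_PPC_IRQ_XIVE;	/* capability added by this series */
	cap.args[0] = xive_fd;			/* device fd, resolved via fdget() in the kernel */
	cap.args[1] = server;			/* vCPU interrupt server number (assumed meaning) */

	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP(PPC_IRQ_XIVE)");
		return -1;
	}
	return 0;
}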
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 0c037e933e55..7782201e5fe8 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -521,6 +521,9 @@ u32 xive_native_default_eq_shift(void)
521} 521}
522EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); 522EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
523 523
524unsigned long xive_tima_os;
525EXPORT_SYMBOL_GPL(xive_tima_os);
526
524bool __init xive_native_init(void) 527bool __init xive_native_init(void)
525{ 528{
526 struct device_node *np; 529 struct device_node *np;
@@ -573,6 +576,14 @@ bool __init xive_native_init(void)
573 for_each_possible_cpu(cpu) 576 for_each_possible_cpu(cpu)
574 kvmppc_set_xive_tima(cpu, r.start, tima); 577 kvmppc_set_xive_tima(cpu, r.start, tima);
575 578
579 /* Resource 2 is OS window */
580 if (of_address_to_resource(np, 2, &r)) {
581 pr_err("Failed to get thread mgmnt area resource\n");
582 return false;
583 }
584
585 xive_tima_os = r.start;
586
576 /* Grab size of provisionning pages */ 587 /* Grab size of provisionning pages */
577 xive_parse_provisioning(np); 588 xive_parse_provisioning(np);
578 589
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index f316de40e51b..27696755daa9 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -28,6 +28,7 @@
28#define CPACF_KMCTR 0xb92d /* MSA4 */ 28#define CPACF_KMCTR 0xb92d /* MSA4 */
29#define CPACF_PRNO 0xb93c /* MSA5 */ 29#define CPACF_PRNO 0xb93c /* MSA5 */
30#define CPACF_KMA 0xb929 /* MSA8 */ 30#define CPACF_KMA 0xb929 /* MSA8 */
31#define CPACF_KDSA 0xb93a /* MSA9 */
31 32
32/* 33/*
33 * En/decryption modifier bits 34 * En/decryption modifier bits
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index c47e22bba87f..bdbc81b5bc91 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -278,6 +278,7 @@ struct kvm_s390_sie_block {
278#define ECD_HOSTREGMGMT 0x20000000 278#define ECD_HOSTREGMGMT 0x20000000
279#define ECD_MEF 0x08000000 279#define ECD_MEF 0x08000000
280#define ECD_ETOKENF 0x02000000 280#define ECD_ETOKENF 0x02000000
281#define ECD_ECC 0x00200000
281 __u32 ecd; /* 0x01c8 */ 282 __u32 ecd; /* 0x01c8 */
282 __u8 reserved1cc[18]; /* 0x01cc */ 283 __u8 reserved1cc[18]; /* 0x01cc */
283 __u64 pp; /* 0x01de */ 284 __u64 pp; /* 0x01de */
@@ -312,6 +313,7 @@ struct kvm_vcpu_stat {
312 u64 halt_successful_poll; 313 u64 halt_successful_poll;
313 u64 halt_attempted_poll; 314 u64 halt_attempted_poll;
314 u64 halt_poll_invalid; 315 u64 halt_poll_invalid;
316 u64 halt_no_poll_steal;
315 u64 halt_wakeup; 317 u64 halt_wakeup;
316 u64 instruction_lctl; 318 u64 instruction_lctl;
317 u64 instruction_lctlg; 319 u64 instruction_lctlg;
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 16511d97e8dc..47104e5b47fd 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -152,7 +152,10 @@ struct kvm_s390_vm_cpu_subfunc {
152 __u8 pcc[16]; /* with MSA4 */ 152 __u8 pcc[16]; /* with MSA4 */
153 __u8 ppno[16]; /* with MSA5 */ 153 __u8 ppno[16]; /* with MSA5 */
154 __u8 kma[16]; /* with MSA8 */ 154 __u8 kma[16]; /* with MSA8 */
155 __u8 reserved[1808]; 155 __u8 kdsa[16]; /* with MSA9 */
156 __u8 sortl[32]; /* with STFLE.150 */
157 __u8 dfltcc[32]; /* with STFLE.151 */
158 __u8 reserved[1728];
156}; 159};
157 160
158/* kvm attributes for crypto */ 161/* kvm attributes for crypto */
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 1816ee48eadd..d3db3d7ed077 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -30,6 +30,7 @@ config KVM
30 select HAVE_KVM_IRQFD 30 select HAVE_KVM_IRQFD
31 select HAVE_KVM_IRQ_ROUTING 31 select HAVE_KVM_IRQ_ROUTING
32 select HAVE_KVM_INVALID_WAKEUPS 32 select HAVE_KVM_INVALID_WAKEUPS
33 select HAVE_KVM_NO_POLL
33 select SRCU 34 select SRCU
34 select KVM_VFIO 35 select KVM_VFIO
35 ---help--- 36 ---help---
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 1fd706f6206c..9dde4d7d8704 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -14,6 +14,7 @@
14#include <linux/kvm_host.h> 14#include <linux/kvm_host.h>
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16#include <linux/mmu_context.h> 16#include <linux/mmu_context.h>
17#include <linux/nospec.h>
17#include <linux/signal.h> 18#include <linux/signal.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/bitmap.h> 20#include <linux/bitmap.h>
@@ -2307,6 +2308,7 @@ static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id)
2307{ 2308{
2308 if (id >= MAX_S390_IO_ADAPTERS) 2309 if (id >= MAX_S390_IO_ADAPTERS)
2309 return NULL; 2310 return NULL;
2311 id = array_index_nospec(id, MAX_S390_IO_ADAPTERS);
2310 return kvm->arch.adapters[id]; 2312 return kvm->arch.adapters[id];
2311} 2313}
2312 2314
@@ -2320,8 +2322,13 @@ static int register_io_adapter(struct kvm_device *dev,
2320 (void __user *)attr->addr, sizeof(adapter_info))) 2322 (void __user *)attr->addr, sizeof(adapter_info)))
2321 return -EFAULT; 2323 return -EFAULT;
2322 2324
2323 if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) || 2325 if (adapter_info.id >= MAX_S390_IO_ADAPTERS)
2324 (dev->kvm->arch.adapters[adapter_info.id] != NULL)) 2326 return -EINVAL;
2327
2328 adapter_info.id = array_index_nospec(adapter_info.id,
2329 MAX_S390_IO_ADAPTERS);
2330
2331 if (dev->kvm->arch.adapters[adapter_info.id] != NULL)
2325 return -EINVAL; 2332 return -EINVAL;
2326 2333
2327 adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); 2334 adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
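
The array_index_nospec() calls added above follow the usual pattern: bounds-check the index first, then clamp it so a mispredicted branch cannot index out of range under speculation. A rough, illustrative model of the generic mask-based fallback (the real helper lives in <linux/nospec.h> and has per-arch optimisations):

#include <stdio.h>
#include <stddef.h>

/*
 * Simplified model of array_index_nospec(): returns index when index < size,
 * and 0 otherwise, without a data-dependent branch. This mirrors the generic
 * mask-based fallback and is illustrative only.
 */
static size_t index_nospec(size_t index, size_t size)
{
	/* All-ones when index < size, all-zeroes otherwise. */
	size_t mask = (size_t)(~(long)(index | (size - 1 - index)) >> (sizeof(long) * 8 - 1));

	return index & mask;
}

int main(void)
{
	printf("%zu %zu\n", index_nospec(3, 8), index_nospec(12, 8));	/* prints: 3 0 */
	return 0;
}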
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4638303ba6a8..8d6d75db8de6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -75,6 +75,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
75 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, 75 { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
76 { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, 76 { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
77 { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, 77 { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
78 { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) },
78 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 79 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
79 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, 80 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
80 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 81 { "instruction_lctl", VCPU_STAT(instruction_lctl) },
@@ -177,6 +178,11 @@ static int hpage;
177module_param(hpage, int, 0444); 178module_param(hpage, int, 0444);
178MODULE_PARM_DESC(hpage, "1m huge page backing support"); 179MODULE_PARM_DESC(hpage, "1m huge page backing support");
179 180
181/* maximum percentage of steal time for polling. >100 is treated like 100 */
182static u8 halt_poll_max_steal = 10;
183module_param(halt_poll_max_steal, byte, 0644);
184MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling");
185
180/* 186/*
181 * For now we handle at most 16 double words as this is what the s390 base 187 * For now we handle at most 16 double words as this is what the s390 base
182 * kernel handles and stores in the prefix page. If we ever need to go beyond 188 * kernel handles and stores in the prefix page. If we ever need to go beyond
@@ -321,6 +327,22 @@ static inline int plo_test_bit(unsigned char nr)
321 return cc == 0; 327 return cc == 0;
322} 328}
323 329
330static inline void __insn32_query(unsigned int opcode, u8 query[32])
331{
332 register unsigned long r0 asm("0") = 0; /* query function */
333 register unsigned long r1 asm("1") = (unsigned long) query;
334
335 asm volatile(
336 /* Parameter regs are ignored */
337 " .insn rrf,%[opc] << 16,2,4,6,0\n"
338 : "=m" (*query)
339 : "d" (r0), "a" (r1), [opc] "i" (opcode)
340 : "cc");
341}
342
343#define INSN_SORTL 0xb938
344#define INSN_DFLTCC 0xb939
345
324static void kvm_s390_cpu_feat_init(void) 346static void kvm_s390_cpu_feat_init(void)
325{ 347{
326 int i; 348 int i;
@@ -368,6 +390,16 @@ static void kvm_s390_cpu_feat_init(void)
368 __cpacf_query(CPACF_KMA, (cpacf_mask_t *) 390 __cpacf_query(CPACF_KMA, (cpacf_mask_t *)
369 kvm_s390_available_subfunc.kma); 391 kvm_s390_available_subfunc.kma);
370 392
393 if (test_facility(155)) /* MSA9 */
394 __cpacf_query(CPACF_KDSA, (cpacf_mask_t *)
395 kvm_s390_available_subfunc.kdsa);
396
397 if (test_facility(150)) /* SORTL */
398 __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
399
400 if (test_facility(151)) /* DFLTCC */
401 __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
402
371 if (MACHINE_HAS_ESOP) 403 if (MACHINE_HAS_ESOP)
372 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); 404 allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
373 /* 405 /*
@@ -513,9 +545,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
513 else if (sclp.has_esca && sclp.has_64bscao) 545 else if (sclp.has_esca && sclp.has_64bscao)
514 r = KVM_S390_ESCA_CPU_SLOTS; 546 r = KVM_S390_ESCA_CPU_SLOTS;
515 break; 547 break;
516 case KVM_CAP_NR_MEMSLOTS:
517 r = KVM_USER_MEM_SLOTS;
518 break;
519 case KVM_CAP_S390_COW: 548 case KVM_CAP_S390_COW:
520 r = MACHINE_HAS_ESOP; 549 r = MACHINE_HAS_ESOP;
521 break; 550 break;
@@ -657,6 +686,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
657 set_kvm_facility(kvm->arch.model.fac_mask, 135); 686 set_kvm_facility(kvm->arch.model.fac_mask, 135);
658 set_kvm_facility(kvm->arch.model.fac_list, 135); 687 set_kvm_facility(kvm->arch.model.fac_list, 135);
659 } 688 }
689 if (test_facility(148)) {
690 set_kvm_facility(kvm->arch.model.fac_mask, 148);
691 set_kvm_facility(kvm->arch.model.fac_list, 148);
692 }
693 if (test_facility(152)) {
694 set_kvm_facility(kvm->arch.model.fac_mask, 152);
695 set_kvm_facility(kvm->arch.model.fac_list, 152);
696 }
660 r = 0; 697 r = 0;
661 } else 698 } else
662 r = -EINVAL; 699 r = -EINVAL;
@@ -1323,6 +1360,19 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
1323 VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx", 1360 VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx",
1324 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], 1361 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0],
1325 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); 1362 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]);
1363 VM_EVENT(kvm, 3, "SET: guest KDSA subfunc 0x%16.16lx.%16.16lx",
1364 ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0],
1365 ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]);
1366 VM_EVENT(kvm, 3, "SET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1367 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0],
1368 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1],
1369 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2],
1370 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]);
1371 VM_EVENT(kvm, 3, "SET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1372 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0],
1373 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
1374 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
1375 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
1326 1376
1327 return 0; 1377 return 0;
1328} 1378}
@@ -1491,6 +1541,19 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
1491 VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx", 1541 VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx",
1492 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0], 1542 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0],
1493 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]); 1543 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]);
1544 VM_EVENT(kvm, 3, "GET: guest KDSA subfunc 0x%16.16lx.%16.16lx",
1545 ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[0],
1546 ((unsigned long *) &kvm->arch.model.subfuncs.kdsa)[1]);
1547 VM_EVENT(kvm, 3, "GET: guest SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1548 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[0],
1549 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[1],
1550 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[2],
1551 ((unsigned long *) &kvm->arch.model.subfuncs.sortl)[3]);
1552 VM_EVENT(kvm, 3, "GET: guest DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1553 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[0],
1554 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
1555 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
1556 ((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
1494 1557
1495 return 0; 1558 return 0;
1496} 1559}
@@ -1546,6 +1609,19 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
1546 VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx", 1609 VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx",
1547 ((unsigned long *) &kvm_s390_available_subfunc.kma)[0], 1610 ((unsigned long *) &kvm_s390_available_subfunc.kma)[0],
1548 ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]); 1611 ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]);
1612 VM_EVENT(kvm, 3, "GET: host KDSA subfunc 0x%16.16lx.%16.16lx",
1613 ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[0],
1614 ((unsigned long *) &kvm_s390_available_subfunc.kdsa)[1]);
1615 VM_EVENT(kvm, 3, "GET: host SORTL subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1616 ((unsigned long *) &kvm_s390_available_subfunc.sortl)[0],
1617 ((unsigned long *) &kvm_s390_available_subfunc.sortl)[1],
1618 ((unsigned long *) &kvm_s390_available_subfunc.sortl)[2],
1619 ((unsigned long *) &kvm_s390_available_subfunc.sortl)[3]);
1620 VM_EVENT(kvm, 3, "GET: host DFLTCC subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1621 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[0],
1622 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1],
1623 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2],
1624 ((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]);
1549 1625
1550 return 0; 1626 return 0;
1551} 1627}
@@ -2817,6 +2893,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
2817 vcpu->arch.enabled_gmap = vcpu->arch.gmap; 2893 vcpu->arch.enabled_gmap = vcpu->arch.gmap;
2818} 2894}
2819 2895
2896static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr)
2897{
2898 if (test_bit_inv(nr, (unsigned long *)&kvm->arch.model.subfuncs.pckmo) &&
2899 test_bit_inv(nr, (unsigned long *)&kvm_s390_available_subfunc.pckmo))
2900 return true;
2901 return false;
2902}
2903
2904static bool kvm_has_pckmo_ecc(struct kvm *kvm)
2905{
2906 /* At least one ECC subfunction must be present */
2907 return kvm_has_pckmo_subfunc(kvm, 32) ||
2908 kvm_has_pckmo_subfunc(kvm, 33) ||
2909 kvm_has_pckmo_subfunc(kvm, 34) ||
2910 kvm_has_pckmo_subfunc(kvm, 40) ||
2911 kvm_has_pckmo_subfunc(kvm, 41);
2912
2913}
2914
2820static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) 2915static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
2821{ 2916{
2822 /* 2917 /*
@@ -2829,13 +2924,19 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
2829 vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; 2924 vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
2830 vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); 2925 vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
2831 vcpu->arch.sie_block->eca &= ~ECA_APIE; 2926 vcpu->arch.sie_block->eca &= ~ECA_APIE;
2927 vcpu->arch.sie_block->ecd &= ~ECD_ECC;
2832 2928
2833 if (vcpu->kvm->arch.crypto.apie) 2929 if (vcpu->kvm->arch.crypto.apie)
2834 vcpu->arch.sie_block->eca |= ECA_APIE; 2930 vcpu->arch.sie_block->eca |= ECA_APIE;
2835 2931
2836 /* Set up protected key support */ 2932 /* Set up protected key support */
2837 if (vcpu->kvm->arch.crypto.aes_kw) 2933 if (vcpu->kvm->arch.crypto.aes_kw) {
2838 vcpu->arch.sie_block->ecb3 |= ECB3_AES; 2934 vcpu->arch.sie_block->ecb3 |= ECB3_AES;
2935 /* ecc is also wrapped with AES key */
2936 if (kvm_has_pckmo_ecc(vcpu->kvm))
2937 vcpu->arch.sie_block->ecd |= ECD_ECC;
2938 }
2939
2839 if (vcpu->kvm->arch.crypto.dea_kw) 2940 if (vcpu->kvm->arch.crypto.dea_kw)
2840 vcpu->arch.sie_block->ecb3 |= ECB3_DEA; 2941 vcpu->arch.sie_block->ecb3 |= ECB3_DEA;
2841} 2942}
@@ -3068,6 +3169,17 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
3068 } 3169 }
3069} 3170}
3070 3171
3172bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
3173{
3174 /* do not poll with more than halt_poll_max_steal percent of steal time */
3175 if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >=
3176 halt_poll_max_steal) {
3177 vcpu->stat.halt_no_poll_steal++;
3178 return true;
3179 }
3180 return false;
3181}
3182
3071int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 3183int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
3072{ 3184{
3073 /* kvm common code refers to this, but never calls it */ 3185 /* kvm common code refers to this, but never calls it */
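
kvm_arch_no_poll() above disables halt polling once steal time crosses the halt_poll_max_steal percentage. Stripped of the s390-specific unit conversion (TICK_USEC << 12 is treated here as an opaque units-per-tick constant), the check is just a percentage comparison:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Skip halt polling when the stolen share of a tick exceeds the configured limit. */
static bool no_poll(uint64_t avg_steal_units, uint64_t units_per_tick,
		    unsigned int max_steal_percent)
{
	return avg_steal_units * 100 / units_per_tick >= max_steal_percent;
}

int main(void)
{
	/* With 15% of a tick stolen and a 10% limit, polling is suppressed. */
	printf("%d\n", no_poll(15, 100, 10));	/* prints 1 */
	return 0;
}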
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index d62fa148558b..076090f9e666 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -288,7 +288,9 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
288 const u32 crycb_addr = crycbd_o & 0x7ffffff8U; 288 const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
289 unsigned long *b1, *b2; 289 unsigned long *b1, *b2;
290 u8 ecb3_flags; 290 u8 ecb3_flags;
291 u32 ecd_flags;
291 int apie_h; 292 int apie_h;
293 int apie_s;
292 int key_msk = test_kvm_facility(vcpu->kvm, 76); 294 int key_msk = test_kvm_facility(vcpu->kvm, 76);
293 int fmt_o = crycbd_o & CRYCB_FORMAT_MASK; 295 int fmt_o = crycbd_o & CRYCB_FORMAT_MASK;
294 int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK; 296 int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK;
@@ -297,7 +299,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
297 scb_s->crycbd = 0; 299 scb_s->crycbd = 0;
298 300
299 apie_h = vcpu->arch.sie_block->eca & ECA_APIE; 301 apie_h = vcpu->arch.sie_block->eca & ECA_APIE;
300 if (!apie_h && (!key_msk || fmt_o == CRYCB_FORMAT0)) 302 apie_s = apie_h & scb_o->eca;
303 if (!apie_s && (!key_msk || (fmt_o == CRYCB_FORMAT0)))
301 return 0; 304 return 0;
302 305
303 if (!crycb_addr) 306 if (!crycb_addr)
@@ -308,7 +311,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
308 ((crycb_addr + 128) & PAGE_MASK)) 311 ((crycb_addr + 128) & PAGE_MASK))
309 return set_validity_icpt(scb_s, 0x003CU); 312 return set_validity_icpt(scb_s, 0x003CU);
310 313
311 if (apie_h && (scb_o->eca & ECA_APIE)) { 314 if (apie_s) {
312 ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr, 315 ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr,
313 vcpu->kvm->arch.crypto.crycb, 316 vcpu->kvm->arch.crypto.crycb,
314 fmt_o, fmt_h); 317 fmt_o, fmt_h);
@@ -320,7 +323,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
320 /* we may only allow it if enabled for guest 2 */ 323 /* we may only allow it if enabled for guest 2 */
321 ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & 324 ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
322 (ECB3_AES | ECB3_DEA); 325 (ECB3_AES | ECB3_DEA);
323 if (!ecb3_flags) 326 ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC;
327 if (!ecb3_flags && !ecd_flags)
324 goto end; 328 goto end;
325 329
326 /* copy only the wrapping keys */ 330 /* copy only the wrapping keys */
@@ -329,6 +333,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
329 return set_validity_icpt(scb_s, 0x0035U); 333 return set_validity_icpt(scb_s, 0x0035U);
330 334
331 scb_s->ecb3 |= ecb3_flags; 335 scb_s->ecb3 |= ecb3_flags;
336 scb_s->ecd |= ecd_flags;
332 337
333 /* xor both blocks in one run */ 338 /* xor both blocks in one run */
334 b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; 339 b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
@@ -339,7 +344,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
339end: 344end:
340 switch (ret) { 345 switch (ret) {
341 case -EINVAL: 346 case -EINVAL:
342 return set_validity_icpt(scb_s, 0x0020U); 347 return set_validity_icpt(scb_s, 0x0022U);
343 case -EFAULT: 348 case -EFAULT:
344 return set_validity_icpt(scb_s, 0x0035U); 349 return set_validity_icpt(scb_s, 0x0035U);
345 case -EACCES: 350 case -EACCES:
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index fd788e0f2e5b..cead9e0dcffb 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -93,6 +93,9 @@ static struct facility_def facility_defs[] = {
93 131, /* enhanced-SOP 2 and side-effect */ 93 131, /* enhanced-SOP 2 and side-effect */
94 139, /* multiple epoch facility */ 94 139, /* multiple epoch facility */
95 146, /* msa extension 8 */ 95 146, /* msa extension 8 */
96 150, /* enhanced sort */
97 151, /* deflate conversion */
98 155, /* msa extension 9 */
96 -1 /* END */ 99 -1 /* END */
97 } 100 }
98 }, 101 },
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 12ec402f4114..546d13e436aa 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2384,7 +2384,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
2384 */ 2384 */
2385 if (__test_and_clear_bit(55, (unsigned long *)&status)) { 2385 if (__test_and_clear_bit(55, (unsigned long *)&status)) {
2386 handled++; 2386 handled++;
2387 intel_pt_interrupt(); 2387 if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
2388 perf_guest_cbs->handle_intel_pt_intr))
2389 perf_guest_cbs->handle_intel_pt_intr();
2390 else
2391 intel_pt_interrupt();
2388 } 2392 }
2389 2393
2390 /* 2394 /*
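
The handle_pmi_common() change above hands a PT ToPA PMI to the guest when a handle_intel_pt_intr callback is registered and the CPU is currently in guest context, and falls back to the host handler otherwise. The dispatch pattern, modelled with simplified stand-ins for perf_guest_info_callbacks:

#include <stdio.h>

/* Simplified stand-in for perf_guest_info_callbacks. */
struct guest_cbs {
	int  (*is_in_guest)(void);
	void (*handle_intel_pt_intr)(void);
};

static void host_pt_interrupt(void)  { puts("host PT handler"); }
static void guest_pt_interrupt(void) { puts("forwarded to guest"); }
static int  in_guest(void)           { return 1; }

static const struct guest_cbs cbs = {
	.is_in_guest          = in_guest,
	.handle_intel_pt_intr = guest_pt_interrupt,
};
static const struct guest_cbs *guest_cbs = &cbs;

static void on_topa_pmi(void)
{
	/* Prefer the guest callback only when one is registered and we are in guest context. */
	if (guest_cbs && guest_cbs->is_in_guest() && guest_cbs->handle_intel_pt_intr)
		guest_cbs->handle_intel_pt_intr();
	else
		host_pt_interrupt();
}

int main(void)
{
	on_topa_pmi();		/* prints "forwarded to guest" */
	return 0;
}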
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 62be73b23d5c..e8f58ddd06d9 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -10,6 +10,7 @@ extern struct e820_table *e820_table_firmware;
10 10
11extern unsigned long pci_mem_start; 11extern unsigned long pci_mem_start;
12 12
13extern bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type);
13extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type); 14extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type);
14extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); 15extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
15 16
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c79abe7ca093..450d69a1e6fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -470,6 +470,7 @@ struct kvm_pmu {
470 u64 global_ovf_ctrl; 470 u64 global_ovf_ctrl;
471 u64 counter_bitmask[2]; 471 u64 counter_bitmask[2];
472 u64 global_ctrl_mask; 472 u64 global_ctrl_mask;
473 u64 global_ovf_ctrl_mask;
473 u64 reserved_bits; 474 u64 reserved_bits;
474 u8 version; 475 u8 version;
475 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; 476 struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
@@ -781,6 +782,9 @@ struct kvm_vcpu_arch {
781 782
782 /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ 783 /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
783 bool l1tf_flush_l1d; 784 bool l1tf_flush_l1d;
785
786 /* AMD MSRC001_0015 Hardware Configuration */
787 u64 msr_hwcr;
784}; 788};
785 789
786struct kvm_lpage_info { 790struct kvm_lpage_info {
@@ -1168,7 +1172,8 @@ struct kvm_x86_ops {
1168 uint32_t guest_irq, bool set); 1172 uint32_t guest_irq, bool set);
1169 void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); 1173 void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
1170 1174
1171 int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); 1175 int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
1176 bool *expired);
1172 void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); 1177 void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
1173 1178
1174 void (*setup_mce)(struct kvm_vcpu *vcpu); 1179 void (*setup_mce)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 88dd202c8b00..979ef971cc78 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -789,6 +789,14 @@
789#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f 789#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
790#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 790#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
791 791
792/* PERF_GLOBAL_OVF_CTL bits */
793#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55
794#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT)
795#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT 62
796#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF_BIT)
797#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT 63
798#define MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD_BIT)
799
792/* Geode defined MSRs */ 800/* Geode defined MSRs */
793#define MSR_GEODE_BUSCONT_CONF0 0x00001900 801#define MSR_GEODE_BUSCONT_CONF0 0x00001900
794 802
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 2879e234e193..76dd605ee2a3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -73,12 +73,13 @@ EXPORT_SYMBOL(pci_mem_start);
73 * This function checks if any part of the range <start,end> is mapped 73 * This function checks if any part of the range <start,end> is mapped
74 * with type. 74 * with type.
75 */ 75 */
76bool e820__mapped_any(u64 start, u64 end, enum e820_type type) 76static bool _e820__mapped_any(struct e820_table *table,
77 u64 start, u64 end, enum e820_type type)
77{ 78{
78 int i; 79 int i;
79 80
80 for (i = 0; i < e820_table->nr_entries; i++) { 81 for (i = 0; i < table->nr_entries; i++) {
81 struct e820_entry *entry = &e820_table->entries[i]; 82 struct e820_entry *entry = &table->entries[i];
82 83
83 if (type && entry->type != type) 84 if (type && entry->type != type)
84 continue; 85 continue;
@@ -88,6 +89,17 @@ bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
88 } 89 }
89 return 0; 90 return 0;
90} 91}
92
93bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
94{
95 return _e820__mapped_any(e820_table_firmware, start, end, type);
96}
97EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
98
99bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
100{
101 return _e820__mapped_any(e820_table, start, end, type);
102}
91EXPORT_SYMBOL_GPL(e820__mapped_any); 103EXPORT_SYMBOL_GPL(e820__mapped_any);
92 104
93/* 105/*
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbbe611f0c49..80a642a0143d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -963,13 +963,13 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
963 if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) 963 if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
964 return 1; 964 return 1;
965 965
966 eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 966 eax = kvm_rax_read(vcpu);
967 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 967 ecx = kvm_rcx_read(vcpu);
968 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); 968 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
969 kvm_register_write(vcpu, VCPU_REGS_RAX, eax); 969 kvm_rax_write(vcpu, eax);
970 kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); 970 kvm_rbx_write(vcpu, ebx);
971 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); 971 kvm_rcx_write(vcpu, ecx);
972 kvm_register_write(vcpu, VCPU_REGS_RDX, edx); 972 kvm_rdx_write(vcpu, edx);
973 return kvm_skip_emulated_instruction(vcpu); 973 return kvm_skip_emulated_instruction(vcpu);
974} 974}
975EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 975EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index cc24b3a32c44..8ca4b39918e0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1535,10 +1535,10 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
1535 1535
1536 longmode = is_64_bit_mode(vcpu); 1536 longmode = is_64_bit_mode(vcpu);
1537 if (longmode) 1537 if (longmode)
1538 kvm_register_write(vcpu, VCPU_REGS_RAX, result); 1538 kvm_rax_write(vcpu, result);
1539 else { 1539 else {
1540 kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); 1540 kvm_rdx_write(vcpu, result >> 32);
1541 kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); 1541 kvm_rax_write(vcpu, result & 0xffffffff);
1542 } 1542 }
1543} 1543}
1544 1544
@@ -1611,18 +1611,18 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1611 longmode = is_64_bit_mode(vcpu); 1611 longmode = is_64_bit_mode(vcpu);
1612 1612
1613 if (!longmode) { 1613 if (!longmode) {
1614 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | 1614 param = ((u64)kvm_rdx_read(vcpu) << 32) |
1615 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); 1615 (kvm_rax_read(vcpu) & 0xffffffff);
1616 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | 1616 ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
1617 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); 1617 (kvm_rcx_read(vcpu) & 0xffffffff);
1618 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | 1618 outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
1619 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); 1619 (kvm_rsi_read(vcpu) & 0xffffffff);
1620 } 1620 }
1621#ifdef CONFIG_X86_64 1621#ifdef CONFIG_X86_64
1622 else { 1622 else {
1623 param = kvm_register_read(vcpu, VCPU_REGS_RCX); 1623 param = kvm_rcx_read(vcpu);
1624 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); 1624 ingpa = kvm_rdx_read(vcpu);
1625 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); 1625 outgpa = kvm_r8_read(vcpu);
1626 } 1626 }
1627#endif 1627#endif
1628 1628
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f8f56a93358b..1cc6c47dc77e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,34 @@
9 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 9 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
10 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) 10 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
11 11
12#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
13static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
14{ \
15 return vcpu->arch.regs[VCPU_REGS_##uname]; \
16} \
17static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
18 unsigned long val) \
19{ \
20 vcpu->arch.regs[VCPU_REGS_##uname] = val; \
21}
22BUILD_KVM_GPR_ACCESSORS(rax, RAX)
23BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
24BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
25BUILD_KVM_GPR_ACCESSORS(rdx, RDX)
26BUILD_KVM_GPR_ACCESSORS(rbp, RBP)
27BUILD_KVM_GPR_ACCESSORS(rsi, RSI)
28BUILD_KVM_GPR_ACCESSORS(rdi, RDI)
29#ifdef CONFIG_X86_64
30BUILD_KVM_GPR_ACCESSORS(r8, R8)
31BUILD_KVM_GPR_ACCESSORS(r9, R9)
32BUILD_KVM_GPR_ACCESSORS(r10, R10)
33BUILD_KVM_GPR_ACCESSORS(r11, R11)
34BUILD_KVM_GPR_ACCESSORS(r12, R12)
35BUILD_KVM_GPR_ACCESSORS(r13, R13)
36BUILD_KVM_GPR_ACCESSORS(r14, R14)
37BUILD_KVM_GPR_ACCESSORS(r15, R15)
38#endif
39
12static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 40static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
13 enum kvm_reg reg) 41 enum kvm_reg reg)
14{ 42{
@@ -37,6 +65,16 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
37 kvm_register_write(vcpu, VCPU_REGS_RIP, val); 65 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
38} 66}
39 67
68static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
69{
70 return kvm_register_read(vcpu, VCPU_REGS_RSP);
71}
72
73static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
74{
75 kvm_register_write(vcpu, VCPU_REGS_RSP, val);
76}
77
40static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 78static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
41{ 79{
42 might_sleep(); /* on svm */ 80 might_sleep(); /* on svm */
@@ -83,8 +121,8 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
83 121
84static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) 122static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
85{ 123{
86 return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) 124 return (kvm_rax_read(vcpu) & -1u)
87 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 125 | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32);
88} 126}
89 127
90static inline void enter_guest_mode(struct kvm_vcpu *vcpu) 128static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
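
BUILD_KVM_GPR_ACCESSORS above is a token-pasting macro: each invocation expands into a pair of trivial inline read/write helpers for one register, which is what lets callers such as kvm_rax_read() replace open-coded kvm_register_read() calls. A self-contained illustration of the expansion pattern, with simplified type and register names:

#include <stdio.h>

enum reg { REG_RAX, REG_RCX, NR_REGS };

struct vcpu { unsigned long regs[NR_REGS]; };

/* Same shape as BUILD_KVM_GPR_ACCESSORS: generate <name>_read()/<name>_write(). */
#define BUILD_GPR_ACCESSORS(lname, uname)				\
static inline unsigned long lname##_read(struct vcpu *v)		\
{									\
	return v->regs[REG_##uname];					\
}									\
static inline void lname##_write(struct vcpu *v, unsigned long val)	\
{									\
	v->regs[REG_##uname] = val;					\
}

BUILD_GPR_ACCESSORS(rax, RAX)
BUILD_GPR_ACCESSORS(rcx, RCX)

int main(void)
{
	struct vcpu v = { { 0 } };

	rax_write(&v, 0x1234);
	printf("rax=%#lx rcx=%#lx\n", rax_read(&v), rcx_read(&v));
	return 0;
}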
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bd13fdddbdc4..4924f83ed4f3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1454,7 +1454,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
1454 if (swait_active(q)) 1454 if (swait_active(q))
1455 swake_up_one(q); 1455 swake_up_one(q);
1456 1456
1457 if (apic_lvtt_tscdeadline(apic)) 1457 if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
1458 ktimer->expired_tscdeadline = ktimer->tscdeadline; 1458 ktimer->expired_tscdeadline = ktimer->tscdeadline;
1459} 1459}
1460 1460
@@ -1696,37 +1696,42 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
1696static bool start_hv_timer(struct kvm_lapic *apic) 1696static bool start_hv_timer(struct kvm_lapic *apic)
1697{ 1697{
1698 struct kvm_timer *ktimer = &apic->lapic_timer; 1698 struct kvm_timer *ktimer = &apic->lapic_timer;
1699 int r; 1699 struct kvm_vcpu *vcpu = apic->vcpu;
1700 bool expired;
1700 1701
1701 WARN_ON(preemptible()); 1702 WARN_ON(preemptible());
1702 if (!kvm_x86_ops->set_hv_timer) 1703 if (!kvm_x86_ops->set_hv_timer)
1703 return false; 1704 return false;
1704 1705
1705 if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
1706 return false;
1707
1708 if (!ktimer->tscdeadline) 1706 if (!ktimer->tscdeadline)
1709 return false; 1707 return false;
1710 1708
1711 r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); 1709 if (kvm_x86_ops->set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
1712 if (r < 0)
1713 return false; 1710 return false;
1714 1711
1715 ktimer->hv_timer_in_use = true; 1712 ktimer->hv_timer_in_use = true;
1716 hrtimer_cancel(&ktimer->timer); 1713 hrtimer_cancel(&ktimer->timer);
1717 1714
1718 /* 1715 /*
1719 * Also recheck ktimer->pending, in case the sw timer triggered in 1716 * To simplify handling the periodic timer, leave the hv timer running
1720 * the window. For periodic timer, leave the hv timer running for 1717 * even if the deadline timer has expired, i.e. rely on the resulting
1721 * simplicity, and the deadline will be recomputed on the next vmexit. 1718 * VM-Exit to recompute the periodic timer's target expiration.
1722 */ 1719 */
1723 if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { 1720 if (!apic_lvtt_period(apic)) {
1724 if (r) 1721 /*
1722 * Cancel the hv timer if the sw timer fired while the hv timer
1723 * was being programmed, or if the hv timer itself expired.
1724 */
1725 if (atomic_read(&ktimer->pending)) {
1726 cancel_hv_timer(apic);
1727 } else if (expired) {
1725 apic_timer_expired(apic); 1728 apic_timer_expired(apic);
1726 return false; 1729 cancel_hv_timer(apic);
1730 }
1727 } 1731 }
1728 1732
1729 trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); 1733 trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
1734
1730 return true; 1735 return true;
1731} 1736}
1732 1737
@@ -1750,8 +1755,13 @@ static void start_sw_timer(struct kvm_lapic *apic)
1750static void restart_apic_timer(struct kvm_lapic *apic) 1755static void restart_apic_timer(struct kvm_lapic *apic)
1751{ 1756{
1752 preempt_disable(); 1757 preempt_disable();
1758
1759 if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
1760 goto out;
1761
1753 if (!start_hv_timer(apic)) 1762 if (!start_hv_timer(apic))
1754 start_sw_timer(apic); 1763 start_sw_timer(apic);
1764out:
1755 preempt_enable(); 1765 preempt_enable();
1756} 1766}
1757 1767
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7b45d231f..1e9ba81accba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -44,6 +44,7 @@
44#include <asm/page.h> 44#include <asm/page.h>
45#include <asm/pat.h> 45#include <asm/pat.h>
46#include <asm/cmpxchg.h> 46#include <asm/cmpxchg.h>
47#include <asm/e820/api.h>
47#include <asm/io.h> 48#include <asm/io.h>
48#include <asm/vmx.h> 49#include <asm/vmx.h>
49#include <asm/kvm_page_track.h> 50#include <asm/kvm_page_track.h>
@@ -487,16 +488,24 @@ static void kvm_mmu_reset_all_pte_masks(void)
487 * If the CPU has 46 or less physical address bits, then set an 488 * If the CPU has 46 or less physical address bits, then set an
488 * appropriate mask to guard against L1TF attacks. Otherwise, it is 489 * appropriate mask to guard against L1TF attacks. Otherwise, it is
489 * assumed that the CPU is not vulnerable to L1TF. 490 * assumed that the CPU is not vulnerable to L1TF.
491 *
492 * Some Intel CPUs address the L1 cache using more PA bits than are
493 * reported by CPUID. Use the PA width of the L1 cache when possible
494 * to achieve more effective mitigation, e.g. if system RAM overlaps
495 * the most significant bits of legal physical address space.
490 */ 496 */
491 low_phys_bits = boot_cpu_data.x86_phys_bits; 497 shadow_nonpresent_or_rsvd_mask = 0;
492 if (boot_cpu_data.x86_phys_bits < 498 low_phys_bits = boot_cpu_data.x86_cache_bits;
499 if (boot_cpu_data.x86_cache_bits <
493 52 - shadow_nonpresent_or_rsvd_mask_len) { 500 52 - shadow_nonpresent_or_rsvd_mask_len) {
494 shadow_nonpresent_or_rsvd_mask = 501 shadow_nonpresent_or_rsvd_mask =
495 rsvd_bits(boot_cpu_data.x86_phys_bits - 502 rsvd_bits(boot_cpu_data.x86_cache_bits -
496 shadow_nonpresent_or_rsvd_mask_len, 503 shadow_nonpresent_or_rsvd_mask_len,
497 boot_cpu_data.x86_phys_bits - 1); 504 boot_cpu_data.x86_cache_bits - 1);
498 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; 505 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
499 } 506 } else
507 WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
508
500 shadow_nonpresent_or_rsvd_lower_gfn_mask = 509 shadow_nonpresent_or_rsvd_lower_gfn_mask =
501 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); 510 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
502} 511}
@@ -2892,7 +2901,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2892 */ 2901 */
2893 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); 2902 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2894 2903
2895 return true; 2904 return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2905 pfn_to_hpa(pfn + 1) - 1,
2906 E820_TYPE_RAM);
2896} 2907}
2897 2908
2898/* Bits which may be returned by set_spte() */ 2909/* Bits which may be returned by set_spte() */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index e9ea2d45ae66..9f72cc427158 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -48,11 +48,6 @@ static bool msr_mtrr_valid(unsigned msr)
48 return false; 48 return false;
49} 49}
50 50
51static bool valid_pat_type(unsigned t)
52{
53 return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
54}
55
56static bool valid_mtrr_type(unsigned t) 51static bool valid_mtrr_type(unsigned t)
57{ 52{
58 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ 53 return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
@@ -67,10 +62,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
67 return false; 62 return false;
68 63
69 if (msr == MSR_IA32_CR_PAT) { 64 if (msr == MSR_IA32_CR_PAT) {
70 for (i = 0; i < 8; i++) 65 return kvm_pat_valid(data);
71 if (!valid_pat_type((data >> (i * 8)) & 0xff))
72 return false;
73 return true;
74 } else if (msr == MSR_MTRRdefType) { 66 } else if (msr == MSR_MTRRdefType) {
75 if (data & ~0xcff) 67 if (data & ~0xcff)
76 return false; 68 return false;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 08715034e315..367a47df4ba0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
141 struct page *page; 141 struct page *page;
142 142
143 npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); 143 npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
144 /* Check if the user is doing something meaningless. */ 144 if (likely(npages == 1)) {
145 if (unlikely(npages != 1)) 145 table = kmap_atomic(page);
146 return -EFAULT; 146 ret = CMPXCHG(&table[index], orig_pte, new_pte);
147 147 kunmap_atomic(table);
148 table = kmap_atomic(page); 148
149 ret = CMPXCHG(&table[index], orig_pte, new_pte); 149 kvm_release_page_dirty(page);
150 kunmap_atomic(table); 150 } else {
151 151 struct vm_area_struct *vma;
152 kvm_release_page_dirty(page); 152 unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
153 unsigned long pfn;
154 unsigned long paddr;
155
156 down_read(&current->mm->mmap_sem);
157 vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
158 if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
159 up_read(&current->mm->mmap_sem);
160 return -EFAULT;
161 }
162 pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
163 paddr = pfn << PAGE_SHIFT;
164 table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
165 if (!table) {
166 up_read(&current->mm->mmap_sem);
167 return -EFAULT;
168 }
169 ret = CMPXCHG(&table[index], orig_pte, new_pte);
170 memunmap(table);
171 up_read(&current->mm->mmap_sem);
172 }
153 173
154 return (ret != orig_pte); 174 return (ret != orig_pte);
155} 175}
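
The new VM_PFNMAP branch in FNAME(cmpxchg_gpte) cannot rely on struct page, so it recomputes the physical address by hand: the page-aligned offset of the user address within the VMA, shifted to a pfn and added to the VMA's starting pfn (vm_pgoff). The arithmetic in isolation, with made-up addresses:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long vaddr    = 0x00007f3a12345678UL;	/* user address of the gpte (illustrative) */
	unsigned long vm_start = 0x00007f3a12300000UL;	/* VMA start (illustrative) */
	unsigned long vm_pgoff = 0x90000UL;		/* VMA's first pfn for VM_PFNMAP (illustrative) */

	unsigned long page_vaddr = vaddr & PAGE_MASK;
	unsigned long pfn   = ((page_vaddr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
	unsigned long paddr = pfn << PAGE_SHIFT;

	printf("pfn=%#lx paddr=%#lx offset-in-page=%#lx\n",
	       pfn, paddr, vaddr & ~PAGE_MASK);
	return 0;
}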
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b92eaf4a3b1..a849dcb7fbc5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2091,7 +2091,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2091 init_vmcb(svm); 2091 init_vmcb(svm);
2092 2092
2093 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); 2093 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
2094 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 2094 kvm_rdx_write(vcpu, eax);
2095 2095
2096 if (kvm_vcpu_apicv_active(vcpu) && !init_event) 2096 if (kvm_vcpu_apicv_active(vcpu) && !init_event)
2097 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); 2097 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
@@ -3071,32 +3071,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
3071 return false; 3071 return false;
3072} 3072}
3073 3073
3074static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
3075{
3076 struct page *page;
3077
3078 might_sleep();
3079
3080 page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
3081 if (is_error_page(page))
3082 goto error;
3083
3084 *_page = page;
3085
3086 return kmap(page);
3087
3088error:
3089 kvm_inject_gp(&svm->vcpu, 0);
3090
3091 return NULL;
3092}
3093
3094static void nested_svm_unmap(struct page *page)
3095{
3096 kunmap(page);
3097 kvm_release_page_dirty(page);
3098}
3099
3100static int nested_svm_intercept_ioio(struct vcpu_svm *svm) 3074static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
3101{ 3075{
3102 unsigned port, size, iopm_len; 3076 unsigned port, size, iopm_len;
@@ -3299,10 +3273,11 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
3299 3273
3300static int nested_svm_vmexit(struct vcpu_svm *svm) 3274static int nested_svm_vmexit(struct vcpu_svm *svm)
3301{ 3275{
3276 int rc;
3302 struct vmcb *nested_vmcb; 3277 struct vmcb *nested_vmcb;
3303 struct vmcb *hsave = svm->nested.hsave; 3278 struct vmcb *hsave = svm->nested.hsave;
3304 struct vmcb *vmcb = svm->vmcb; 3279 struct vmcb *vmcb = svm->vmcb;
3305 struct page *page; 3280 struct kvm_host_map map;
3306 3281
3307 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 3282 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
3308 vmcb->control.exit_info_1, 3283 vmcb->control.exit_info_1,
@@ -3311,9 +3286,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
3311 vmcb->control.exit_int_info_err, 3286 vmcb->control.exit_int_info_err,
3312 KVM_ISA_SVM); 3287 KVM_ISA_SVM);
3313 3288
3314 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); 3289 rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
3315 if (!nested_vmcb) 3290 if (rc) {
3291 if (rc == -EINVAL)
3292 kvm_inject_gp(&svm->vcpu, 0);
3316 return 1; 3293 return 1;
3294 }
3295
3296 nested_vmcb = map.hva;
3317 3297
3318 /* Exit Guest-Mode */ 3298 /* Exit Guest-Mode */
3319 leave_guest_mode(&svm->vcpu); 3299 leave_guest_mode(&svm->vcpu);
@@ -3408,16 +3388,16 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
3408 } else { 3388 } else {
3409 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 3389 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
3410 } 3390 }
3411 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 3391 kvm_rax_write(&svm->vcpu, hsave->save.rax);
3412 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); 3392 kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
3413 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); 3393 kvm_rip_write(&svm->vcpu, hsave->save.rip);
3414 svm->vmcb->save.dr7 = 0; 3394 svm->vmcb->save.dr7 = 0;
3415 svm->vmcb->save.cpl = 0; 3395 svm->vmcb->save.cpl = 0;
3416 svm->vmcb->control.exit_int_info = 0; 3396 svm->vmcb->control.exit_int_info = 0;
3417 3397
3418 mark_all_dirty(svm->vmcb); 3398 mark_all_dirty(svm->vmcb);
3419 3399
3420 nested_svm_unmap(page); 3400 kvm_vcpu_unmap(&svm->vcpu, &map, true);
3421 3401
3422 nested_svm_uninit_mmu_context(&svm->vcpu); 3402 nested_svm_uninit_mmu_context(&svm->vcpu);
3423 kvm_mmu_reset_context(&svm->vcpu); 3403 kvm_mmu_reset_context(&svm->vcpu);
@@ -3483,7 +3463,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
3483} 3463}
3484 3464
3485static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, 3465static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3486 struct vmcb *nested_vmcb, struct page *page) 3466 struct vmcb *nested_vmcb, struct kvm_host_map *map)
3487{ 3467{
3488 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) 3468 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
3489 svm->vcpu.arch.hflags |= HF_HIF_MASK; 3469 svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3516,9 +3496,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3516 kvm_mmu_reset_context(&svm->vcpu); 3496 kvm_mmu_reset_context(&svm->vcpu);
3517 3497
3518 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 3498 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
3519 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 3499 kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
3520 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 3500 kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
3521 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 3501 kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
3522 3502
3523 /* In case we don't even reach vcpu_run, the fields are not updated */ 3503 /* In case we don't even reach vcpu_run, the fields are not updated */
3524 svm->vmcb->save.rax = nested_vmcb->save.rax; 3504 svm->vmcb->save.rax = nested_vmcb->save.rax;
@@ -3567,7 +3547,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3567 svm->vmcb->control.pause_filter_thresh = 3547 svm->vmcb->control.pause_filter_thresh =
3568 nested_vmcb->control.pause_filter_thresh; 3548 nested_vmcb->control.pause_filter_thresh;
3569 3549
3570 nested_svm_unmap(page); 3550 kvm_vcpu_unmap(&svm->vcpu, map, true);
3571 3551
3572 /* Enter Guest-Mode */ 3552 /* Enter Guest-Mode */
3573 enter_guest_mode(&svm->vcpu); 3553 enter_guest_mode(&svm->vcpu);
@@ -3587,17 +3567,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3587 3567
3588static bool nested_svm_vmrun(struct vcpu_svm *svm) 3568static bool nested_svm_vmrun(struct vcpu_svm *svm)
3589{ 3569{
3570 int rc;
3590 struct vmcb *nested_vmcb; 3571 struct vmcb *nested_vmcb;
3591 struct vmcb *hsave = svm->nested.hsave; 3572 struct vmcb *hsave = svm->nested.hsave;
3592 struct vmcb *vmcb = svm->vmcb; 3573 struct vmcb *vmcb = svm->vmcb;
3593 struct page *page; 3574 struct kvm_host_map map;
3594 u64 vmcb_gpa; 3575 u64 vmcb_gpa;
3595 3576
3596 vmcb_gpa = svm->vmcb->save.rax; 3577 vmcb_gpa = svm->vmcb->save.rax;
3597 3578
3598 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3579 rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
3599 if (!nested_vmcb) 3580 if (rc) {
3581 if (rc == -EINVAL)
3582 kvm_inject_gp(&svm->vcpu, 0);
3600 return false; 3583 return false;
3584 }
3585
3586 nested_vmcb = map.hva;
3601 3587
3602 if (!nested_vmcb_checks(nested_vmcb)) { 3588 if (!nested_vmcb_checks(nested_vmcb)) {
3603 nested_vmcb->control.exit_code = SVM_EXIT_ERR; 3589 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
@@ -3605,7 +3591,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
3605 nested_vmcb->control.exit_info_1 = 0; 3591 nested_vmcb->control.exit_info_1 = 0;
3606 nested_vmcb->control.exit_info_2 = 0; 3592 nested_vmcb->control.exit_info_2 = 0;
3607 3593
3608 nested_svm_unmap(page); 3594 kvm_vcpu_unmap(&svm->vcpu, &map, true);
3609 3595
3610 return false; 3596 return false;
3611 } 3597 }
@@ -3649,7 +3635,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
3649 3635
3650 copy_vmcb_control_area(hsave, vmcb); 3636 copy_vmcb_control_area(hsave, vmcb);
3651 3637
3652 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); 3638 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
3653 3639
3654 return true; 3640 return true;
3655} 3641}
@@ -3673,21 +3659,26 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
3673static int vmload_interception(struct vcpu_svm *svm) 3659static int vmload_interception(struct vcpu_svm *svm)
3674{ 3660{
3675 struct vmcb *nested_vmcb; 3661 struct vmcb *nested_vmcb;
3676 struct page *page; 3662 struct kvm_host_map map;
3677 int ret; 3663 int ret;
3678 3664
3679 if (nested_svm_check_permissions(svm)) 3665 if (nested_svm_check_permissions(svm))
3680 return 1; 3666 return 1;
3681 3667
3682 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3668 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
3683 if (!nested_vmcb) 3669 if (ret) {
3670 if (ret == -EINVAL)
3671 kvm_inject_gp(&svm->vcpu, 0);
3684 return 1; 3672 return 1;
3673 }
3674
3675 nested_vmcb = map.hva;
3685 3676
3686 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3677 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3687 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3678 ret = kvm_skip_emulated_instruction(&svm->vcpu);
3688 3679
3689 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 3680 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
3690 nested_svm_unmap(page); 3681 kvm_vcpu_unmap(&svm->vcpu, &map, true);
3691 3682
3692 return ret; 3683 return ret;
3693} 3684}
@@ -3695,21 +3686,26 @@ static int vmload_interception(struct vcpu_svm *svm)
3695static int vmsave_interception(struct vcpu_svm *svm) 3686static int vmsave_interception(struct vcpu_svm *svm)
3696{ 3687{
3697 struct vmcb *nested_vmcb; 3688 struct vmcb *nested_vmcb;
3698 struct page *page; 3689 struct kvm_host_map map;
3699 int ret; 3690 int ret;
3700 3691
3701 if (nested_svm_check_permissions(svm)) 3692 if (nested_svm_check_permissions(svm))
3702 return 1; 3693 return 1;
3703 3694
3704 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 3695 ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
3705 if (!nested_vmcb) 3696 if (ret) {
3697 if (ret == -EINVAL)
3698 kvm_inject_gp(&svm->vcpu, 0);
3706 return 1; 3699 return 1;
3700 }
3701
3702 nested_vmcb = map.hva;
3707 3703
3708 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3704 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3709 ret = kvm_skip_emulated_instruction(&svm->vcpu); 3705 ret = kvm_skip_emulated_instruction(&svm->vcpu);
3710 3706
3711 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 3707 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
3712 nested_svm_unmap(page); 3708 kvm_vcpu_unmap(&svm->vcpu, &map, true);
3713 3709
3714 return ret; 3710 return ret;
3715} 3711}
@@ -3791,11 +3787,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
3791{ 3787{
3792 struct kvm_vcpu *vcpu = &svm->vcpu; 3788 struct kvm_vcpu *vcpu = &svm->vcpu;
3793 3789
3794 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX), 3790 trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
3795 kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3791 kvm_rax_read(&svm->vcpu));
3796 3792
3797 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 3793 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
3798 kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3794 kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
3799 3795
3800 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3796 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3801 return kvm_skip_emulated_instruction(&svm->vcpu); 3797 return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3803,7 +3799,7 @@ static int invlpga_interception(struct vcpu_svm *svm)
3803 3799
3804static int skinit_interception(struct vcpu_svm *svm) 3800static int skinit_interception(struct vcpu_svm *svm)
3805{ 3801{
3806 trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); 3802 trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
3807 3803
3808 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 3804 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3809 return 1; 3805 return 1;
@@ -3817,7 +3813,7 @@ static int wbinvd_interception(struct vcpu_svm *svm)
3817static int xsetbv_interception(struct vcpu_svm *svm) 3813static int xsetbv_interception(struct vcpu_svm *svm)
3818{ 3814{
3819 u64 new_bv = kvm_read_edx_eax(&svm->vcpu); 3815 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3820 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 3816 u32 index = kvm_rcx_read(&svm->vcpu);
3821 3817
3822 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { 3818 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3823 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 3819 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
@@ -4213,7 +4209,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4213 4209
4214static int rdmsr_interception(struct vcpu_svm *svm) 4210static int rdmsr_interception(struct vcpu_svm *svm)
4215{ 4211{
4216 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 4212 u32 ecx = kvm_rcx_read(&svm->vcpu);
4217 struct msr_data msr_info; 4213 struct msr_data msr_info;
4218 4214
4219 msr_info.index = ecx; 4215 msr_info.index = ecx;
@@ -4225,10 +4221,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
4225 } else { 4221 } else {
4226 trace_kvm_msr_read(ecx, msr_info.data); 4222 trace_kvm_msr_read(ecx, msr_info.data);
4227 4223
4228 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, 4224 kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff);
4229 msr_info.data & 0xffffffff); 4225 kvm_rdx_write(&svm->vcpu, msr_info.data >> 32);
4230 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
4231 msr_info.data >> 32);
4232 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 4226 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
4233 return kvm_skip_emulated_instruction(&svm->vcpu); 4227 return kvm_skip_emulated_instruction(&svm->vcpu);
4234 } 4228 }
@@ -4422,7 +4416,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
4422static int wrmsr_interception(struct vcpu_svm *svm) 4416static int wrmsr_interception(struct vcpu_svm *svm)
4423{ 4417{
4424 struct msr_data msr; 4418 struct msr_data msr;
4425 u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); 4419 u32 ecx = kvm_rcx_read(&svm->vcpu);
4426 u64 data = kvm_read_edx_eax(&svm->vcpu); 4420 u64 data = kvm_read_edx_eax(&svm->vcpu);
4427 4421
4428 msr.data = data; 4422 msr.data = data;
@@ -6236,7 +6230,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
6236{ 6230{
6237 struct vcpu_svm *svm = to_svm(vcpu); 6231 struct vcpu_svm *svm = to_svm(vcpu);
6238 struct vmcb *nested_vmcb; 6232 struct vmcb *nested_vmcb;
6239 struct page *page; 6233 struct kvm_host_map map;
6240 u64 guest; 6234 u64 guest;
6241 u64 vmcb; 6235 u64 vmcb;
6242 6236
@@ -6244,10 +6238,10 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
6244 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); 6238 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
6245 6239
6246 if (guest) { 6240 if (guest) {
6247 nested_vmcb = nested_svm_map(svm, vmcb, &page); 6241 if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
6248 if (!nested_vmcb)
6249 return 1; 6242 return 1;
6250 enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); 6243 nested_vmcb = map.hva;
6244 enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
6251 } 6245 }
6252 return 0; 6246 return 0;
6253} 6247}
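
All of the svm.c hunks above move from the local nested_svm_map()/nested_svm_unmap() helpers to the generic kvm_vcpu_map()/kvm_vcpu_unmap() API, which also handles guest memory that is not backed by struct page. A hedged sketch of the resulting idiom (the wrapper function is illustrative, not part of the patch):

/* Sketch: map a guest-physical VMCB, modify it, and release the
 * mapping, marking the backing page dirty on unmap. */
static int touch_guest_vmcb(struct kvm_vcpu *vcpu, gpa_t vmcb_gpa)
{
	struct kvm_host_map map;
	struct vmcb *nested_vmcb;
	int rc;

	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb_gpa), &map);
	if (rc) {
		if (rc == -EINVAL)	/* bad address: #GP, as in the hunks */
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;
	nested_vmcb->control.exit_code = SVM_EXIT_ERR;	/* example access */

	kvm_vcpu_unmap(vcpu, &map, true);	/* true: mark the page dirty */
	return 0;
}
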
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 854e144131c6..d6664ee3d127 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -2,6 +2,8 @@
2#ifndef __KVM_X86_VMX_CAPS_H 2#ifndef __KVM_X86_VMX_CAPS_H
3#define __KVM_X86_VMX_CAPS_H 3#define __KVM_X86_VMX_CAPS_H
4 4
5#include <asm/vmx.h>
6
5#include "lapic.h" 7#include "lapic.h"
6 8
7extern bool __read_mostly enable_vpid; 9extern bool __read_mostly enable_vpid;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c601d079cd2..f1a69117ac0f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
193 if (!vmx->nested.hv_evmcs) 193 if (!vmx->nested.hv_evmcs)
194 return; 194 return;
195 195
196 kunmap(vmx->nested.hv_evmcs_page); 196 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
197 kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
198 vmx->nested.hv_evmcs_vmptr = -1ull; 197 vmx->nested.hv_evmcs_vmptr = -1ull;
199 vmx->nested.hv_evmcs_page = NULL;
200 vmx->nested.hv_evmcs = NULL; 198 vmx->nested.hv_evmcs = NULL;
201} 199}
202 200
@@ -229,16 +227,9 @@ static void free_nested(struct kvm_vcpu *vcpu)
229 kvm_release_page_dirty(vmx->nested.apic_access_page); 227 kvm_release_page_dirty(vmx->nested.apic_access_page);
230 vmx->nested.apic_access_page = NULL; 228 vmx->nested.apic_access_page = NULL;
231 } 229 }
232 if (vmx->nested.virtual_apic_page) { 230 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
233 kvm_release_page_dirty(vmx->nested.virtual_apic_page); 231 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
234 vmx->nested.virtual_apic_page = NULL; 232 vmx->nested.pi_desc = NULL;
235 }
236 if (vmx->nested.pi_desc_page) {
237 kunmap(vmx->nested.pi_desc_page);
238 kvm_release_page_dirty(vmx->nested.pi_desc_page);
239 vmx->nested.pi_desc_page = NULL;
240 vmx->nested.pi_desc = NULL;
241 }
242 233
243 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 234 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
244 235
@@ -519,39 +510,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
519 struct vmcs12 *vmcs12) 510 struct vmcs12 *vmcs12)
520{ 511{
521 int msr; 512 int msr;
522 struct page *page;
523 unsigned long *msr_bitmap_l1; 513 unsigned long *msr_bitmap_l1;
524 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; 514 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
525 /* 515 struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
526 * pred_cmd & spec_ctrl are trying to verify two things:
527 *
528 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
529 * ensures that we do not accidentally generate an L02 MSR bitmap
530 * from the L12 MSR bitmap that is too permissive.
531 * 2. That L1 or L2s have actually used the MSR. This avoids
532 * unnecessarily merging of the bitmap if the MSR is unused. This
533 * works properly because we only update the L01 MSR bitmap lazily.
534 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
535 * updated to reflect this when L1 (or its L2s) actually write to
536 * the MSR.
537 */
538 bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
539 bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
540 516
541 /* Nothing to do if the MSR bitmap is not in use. */ 517 /* Nothing to do if the MSR bitmap is not in use. */
542 if (!cpu_has_vmx_msr_bitmap() || 518 if (!cpu_has_vmx_msr_bitmap() ||
543 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 519 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
544 return false; 520 return false;
545 521
546 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 522 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
547 !pred_cmd && !spec_ctrl)
548 return false;
549
550 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
551 if (is_error_page(page))
552 return false; 523 return false;
553 524
554 msr_bitmap_l1 = (unsigned long *)kmap(page); 525 msr_bitmap_l1 = (unsigned long *)map->hva;
555 526
556 /* 527 /*
557 * To keep the control flow simple, pay eight 8-byte writes (sixteen 528 * To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -592,20 +563,42 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
592 } 563 }
593 } 564 }
594 565
595 if (spec_ctrl) 566 /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
567 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
568 MSR_FS_BASE, MSR_TYPE_RW);
569
570 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
571 MSR_GS_BASE, MSR_TYPE_RW);
572
573 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
574 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
575
576 /*
577 * Checking the L0->L1 bitmap is trying to verify two things:
578 *
579 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
580 * ensures that we do not accidentally generate an L02 MSR bitmap
581 * from the L12 MSR bitmap that is too permissive.
582 * 2. That L1 or L2s have actually used the MSR. This avoids
583 * unnecessarily merging of the bitmap if the MSR is unused. This
584 * works properly because we only update the L01 MSR bitmap lazily.
585 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
586 * updated to reflect this when L1 (or its L2s) actually write to
587 * the MSR.
588 */
589 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
596 nested_vmx_disable_intercept_for_msr( 590 nested_vmx_disable_intercept_for_msr(
597 msr_bitmap_l1, msr_bitmap_l0, 591 msr_bitmap_l1, msr_bitmap_l0,
598 MSR_IA32_SPEC_CTRL, 592 MSR_IA32_SPEC_CTRL,
599 MSR_TYPE_R | MSR_TYPE_W); 593 MSR_TYPE_R | MSR_TYPE_W);
600 594
601 if (pred_cmd) 595 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
602 nested_vmx_disable_intercept_for_msr( 596 nested_vmx_disable_intercept_for_msr(
603 msr_bitmap_l1, msr_bitmap_l0, 597 msr_bitmap_l1, msr_bitmap_l0,
604 MSR_IA32_PRED_CMD, 598 MSR_IA32_PRED_CMD,
605 MSR_TYPE_W); 599 MSR_TYPE_W);
606 600
607 kunmap(page); 601 kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
608 kvm_release_page_clean(page);
609 602
610 return true; 603 return true;
611} 604}
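
After the hunk above, the FS/GS/KERNEL_GS base intercepts are dropped from the merged L02 bitmap unconditionally, while SPEC_CTRL and PRED_CMD stay conditional on what L0 already exposes to L1. A small illustrative wrapper (the helper name is invented) for that conditional case:

/* Sketch: merge an MSR intercept into the L02 bitmap only when L0
 * already lets L1 write the MSR directly; otherwise keep trapping. */
static void merge_msr_if_passed_through(struct kvm_vcpu *vcpu,
					unsigned long *msr_bitmap_l1,
					unsigned long *msr_bitmap_l0,
					u32 msr, int type)
{
	if (msr_write_intercepted_l01(vcpu, msr))
		return;		/* L0 intercepts it for L1: leave the trap */

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     msr, type);
}
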
@@ -613,20 +606,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
613static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 606static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
614 struct vmcs12 *vmcs12) 607 struct vmcs12 *vmcs12)
615{ 608{
609 struct kvm_host_map map;
616 struct vmcs12 *shadow; 610 struct vmcs12 *shadow;
617 struct page *page;
618 611
619 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 612 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
620 vmcs12->vmcs_link_pointer == -1ull) 613 vmcs12->vmcs_link_pointer == -1ull)
621 return; 614 return;
622 615
623 shadow = get_shadow_vmcs12(vcpu); 616 shadow = get_shadow_vmcs12(vcpu);
624 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
625 617
626 memcpy(shadow, kmap(page), VMCS12_SIZE); 618 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
619 return;
627 620
628 kunmap(page); 621 memcpy(shadow, map.hva, VMCS12_SIZE);
629 kvm_release_page_clean(page); 622 kvm_vcpu_unmap(vcpu, &map, false);
630} 623}
631 624
632static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 625static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -930,7 +923,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
930 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { 923 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
931 if (!nested_cr3_valid(vcpu, cr3)) { 924 if (!nested_cr3_valid(vcpu, cr3)) {
932 *entry_failure_code = ENTRY_FAIL_DEFAULT; 925 *entry_failure_code = ENTRY_FAIL_DEFAULT;
933 return 1; 926 return -EINVAL;
934 } 927 }
935 928
936 /* 929 /*
@@ -941,7 +934,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
941 !nested_ept) { 934 !nested_ept) {
942 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { 935 if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
943 *entry_failure_code = ENTRY_FAIL_PDPTE; 936 *entry_failure_code = ENTRY_FAIL_PDPTE;
944 return 1; 937 return -EINVAL;
945 } 938 }
946 } 939 }
947 } 940 }
@@ -1794,13 +1787,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1794 1787
1795 nested_release_evmcs(vcpu); 1788 nested_release_evmcs(vcpu);
1796 1789
1797 vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( 1790 if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
1798 vcpu, assist_page.current_nested_vmcs); 1791 &vmx->nested.hv_evmcs_map))
1799
1800 if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
1801 return 0; 1792 return 0;
1802 1793
1803 vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); 1794 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1804 1795
1805 /* 1796 /*
1806 * Currently, KVM only supports eVMCS version 1 1797 * Currently, KVM only supports eVMCS version 1
@@ -2373,19 +2364,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2373 */ 2364 */
2374 if (vmx->emulation_required) { 2365 if (vmx->emulation_required) {
2375 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2366 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2376 return 1; 2367 return -EINVAL;
2377 } 2368 }
2378 2369
2379 /* Shadow page tables on either EPT or shadow page tables. */ 2370 /* Shadow page tables on either EPT or shadow page tables. */
2380 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2371 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2381 entry_failure_code)) 2372 entry_failure_code))
2382 return 1; 2373 return -EINVAL;
2383 2374
2384 if (!enable_ept) 2375 if (!enable_ept)
2385 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2376 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2386 2377
2387 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 2378 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2388 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 2379 kvm_rip_write(vcpu, vmcs12->guest_rip);
2389 return 0; 2380 return 0;
2390} 2381}
2391 2382
@@ -2589,11 +2580,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2589 return 0; 2580 return 0;
2590} 2581}
2591 2582
2592/* 2583static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2593 * Checks related to Host Control Registers and MSRs 2584 struct vmcs12 *vmcs12)
2594 */ 2585{
2595static int nested_check_host_control_regs(struct kvm_vcpu *vcpu, 2586 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2596 struct vmcs12 *vmcs12) 2587 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2588 nested_check_vm_entry_controls(vcpu, vmcs12))
2589 return -EINVAL;
2590
2591 return 0;
2592}
2593
2594static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2595 struct vmcs12 *vmcs12)
2597{ 2596{
2598 bool ia32e; 2597 bool ia32e;
2599 2598
@@ -2606,6 +2605,10 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
2606 is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) 2605 is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
2607 return -EINVAL; 2606 return -EINVAL;
2608 2607
2608 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2609 !kvm_pat_valid(vmcs12->host_ia32_pat))
2610 return -EINVAL;
2611
2609 /* 2612 /*
2610 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2613 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2611 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2614 * IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2624,41 +2627,12 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
2624 return 0; 2627 return 0;
2625} 2628}
2626 2629
2627/*
2628 * Checks related to Guest Non-register State
2629 */
2630static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2631{
2632 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2633 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2634 return -EINVAL;
2635
2636 return 0;
2637}
2638
2639static int nested_vmx_check_vmentry_prereqs(struct kvm_vcpu *vcpu,
2640 struct vmcs12 *vmcs12)
2641{
2642 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2643 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2644 nested_check_vm_entry_controls(vcpu, vmcs12))
2645 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
2646
2647 if (nested_check_host_control_regs(vcpu, vmcs12))
2648 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
2649
2650 if (nested_check_guest_non_reg_state(vmcs12))
2651 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
2652
2653 return 0;
2654}
2655
2656static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2630static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2657 struct vmcs12 *vmcs12) 2631 struct vmcs12 *vmcs12)
2658{ 2632{
2659 int r; 2633 int r = 0;
2660 struct page *page;
2661 struct vmcs12 *shadow; 2634 struct vmcs12 *shadow;
2635 struct kvm_host_map map;
2662 2636
2663 if (vmcs12->vmcs_link_pointer == -1ull) 2637 if (vmcs12->vmcs_link_pointer == -1ull)
2664 return 0; 2638 return 0;
@@ -2666,23 +2640,34 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2666 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) 2640 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
2667 return -EINVAL; 2641 return -EINVAL;
2668 2642
2669 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); 2643 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
2670 if (is_error_page(page))
2671 return -EINVAL; 2644 return -EINVAL;
2672 2645
2673 r = 0; 2646 shadow = map.hva;
2674 shadow = kmap(page); 2647
2675 if (shadow->hdr.revision_id != VMCS12_REVISION || 2648 if (shadow->hdr.revision_id != VMCS12_REVISION ||
2676 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) 2649 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
2677 r = -EINVAL; 2650 r = -EINVAL;
2678 kunmap(page); 2651
2679 kvm_release_page_clean(page); 2652 kvm_vcpu_unmap(vcpu, &map, false);
2680 return r; 2653 return r;
2681} 2654}
2682 2655
2683static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu, 2656/*
2684 struct vmcs12 *vmcs12, 2657 * Checks related to Guest Non-register State
2685 u32 *exit_qual) 2658 */
2659static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2660{
2661 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2662 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2663 return -EINVAL;
2664
2665 return 0;
2666}
2667
2668static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2669 struct vmcs12 *vmcs12,
2670 u32 *exit_qual)
2686{ 2671{
2687 bool ia32e; 2672 bool ia32e;
2688 2673
@@ -2690,11 +2675,15 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
2690 2675
2691 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || 2676 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
2692 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) 2677 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
2693 return 1; 2678 return -EINVAL;
2679
2680 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2681 !kvm_pat_valid(vmcs12->guest_ia32_pat))
2682 return -EINVAL;
2694 2683
2695 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2684 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2696 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2685 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2697 return 1; 2686 return -EINVAL;
2698 } 2687 }
2699 2688
2700 /* 2689 /*
@@ -2713,13 +2702,16 @@ static int nested_vmx_check_vmentry_postreqs(struct kvm_vcpu *vcpu,
2713 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 2702 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
2714 ((vmcs12->guest_cr0 & X86_CR0_PG) && 2703 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
2715 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) 2704 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
2716 return 1; 2705 return -EINVAL;
2717 } 2706 }
2718 2707
2719 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2708 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2720 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || 2709 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
2721 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) 2710 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
2722 return 1; 2711 return -EINVAL;
2712
2713 if (nested_check_guest_non_reg_state(vmcs12))
2714 return -EINVAL;
2723 2715
2724 return 0; 2716 return 0;
2725} 2717}
@@ -2832,6 +2824,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2832{ 2824{
2833 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2825 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2834 struct vcpu_vmx *vmx = to_vmx(vcpu); 2826 struct vcpu_vmx *vmx = to_vmx(vcpu);
2827 struct kvm_host_map *map;
2835 struct page *page; 2828 struct page *page;
2836 u64 hpa; 2829 u64 hpa;
2837 2830
@@ -2864,20 +2857,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2864 } 2857 }
2865 2858
2866 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2859 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2867 if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ 2860 map = &vmx->nested.virtual_apic_map;
2868 kvm_release_page_dirty(vmx->nested.virtual_apic_page);
2869 vmx->nested.virtual_apic_page = NULL;
2870 }
2871 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
2872 2861
2873 /* 2862 /*
2874 * If translation failed, VM entry will fail because 2863 * If translation failed, VM entry will fail because
2875 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. 2864 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
2876 */ 2865 */
2877 if (!is_error_page(page)) { 2866 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2878 vmx->nested.virtual_apic_page = page; 2867 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2879 hpa = page_to_phys(vmx->nested.virtual_apic_page);
2880 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
2881 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2868 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
2882 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 2869 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
2883 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2870 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -2898,26 +2885,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2898 } 2885 }
2899 2886
2900 if (nested_cpu_has_posted_intr(vmcs12)) { 2887 if (nested_cpu_has_posted_intr(vmcs12)) {
2901 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 2888 map = &vmx->nested.pi_desc_map;
2902 kunmap(vmx->nested.pi_desc_page); 2889
2903 kvm_release_page_dirty(vmx->nested.pi_desc_page); 2890 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
2904 vmx->nested.pi_desc_page = NULL; 2891 vmx->nested.pi_desc =
2905 vmx->nested.pi_desc = NULL; 2892 (struct pi_desc *)(((void *)map->hva) +
2906 vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull); 2893 offset_in_page(vmcs12->posted_intr_desc_addr));
2894 vmcs_write64(POSTED_INTR_DESC_ADDR,
2895 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
2907 } 2896 }
2908 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
2909 if (is_error_page(page))
2910 return;
2911 vmx->nested.pi_desc_page = page;
2912 vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
2913 vmx->nested.pi_desc =
2914 (struct pi_desc *)((void *)vmx->nested.pi_desc +
2915 (unsigned long)(vmcs12->posted_intr_desc_addr &
2916 (PAGE_SIZE - 1)));
2917 vmcs_write64(POSTED_INTR_DESC_ADDR,
2918 page_to_phys(vmx->nested.pi_desc_page) +
2919 (unsigned long)(vmcs12->posted_intr_desc_addr &
2920 (PAGE_SIZE - 1)));
2921 } 2897 }
2922 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 2898 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2923 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, 2899 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
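
In the nested_get_vmcs12_pages() hunks above, the virtual-APIC page and the posted-interrupt descriptor are reached through kvm_host_map instead of pinned struct pages, so the physical address written into the VMCS is derived from the mapped pfn. A hedged sketch of that address computation (the wrapper name is invented):

/* Sketch: host-physical address of a guest structure reached through a
 * kvm_host_map -- the mapped page's host PA plus the in-page offset. */
static u64 mapped_gpa_to_hpa(struct kvm_host_map *map, gpa_t gpa)
{
	return pfn_to_hpa(map->pfn) + offset_in_page(gpa);
}

The vmcs_write64(POSTED_INTR_DESC_ADDR, ...) line in the hunk is this computation applied to vmcs12->posted_intr_desc_addr.
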
@@ -3000,7 +2976,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
3000 return -1; 2976 return -1;
3001 } 2977 }
3002 2978
3003 if (nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) 2979 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3004 goto vmentry_fail_vmexit; 2980 goto vmentry_fail_vmexit;
3005 } 2981 }
3006 2982
@@ -3145,9 +3121,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3145 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3121 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3146 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3122 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3147 3123
3148 ret = nested_vmx_check_vmentry_prereqs(vcpu, vmcs12); 3124 if (nested_vmx_check_controls(vcpu, vmcs12))
3149 if (ret) 3125 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3150 return nested_vmx_failValid(vcpu, ret); 3126
3127 if (nested_vmx_check_host_state(vcpu, vmcs12))
3128 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3151 3129
3152 /* 3130 /*
3153 * We're finally done with prerequisite checking, and can start with 3131 * We're finally done with prerequisite checking, and can start with
@@ -3310,11 +3288,12 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3310 3288
3311 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3289 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3312 if (max_irr != 256) { 3290 if (max_irr != 256) {
3313 vapic_page = kmap(vmx->nested.virtual_apic_page); 3291 vapic_page = vmx->nested.virtual_apic_map.hva;
3292 if (!vapic_page)
3293 return;
3294
3314 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3295 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3315 vapic_page, &max_irr); 3296 vapic_page, &max_irr);
3316 kunmap(vmx->nested.virtual_apic_page);
3317
3318 status = vmcs_read16(GUEST_INTR_STATUS); 3297 status = vmcs_read16(GUEST_INTR_STATUS);
3319 if ((u8)max_irr > ((u8)status & 0xff)) { 3298 if ((u8)max_irr > ((u8)status & 0xff)) {
3320 status &= ~0xff; 3299 status &= ~0xff;
@@ -3425,8 +3404,8 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3425 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3404 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3426 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3405 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3427 3406
3428 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 3407 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3429 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); 3408 vmcs12->guest_rip = kvm_rip_read(vcpu);
3430 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3409 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3431 3410
3432 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3411 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
@@ -3609,8 +3588,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3609 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 3588 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3610 vmx_set_efer(vcpu, vcpu->arch.efer); 3589 vmx_set_efer(vcpu, vcpu->arch.efer);
3611 3590
3612 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 3591 kvm_rsp_write(vcpu, vmcs12->host_rsp);
3613 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 3592 kvm_rip_write(vcpu, vmcs12->host_rip);
3614 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 3593 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3615 vmx_set_interrupt_shadow(vcpu, 0); 3594 vmx_set_interrupt_shadow(vcpu, 0);
3616 3595
@@ -3955,16 +3934,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3955 kvm_release_page_dirty(vmx->nested.apic_access_page); 3934 kvm_release_page_dirty(vmx->nested.apic_access_page);
3956 vmx->nested.apic_access_page = NULL; 3935 vmx->nested.apic_access_page = NULL;
3957 } 3936 }
3958 if (vmx->nested.virtual_apic_page) { 3937 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
3959 kvm_release_page_dirty(vmx->nested.virtual_apic_page); 3938 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
3960 vmx->nested.virtual_apic_page = NULL; 3939 vmx->nested.pi_desc = NULL;
3961 }
3962 if (vmx->nested.pi_desc_page) {
3963 kunmap(vmx->nested.pi_desc_page);
3964 kvm_release_page_dirty(vmx->nested.pi_desc_page);
3965 vmx->nested.pi_desc_page = NULL;
3966 vmx->nested.pi_desc = NULL;
3967 }
3968 3940
3969 /* 3941 /*
3970 * We are now running in L2, mmu_notifier will force to reload the 3942 * We are now running in L2, mmu_notifier will force to reload the
@@ -4260,7 +4232,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
4260{ 4232{
4261 int ret; 4233 int ret;
4262 gpa_t vmptr; 4234 gpa_t vmptr;
4263 struct page *page; 4235 uint32_t revision;
4264 struct vcpu_vmx *vmx = to_vmx(vcpu); 4236 struct vcpu_vmx *vmx = to_vmx(vcpu);
4265 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 4237 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4266 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 4238 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -4306,20 +4278,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
4306 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4278 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4307 * which replaces physical address width with 32 4279 * which replaces physical address width with 32
4308 */ 4280 */
4309 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) 4281 if (!page_address_valid(vcpu, vmptr))
4310 return nested_vmx_failInvalid(vcpu);
4311
4312 page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
4313 if (is_error_page(page))
4314 return nested_vmx_failInvalid(vcpu); 4282 return nested_vmx_failInvalid(vcpu);
4315 4283
4316 if (*(u32 *)kmap(page) != VMCS12_REVISION) { 4284 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4317 kunmap(page); 4285 revision != VMCS12_REVISION)
4318 kvm_release_page_clean(page);
4319 return nested_vmx_failInvalid(vcpu); 4286 return nested_vmx_failInvalid(vcpu);
4320 }
4321 kunmap(page);
4322 kvm_release_page_clean(page);
4323 4287
4324 vmx->nested.vmxon_ptr = vmptr; 4288 vmx->nested.vmxon_ptr = vmptr;
4325 ret = enter_vmx_operation(vcpu); 4289 ret = enter_vmx_operation(vcpu);
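
The handle_vmon() hunk above no longer maps the whole VMXON region just to check its revision identifier; it reads the first four bytes with kvm_read_guest(). A minimal sketch of that check (the helper name is invented):

/* Sketch: the VMXON region must start with the supported VMCS
 * revision identifier; anything else fails the VMXON. */
static bool vmxon_region_revision_ok(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
	uint32_t revision;

	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)))
		return false;		/* unbacked or unreadable memory */

	return revision == VMCS12_REVISION;
}
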
@@ -4377,7 +4341,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
4377 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4341 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4378 return 1; 4342 return 1;
4379 4343
4380 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) 4344 if (!page_address_valid(vcpu, vmptr))
4381 return nested_vmx_failValid(vcpu, 4345 return nested_vmx_failValid(vcpu,
4382 VMXERR_VMCLEAR_INVALID_ADDRESS); 4346 VMXERR_VMCLEAR_INVALID_ADDRESS);
4383 4347
@@ -4385,7 +4349,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
4385 return nested_vmx_failValid(vcpu, 4349 return nested_vmx_failValid(vcpu,
4386 VMXERR_VMCLEAR_VMXON_POINTER); 4350 VMXERR_VMCLEAR_VMXON_POINTER);
4387 4351
4388 if (vmx->nested.hv_evmcs_page) { 4352 if (vmx->nested.hv_evmcs_map.hva) {
4389 if (vmptr == vmx->nested.hv_evmcs_vmptr) 4353 if (vmptr == vmx->nested.hv_evmcs_vmptr)
4390 nested_release_evmcs(vcpu); 4354 nested_release_evmcs(vcpu);
4391 } else { 4355 } else {
@@ -4584,7 +4548,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
4584 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4548 if (nested_vmx_get_vmptr(vcpu, &vmptr))
4585 return 1; 4549 return 1;
4586 4550
4587 if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) 4551 if (!page_address_valid(vcpu, vmptr))
4588 return nested_vmx_failValid(vcpu, 4552 return nested_vmx_failValid(vcpu,
4589 VMXERR_VMPTRLD_INVALID_ADDRESS); 4553 VMXERR_VMPTRLD_INVALID_ADDRESS);
4590 4554
@@ -4597,11 +4561,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
4597 return 1; 4561 return 1;
4598 4562
4599 if (vmx->nested.current_vmptr != vmptr) { 4563 if (vmx->nested.current_vmptr != vmptr) {
4564 struct kvm_host_map map;
4600 struct vmcs12 *new_vmcs12; 4565 struct vmcs12 *new_vmcs12;
4601 struct page *page;
4602 4566
4603 page = kvm_vcpu_gpa_to_page(vcpu, vmptr); 4567 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4604 if (is_error_page(page)) {
4605 /* 4568 /*
4606 * Reads from an unbacked page return all 1s, 4569 * Reads from an unbacked page return all 1s,
4607 * which means that the 32 bits located at the 4570 * which means that the 32 bits located at the
@@ -4611,12 +4574,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
4611 return nested_vmx_failValid(vcpu, 4574 return nested_vmx_failValid(vcpu,
4612 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4575 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4613 } 4576 }
4614 new_vmcs12 = kmap(page); 4577
4578 new_vmcs12 = map.hva;
4579
4615 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 4580 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4616 (new_vmcs12->hdr.shadow_vmcs && 4581 (new_vmcs12->hdr.shadow_vmcs &&
4617 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 4582 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4618 kunmap(page); 4583 kvm_vcpu_unmap(vcpu, &map, false);
4619 kvm_release_page_clean(page);
4620 return nested_vmx_failValid(vcpu, 4584 return nested_vmx_failValid(vcpu,
4621 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4585 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4622 } 4586 }
@@ -4628,8 +4592,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
4628 * cached. 4592 * cached.
4629 */ 4593 */
4630 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 4594 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4631 kunmap(page); 4595 kvm_vcpu_unmap(vcpu, &map, false);
4632 kvm_release_page_clean(page);
4633 4596
4634 set_current_vmptr(vmx, vmptr); 4597 set_current_vmptr(vmx, vmptr);
4635 } 4598 }
@@ -4804,7 +4767,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
4804static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 4767static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
4805 struct vmcs12 *vmcs12) 4768 struct vmcs12 *vmcs12)
4806{ 4769{
4807 u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; 4770 u32 index = kvm_rcx_read(vcpu);
4808 u64 address; 4771 u64 address;
4809 bool accessed_dirty; 4772 bool accessed_dirty;
4810 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 4773 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
@@ -4850,7 +4813,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
4850{ 4813{
4851 struct vcpu_vmx *vmx = to_vmx(vcpu); 4814 struct vcpu_vmx *vmx = to_vmx(vcpu);
4852 struct vmcs12 *vmcs12; 4815 struct vmcs12 *vmcs12;
4853 u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; 4816 u32 function = kvm_rax_read(vcpu);
4854 4817
4855 /* 4818 /*
4856 * VMFUNC is only supported for nested guests, but we always enable the 4819 * VMFUNC is only supported for nested guests, but we always enable the
@@ -4936,7 +4899,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
4936static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 4899static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
4937 struct vmcs12 *vmcs12, u32 exit_reason) 4900 struct vmcs12 *vmcs12, u32 exit_reason)
4938{ 4901{
4939 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 4902 u32 msr_index = kvm_rcx_read(vcpu);
4940 gpa_t bitmap; 4903 gpa_t bitmap;
4941 4904
4942 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 4905 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
@@ -5373,9 +5336,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5373 if (kvm_state->format != 0) 5336 if (kvm_state->format != 0)
5374 return -EINVAL; 5337 return -EINVAL;
5375 5338
5376 if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
5377 nested_enable_evmcs(vcpu, NULL);
5378
5379 if (!nested_vmx_allowed(vcpu)) 5339 if (!nested_vmx_allowed(vcpu))
5380 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; 5340 return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
5381 5341
@@ -5417,6 +5377,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5417 if (kvm_state->vmx.vmxon_pa == -1ull) 5377 if (kvm_state->vmx.vmxon_pa == -1ull)
5418 return 0; 5378 return 0;
5419 5379
5380 if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
5381 nested_enable_evmcs(vcpu, NULL);
5382
5420 vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; 5383 vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
5421 ret = enter_vmx_operation(vcpu); 5384 ret = enter_vmx_operation(vcpu);
5422 if (ret) 5385 if (ret)
@@ -5460,9 +5423,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5460 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5423 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5461 return 0; 5424 return 0;
5462 5425
5463 vmx->nested.nested_run_pending =
5464 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5465
5466 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5426 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5467 vmcs12->vmcs_link_pointer != -1ull) { 5427 vmcs12->vmcs_link_pointer != -1ull) {
5468 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 5428 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
@@ -5480,14 +5440,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5480 return -EINVAL; 5440 return -EINVAL;
5481 } 5441 }
5482 5442
5483 if (nested_vmx_check_vmentry_prereqs(vcpu, vmcs12) || 5443 if (nested_vmx_check_controls(vcpu, vmcs12) ||
5484 nested_vmx_check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) 5444 nested_vmx_check_host_state(vcpu, vmcs12) ||
5445 nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5485 return -EINVAL; 5446 return -EINVAL;
5486 5447
5487 vmx->nested.dirty_vmcs12 = true; 5448 vmx->nested.dirty_vmcs12 = true;
5449 vmx->nested.nested_run_pending =
5450 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5451
5488 ret = nested_vmx_enter_non_root_mode(vcpu, false); 5452 ret = nested_vmx_enter_non_root_mode(vcpu, false);
5489 if (ret) 5453 if (ret) {
5454 vmx->nested.nested_run_pending = 0;
5490 return -EINVAL; 5455 return -EINVAL;
5456 }
5491 5457
5492 return 0; 5458 return 0;
5493} 5459}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a364348e..f8502c376b37 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -227,7 +227,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
227 } 227 }
228 break; 228 break;
229 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 229 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
230 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { 230 if (!(data & pmu->global_ovf_ctrl_mask)) {
231 if (!msr_info->host_initiated) 231 if (!msr_info->host_initiated)
232 pmu->global_status &= ~data; 232 pmu->global_status &= ~data;
233 pmu->global_ovf_ctrl = data; 233 pmu->global_ovf_ctrl = data;
@@ -297,6 +297,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
297 pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | 297 pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
298 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); 298 (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
299 pmu->global_ctrl_mask = ~pmu->global_ctrl; 299 pmu->global_ctrl_mask = ~pmu->global_ctrl;
300 pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
301 & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
302 MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
303 if (kvm_x86_ops->pt_supported())
304 pmu->global_ovf_ctrl_mask &=
305 ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
300 306
301 entry = kvm_find_cpuid_entry(vcpu, 7, 0); 307 entry = kvm_find_cpuid_entry(vcpu, 7, 0);
302 if (entry && 308 if (entry &&
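
The pmu_intel.c hunks above precompute global_ovf_ctrl_mask, the set of bits a guest may not set in IA32_PERF_GLOBAL_OVF_CTRL for its PMU configuration (the Intel PT TOPA-PMI bit is carved out only when PT is exposed). A hedged sketch of how a write is then validated (the helper name is invented):

/* Sketch: accept a guest write only if it touches no reserved bit,
 * then let it acknowledge (clear) the matching overflow status bits. */
static int write_global_ovf_ctrl(struct kvm_pmu *pmu, u64 data,
				 bool host_initiated)
{
	if (data & pmu->global_ovf_ctrl_mask)
		return 1;			/* reserved bit set: #GP */

	if (!host_initiated)
		pmu->global_status &= ~data;	/* ack overflow bits */
	pmu->global_ovf_ctrl = data;
	return 0;
}
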
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e1fa935a545f..1ac167614032 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1692,6 +1692,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1692 case MSR_IA32_SYSENTER_ESP: 1692 case MSR_IA32_SYSENTER_ESP:
1693 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 1693 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
1694 break; 1694 break;
1695 case MSR_IA32_POWER_CTL:
1696 msr_info->data = vmx->msr_ia32_power_ctl;
1697 break;
1695 case MSR_IA32_BNDCFGS: 1698 case MSR_IA32_BNDCFGS:
1696 if (!kvm_mpx_supported() || 1699 if (!kvm_mpx_supported() ||
1697 (!msr_info->host_initiated && 1700 (!msr_info->host_initiated &&
@@ -1822,6 +1825,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1822 case MSR_IA32_SYSENTER_ESP: 1825 case MSR_IA32_SYSENTER_ESP:
1823 vmcs_writel(GUEST_SYSENTER_ESP, data); 1826 vmcs_writel(GUEST_SYSENTER_ESP, data);
1824 break; 1827 break;
1828 case MSR_IA32_POWER_CTL:
1829 vmx->msr_ia32_power_ctl = data;
1830 break;
1825 case MSR_IA32_BNDCFGS: 1831 case MSR_IA32_BNDCFGS:
1826 if (!kvm_mpx_supported() || 1832 if (!kvm_mpx_supported() ||
1827 (!msr_info->host_initiated && 1833 (!msr_info->host_initiated &&
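
The two vmx.c hunks above emulate MSR_IA32_POWER_CTL purely in software: the write is stored in a per-vCPU field and the read returns it, without touching the host MSR. A trivial sketch of that pattern (the wrapper names are invented):

/* Sketch: scratch-value emulation of MSR_IA32_POWER_CTL. */
static u64 power_ctl_get(struct vcpu_vmx *vmx)
{
	return vmx->msr_ia32_power_ctl;	/* last guest-written value */
}

static void power_ctl_set(struct vcpu_vmx *vmx, u64 data)
{
	vmx->msr_ia32_power_ctl = data;	/* never touches the real MSR */
}
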
@@ -1891,7 +1897,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1891 break; 1897 break;
1892 case MSR_IA32_CR_PAT: 1898 case MSR_IA32_CR_PAT:
1893 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1899 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1894 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 1900 if (!kvm_pat_valid(data))
1895 return 1; 1901 return 1;
1896 vmcs_write64(GUEST_IA32_PAT, data); 1902 vmcs_write64(GUEST_IA32_PAT, data);
1897 vcpu->arch.pat = data; 1903 vcpu->arch.pat = data;
@@ -2288,7 +2294,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2288 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2294 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2289#endif 2295#endif
2290 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 2296 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
2291 VM_EXIT_SAVE_IA32_PAT |
2292 VM_EXIT_LOAD_IA32_PAT | 2297 VM_EXIT_LOAD_IA32_PAT |
2293 VM_EXIT_LOAD_IA32_EFER | 2298 VM_EXIT_LOAD_IA32_EFER |
2294 VM_EXIT_CLEAR_BNDCFGS | 2299 VM_EXIT_CLEAR_BNDCFGS |
@@ -3619,14 +3624,13 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3619 3624
3620 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 3625 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
3621 !nested_cpu_has_vid(get_vmcs12(vcpu)) || 3626 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
3622 WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) 3627 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
3623 return false; 3628 return false;
3624 3629
3625 rvi = vmx_get_rvi(); 3630 rvi = vmx_get_rvi();
3626 3631
3627 vapic_page = kmap(vmx->nested.virtual_apic_page); 3632 vapic_page = vmx->nested.virtual_apic_map.hva;
3628 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 3633 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
3629 kunmap(vmx->nested.virtual_apic_page);
3630 3634
3631 return ((rvi & 0xf0) > (vppr & 0xf0)); 3635 return ((rvi & 0xf0) > (vppr & 0xf0));
3632} 3636}
@@ -4827,7 +4831,7 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
4827 4831
4828static int handle_rdmsr(struct kvm_vcpu *vcpu) 4832static int handle_rdmsr(struct kvm_vcpu *vcpu)
4829{ 4833{
4830 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4834 u32 ecx = kvm_rcx_read(vcpu);
4831 struct msr_data msr_info; 4835 struct msr_data msr_info;
4832 4836
4833 msr_info.index = ecx; 4837 msr_info.index = ecx;
@@ -4840,18 +4844,16 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
4840 4844
4841 trace_kvm_msr_read(ecx, msr_info.data); 4845 trace_kvm_msr_read(ecx, msr_info.data);
4842 4846
4843 /* FIXME: handling of bits 32:63 of rax, rdx */ 4847 kvm_rax_write(vcpu, msr_info.data & -1u);
4844 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; 4848 kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u);
4845 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
4846 return kvm_skip_emulated_instruction(vcpu); 4849 return kvm_skip_emulated_instruction(vcpu);
4847} 4850}
4848 4851
4849static int handle_wrmsr(struct kvm_vcpu *vcpu) 4852static int handle_wrmsr(struct kvm_vcpu *vcpu)
4850{ 4853{
4851 struct msr_data msr; 4854 struct msr_data msr;
4852 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4855 u32 ecx = kvm_rcx_read(vcpu);
4853 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4856 u64 data = kvm_read_edx_eax(vcpu);
4854 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4855 4857
4856 msr.data = data; 4858 msr.data = data;
4857 msr.index = ecx; 4859 msr.index = ecx;
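
The handle_rdmsr()/handle_wrmsr() hunks above drop the direct vcpu->arch.regs[] accesses in favour of the dedicated GPR accessors introduced by this series. A hedged sketch of the EDX:EAX split and reassembly those accessors imply (the wrapper names are invented):

/* Sketch: a 64-bit MSR value travels through EDX:EAX on RDMSR/WRMSR. */
static void msr_result_to_gprs(struct kvm_vcpu *vcpu, u64 data)
{
	kvm_rax_write(vcpu, data & -1u);		/* low 32 bits to EAX */
	kvm_rdx_write(vcpu, (data >> 32) & -1u);	/* high 32 bits to EDX */
}

static u64 msr_operand_from_gprs(struct kvm_vcpu *vcpu)
{
	return kvm_read_edx_eax(vcpu);			/* EDX:EAX reassembled */
}
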
@@ -4922,7 +4924,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
4922static int handle_xsetbv(struct kvm_vcpu *vcpu) 4924static int handle_xsetbv(struct kvm_vcpu *vcpu)
4923{ 4925{
4924 u64 new_bv = kvm_read_edx_eax(vcpu); 4926 u64 new_bv = kvm_read_edx_eax(vcpu);
4925 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 4927 u32 index = kvm_rcx_read(vcpu);
4926 4928
4927 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 4929 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
4928 return kvm_skip_emulated_instruction(vcpu); 4930 return kvm_skip_emulated_instruction(vcpu);
@@ -5723,8 +5725,16 @@ void dump_vmcs(void)
5723 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 5725 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
5724 pr_err("TSC Multiplier = 0x%016llx\n", 5726 pr_err("TSC Multiplier = 0x%016llx\n",
5725 vmcs_read64(TSC_MULTIPLIER)); 5727 vmcs_read64(TSC_MULTIPLIER));
5726 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) 5728 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
5727 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 5729 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
5730 u16 status = vmcs_read16(GUEST_INTR_STATUS);
5731 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
5732 }
5733 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
5734 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
5735 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
5736 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
5737 }
5728 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 5738 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
5729 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 5739 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
5730 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 5740 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
@@ -6856,30 +6866,6 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
6856 } 6866 }
6857} 6867}
6858 6868
6859static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu)
6860{
6861 struct kvm_cpuid_entry2 *entry;
6862 union cpuid10_eax eax;
6863
6864 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
6865 if (!entry)
6866 return false;
6867
6868 eax.full = entry->eax;
6869 return (eax.split.version_id > 0);
6870}
6871
6872static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu)
6873{
6874 struct vcpu_vmx *vmx = to_vmx(vcpu);
6875 bool pmu_enabled = guest_cpuid_has_pmu(vcpu);
6876
6877 if (pmu_enabled)
6878 vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING;
6879 else
6880 vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING;
6881}
6882
6883static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 6869static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
6884{ 6870{
6885 struct vcpu_vmx *vmx = to_vmx(vcpu); 6871 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6968,7 +6954,6 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
6968 if (nested_vmx_allowed(vcpu)) { 6954 if (nested_vmx_allowed(vcpu)) {
6969 nested_vmx_cr_fixed1_bits_update(vcpu); 6955 nested_vmx_cr_fixed1_bits_update(vcpu);
6970 nested_vmx_entry_exit_ctls_update(vcpu); 6956 nested_vmx_entry_exit_ctls_update(vcpu);
6971 nested_vmx_procbased_ctls_update(vcpu);
6972 } 6957 }
6973 6958
6974 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 6959 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@ -7028,7 +7013,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7028 return 0; 7013 return 0;
7029} 7014}
7030 7015
7031static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) 7016static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7017 bool *expired)
7032{ 7018{
7033 struct vcpu_vmx *vmx; 7019 struct vcpu_vmx *vmx;
7034 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 7020 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -7051,10 +7037,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
7051 7037
7052 /* Convert to host delta tsc if tsc scaling is enabled */ 7038 /* Convert to host delta tsc if tsc scaling is enabled */
7053 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && 7039 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
7054 u64_shl_div_u64(delta_tsc, 7040 delta_tsc && u64_shl_div_u64(delta_tsc,
7055 kvm_tsc_scaling_ratio_frac_bits, 7041 kvm_tsc_scaling_ratio_frac_bits,
7056 vcpu->arch.tsc_scaling_ratio, 7042 vcpu->arch.tsc_scaling_ratio, &delta_tsc))
7057 &delta_tsc))
7058 return -ERANGE; 7043 return -ERANGE;
7059 7044
7060 /* 7045 /*
@@ -7067,7 +7052,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
7067 return -ERANGE; 7052 return -ERANGE;
7068 7053
7069 vmx->hv_deadline_tsc = tscl + delta_tsc; 7054 vmx->hv_deadline_tsc = tscl + delta_tsc;
7070 return delta_tsc == 0; 7055 *expired = !delta_tsc;
7056 return 0;
7071} 7057}
7072 7058
7073static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 7059static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
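
vmx_set_hv_timer() no longer overloads its return value: instead of returning "delta_tsc == 0", it reports errors through the return code and signals an already-expired deadline via the new *expired out-parameter. A hedged standalone sketch of that calling convention (hypothetical helper, simplified range check, illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Old contract: 1 = deadline already passed, 0 = timer armed, <0 = error.
 * New contract: <0 = error, 0 = success, with "already passed" reported
 * through *expired so callers no longer have to decode a tri-state result. */
static int set_deadline(uint64_t now, uint64_t deadline, bool *expired)
{
        uint64_t delta = deadline > now ? deadline - now : 0;

        if (delta >> 32)        /* made-up range limit for the sketch */
                return -1;

        *expired = (delta == 0);
        return 0;
}

int main(void)
{
        bool expired;

        if (set_deadline(1000, 900, &expired) == 0)
                printf("armed, expired=%d\n", expired);  /* expired=1 */
        return 0;
}
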
@@ -7104,9 +7090,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
7104{ 7090{
7105 struct vmcs12 *vmcs12; 7091 struct vmcs12 *vmcs12;
7106 struct vcpu_vmx *vmx = to_vmx(vcpu); 7092 struct vcpu_vmx *vmx = to_vmx(vcpu);
7107 gpa_t gpa; 7093 gpa_t gpa, dst;
7108 struct page *page = NULL;
7109 u64 *pml_address;
7110 7094
7111 if (is_guest_mode(vcpu)) { 7095 if (is_guest_mode(vcpu)) {
7112 WARN_ON_ONCE(vmx->nested.pml_full); 7096 WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7126,15 +7110,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
7126 } 7110 }
7127 7111
7128 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; 7112 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
7113 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
7129 7114
7130 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); 7115 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
7131 if (is_error_page(page)) 7116 offset_in_page(dst), sizeof(gpa)))
7132 return 0; 7117 return 0;
7133 7118
7134 pml_address = kmap(page); 7119 vmcs12->guest_pml_index--;
7135 pml_address[vmcs12->guest_pml_index--] = gpa;
7136 kunmap(page);
7137 kvm_release_page_clean(page);
7138 } 7120 }
7139 7121
7140 return 0; 7122 return 0;
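
The vmx_write_pml_buffer() rework above computes the guest-physical destination slot directly and lets kvm_write_guest_page() do the copy, instead of pinning and kmap()ing the PML page. A standalone sketch of just the address arithmetic, with gpa_to_gfn()/offset_in_page() written out (illustration only):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
        uint64_t pml_address     = 0x12345000;  /* vmcs12->pml_address, page aligned */
        uint64_t guest_pml_index = 511;         /* index of the next free u64 slot   */

        /* Destination of the logged GPA: one u64 slot per PML entry. */
        uint64_t dst = pml_address + sizeof(uint64_t) * guest_pml_index;

        uint64_t gfn    = dst >> PAGE_SHIFT;      /* gpa_to_gfn(dst)     */
        uint64_t offset = dst & (PAGE_SIZE - 1);  /* offset_in_page(dst) */

        printf("dst=%#llx gfn=%#llx offset=%#llx\n",
               (unsigned long long)dst, (unsigned long long)gfn,
               (unsigned long long)offset);
        return 0;
}
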
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f879529906b4..63d37ccce3dc 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -142,8 +142,11 @@ struct nested_vmx {
142 * pointers, so we must keep them pinned while L2 runs. 142 * pointers, so we must keep them pinned while L2 runs.
143 */ 143 */
144 struct page *apic_access_page; 144 struct page *apic_access_page;
145 struct page *virtual_apic_page; 145 struct kvm_host_map virtual_apic_map;
146 struct page *pi_desc_page; 146 struct kvm_host_map pi_desc_map;
147
148 struct kvm_host_map msr_bitmap_map;
149
147 struct pi_desc *pi_desc; 150 struct pi_desc *pi_desc;
148 bool pi_pending; 151 bool pi_pending;
149 u16 posted_intr_nv; 152 u16 posted_intr_nv;
@@ -169,7 +172,7 @@ struct nested_vmx {
169 } smm; 172 } smm;
170 173
171 gpa_t hv_evmcs_vmptr; 174 gpa_t hv_evmcs_vmptr;
172 struct page *hv_evmcs_page; 175 struct kvm_host_map hv_evmcs_map;
173 struct hv_enlightened_vmcs *hv_evmcs; 176 struct hv_enlightened_vmcs *hv_evmcs;
174}; 177};
175 178
@@ -257,6 +260,8 @@ struct vcpu_vmx {
257 260
258 unsigned long host_debugctlmsr; 261 unsigned long host_debugctlmsr;
259 262
263 u64 msr_ia32_power_ctl;
264
260 /* 265 /*
261 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in 266 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
262 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included 267 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b9591abde62a..536b78c4af6e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1100,15 +1100,15 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
1100 1100
1101bool kvm_rdpmc(struct kvm_vcpu *vcpu) 1101bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1102{ 1102{
1103 u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 1103 u32 ecx = kvm_rcx_read(vcpu);
1104 u64 data; 1104 u64 data;
1105 int err; 1105 int err;
1106 1106
1107 err = kvm_pmu_rdpmc(vcpu, ecx, &data); 1107 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1108 if (err) 1108 if (err)
1109 return err; 1109 return err;
1110 kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); 1110 kvm_rax_write(vcpu, (u32)data);
1111 kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); 1111 kvm_rdx_write(vcpu, data >> 32);
1112 return err; 1112 return err;
1113} 1113}
1114EXPORT_SYMBOL_GPL(kvm_rdpmc); 1114EXPORT_SYMBOL_GPL(kvm_rdpmc);
@@ -1174,6 +1174,9 @@ static u32 emulated_msrs[] = {
1174 MSR_PLATFORM_INFO, 1174 MSR_PLATFORM_INFO,
1175 MSR_MISC_FEATURES_ENABLES, 1175 MSR_MISC_FEATURES_ENABLES,
1176 MSR_AMD64_VIRT_SPEC_CTRL, 1176 MSR_AMD64_VIRT_SPEC_CTRL,
1177 MSR_IA32_POWER_CTL,
1178
1179 MSR_K7_HWCR,
1177}; 1180};
1178 1181
1179static unsigned num_emulated_msrs; 1182static unsigned num_emulated_msrs;
@@ -1262,31 +1265,49 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1262 return 0; 1265 return 0;
1263} 1266}
1264 1267
1265bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 1268static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1266{ 1269{
1267 if (efer & efer_reserved_bits)
1268 return false;
1269
1270 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) 1270 if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1271 return false; 1271 return false;
1272 1272
1273 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 1273 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1274 return false; 1274 return false;
1275
1276 if (efer & (EFER_LME | EFER_LMA) &&
1277 !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1278 return false;
1279
1280 if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1281 return false;
1275 1282
1276 return true; 1283 return true;
1284
1285}
1286bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1287{
1288 if (efer & efer_reserved_bits)
1289 return false;
1290
1291 return __kvm_valid_efer(vcpu, efer);
1277} 1292}
1278EXPORT_SYMBOL_GPL(kvm_valid_efer); 1293EXPORT_SYMBOL_GPL(kvm_valid_efer);
1279 1294
1280static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 1295static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1281{ 1296{
1282 u64 old_efer = vcpu->arch.efer; 1297 u64 old_efer = vcpu->arch.efer;
1298 u64 efer = msr_info->data;
1283 1299
1284 if (!kvm_valid_efer(vcpu, efer)) 1300 if (efer & efer_reserved_bits)
1285 return 1; 1301 return false;
1286 1302
1287 if (is_paging(vcpu) 1303 if (!msr_info->host_initiated) {
1288 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) 1304 if (!__kvm_valid_efer(vcpu, efer))
1289 return 1; 1305 return 1;
1306
1307 if (is_paging(vcpu) &&
1308 (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1309 return 1;
1310 }
1290 1311
1291 efer &= ~EFER_LMA; 1312 efer &= ~EFER_LMA;
1292 efer |= vcpu->arch.efer & EFER_LMA; 1313 efer |= vcpu->arch.efer & EFER_LMA;
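
The EFER refactor above splits validation in two: reserved-bit checks in kvm_valid_efer() always apply, while the guest-CPUID-dependent checks in __kvm_valid_efer() are skipped for host-initiated writes such as a VMM restoring saved state. A hedged standalone sketch of that layering (stubbed CPUID flags and a simplified reserved-bit mask, illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_LME  (1ULL << 8)
#define EFER_NX   (1ULL << 11)
#define EFER_SVME (1ULL << 12)

/* Simplified mask for the sketch: only these three bits are "known". */
static const uint64_t efer_reserved_bits = ~(EFER_LME | EFER_NX | EFER_SVME);

/* Stand-ins for guest_cpuid_has(); a real VMM consults guest CPUID. */
static bool guest_has_svm = false;
static bool guest_has_nx  = true;

/* CPUID-dependent checks: only enforced for guest-initiated writes. */
static bool cpuid_allows_efer(uint64_t efer)
{
        if ((efer & EFER_SVME) && !guest_has_svm)
                return false;
        if ((efer & EFER_NX) && !guest_has_nx)
                return false;
        return true;
}

static int set_efer(uint64_t efer, bool host_initiated)
{
        if (efer & efer_reserved_bits)          /* always rejected */
                return 1;
        if (!host_initiated && !cpuid_allows_efer(efer))
                return 1;                       /* guest write, CPUID says no */
        return 0;                               /* accepted */
}

int main(void)
{
        printf("guest sets SVME:    %d\n", set_efer(EFER_SVME, false)); /* 1 = rejected */
        printf("host restores SVME: %d\n", set_efer(EFER_SVME, true));  /* 0 = accepted */
        return 0;
}
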
@@ -2279,6 +2300,18 @@ static void kvmclock_sync_fn(struct work_struct *work)
2279 KVMCLOCK_SYNC_PERIOD); 2300 KVMCLOCK_SYNC_PERIOD);
2280} 2301}
2281 2302
2303/*
2304 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2305 */
2306static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2307{
2308 /* McStatusWrEn enabled? */
2309 if (guest_cpuid_is_amd(vcpu))
2310 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2311
2312 return false;
2313}
2314
2282static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2315static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2283{ 2316{
2284 u64 mcg_cap = vcpu->arch.mcg_cap; 2317 u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2310,9 +2343,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2310 if ((offset & 0x3) == 0 && 2343 if ((offset & 0x3) == 0 &&
2311 data != 0 && (data | (1 << 10)) != ~(u64)0) 2344 data != 0 && (data | (1 << 10)) != ~(u64)0)
2312 return -1; 2345 return -1;
2346
2347 /* MCi_STATUS */
2313 if (!msr_info->host_initiated && 2348 if (!msr_info->host_initiated &&
2314 (offset & 0x3) == 1 && data != 0) 2349 (offset & 0x3) == 1 && data != 0) {
2315 return -1; 2350 if (!can_set_mci_status(vcpu))
2351 return -1;
2352 }
2353
2316 vcpu->arch.mce_banks[offset] = data; 2354 vcpu->arch.mce_banks[offset] = data;
2317 break; 2355 break;
2318 } 2356 }
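
In set_msr_mce() the per-bank MSRs come in groups of four, so (offset & 0x3) selects MCi_CTL, MCi_STATUS, MCi_ADDR or MCi_MISC, and MCi_STATUS writes are now permitted when can_set_mci_status() approves them. A standalone sketch of the offset decode and the MCi_CTL validity test from the hunk above (illustration only):

#include <stdint.h>
#include <stdio.h>

/* Each MCE bank exposes four MSRs: CTL, STATUS, ADDR, MISC. */
static const char *bank_reg_name(unsigned offset)
{
        switch (offset & 0x3) {
        case 0:  return "MCi_CTL";
        case 1:  return "MCi_STATUS";
        case 2:  return "MCi_ADDR";
        default: return "MCi_MISC";
        }
}

/* MCi_CTL: only all-zeroes or all-ones (modulo bit 10) are accepted. */
static int mci_ctl_valid(uint64_t data)
{
        return data == 0 || (data | (1ULL << 10)) == ~(uint64_t)0;
}

int main(void)
{
        for (unsigned offset = 0; offset < 8; offset++)
                printf("offset %u -> %s\n", offset, bank_reg_name(offset));

        printf("CTL 0x0: %d, CTL ~0: %d, CTL 0x1: %d\n",
               mci_ctl_valid(0), mci_ctl_valid(~(uint64_t)0), mci_ctl_valid(1));
        return 0;
}
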
@@ -2456,13 +2494,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2456 vcpu->arch.arch_capabilities = data; 2494 vcpu->arch.arch_capabilities = data;
2457 break; 2495 break;
2458 case MSR_EFER: 2496 case MSR_EFER:
2459 return set_efer(vcpu, data); 2497 return set_efer(vcpu, msr_info);
2460 case MSR_K7_HWCR: 2498 case MSR_K7_HWCR:
2461 data &= ~(u64)0x40; /* ignore flush filter disable */ 2499 data &= ~(u64)0x40; /* ignore flush filter disable */
2462 data &= ~(u64)0x100; /* ignore ignne emulation enable */ 2500 data &= ~(u64)0x100; /* ignore ignne emulation enable */
2463 data &= ~(u64)0x8; /* ignore TLB cache disable */ 2501 data &= ~(u64)0x8; /* ignore TLB cache disable */
2464 data &= ~(u64)0x40000; /* ignore Mc status write enable */ 2502
2465 if (data != 0) { 2503 /* Handle McStatusWrEn */
2504 if (data == BIT_ULL(18)) {
2505 vcpu->arch.msr_hwcr = data;
2506 } else if (data != 0) {
2466 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 2507 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2467 data); 2508 data);
2468 return 1; 2509 return 1;
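
The MSR_K7_HWCR handler above now latches bit 18 (McStatusWrEn) into vcpu->arch.msr_hwcr instead of masking it away, while the historically ignored bits are still stripped before the "unimplemented" check. A standalone sketch mirroring that filtering (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t data = (1ULL << 18) | 0x40 | 0x100;  /* McStatusWrEn + ignored bits */

        data &= ~(uint64_t)0x40;     /* ignore flush filter disable   */
        data &= ~(uint64_t)0x100;    /* ignore IGNNE emulation enable */
        data &= ~(uint64_t)0x8;      /* ignore TLB cache disable      */

        if (data == (1ULL << 18))
                printf("HWCR accepted, McStatusWrEn latched\n");
        else if (data != 0)
                printf("unimplemented HWCR wrmsr: %#llx\n", (unsigned long long)data);
        else
                printf("HWCR write ignored\n");
        return 0;
}
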
@@ -2736,7 +2777,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2736 case MSR_K8_SYSCFG: 2777 case MSR_K8_SYSCFG:
2737 case MSR_K8_TSEG_ADDR: 2778 case MSR_K8_TSEG_ADDR:
2738 case MSR_K8_TSEG_MASK: 2779 case MSR_K8_TSEG_MASK:
2739 case MSR_K7_HWCR:
2740 case MSR_VM_HSAVE_PA: 2780 case MSR_VM_HSAVE_PA:
2741 case MSR_K8_INT_PENDING_MSG: 2781 case MSR_K8_INT_PENDING_MSG:
2742 case MSR_AMD64_NB_CFG: 2782 case MSR_AMD64_NB_CFG:
@@ -2900,6 +2940,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2900 case MSR_MISC_FEATURES_ENABLES: 2940 case MSR_MISC_FEATURES_ENABLES:
2901 msr_info->data = vcpu->arch.msr_misc_features_enables; 2941 msr_info->data = vcpu->arch.msr_misc_features_enables;
2902 break; 2942 break;
2943 case MSR_K7_HWCR:
2944 msr_info->data = vcpu->arch.msr_hwcr;
2945 break;
2903 default: 2946 default:
2904 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 2947 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2905 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); 2948 return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3079,9 +3122,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3079 case KVM_CAP_MAX_VCPUS: 3122 case KVM_CAP_MAX_VCPUS:
3080 r = KVM_MAX_VCPUS; 3123 r = KVM_MAX_VCPUS;
3081 break; 3124 break;
3082 case KVM_CAP_NR_MEMSLOTS:
3083 r = KVM_USER_MEM_SLOTS;
3084 break;
3085 case KVM_CAP_PV_MMU: /* obsolete */ 3125 case KVM_CAP_PV_MMU: /* obsolete */
3086 r = 0; 3126 r = 0;
3087 break; 3127 break;
@@ -5521,9 +5561,9 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5521 unsigned int bytes, 5561 unsigned int bytes,
5522 struct x86_exception *exception) 5562 struct x86_exception *exception)
5523{ 5563{
5564 struct kvm_host_map map;
5524 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5565 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5525 gpa_t gpa; 5566 gpa_t gpa;
5526 struct page *page;
5527 char *kaddr; 5567 char *kaddr;
5528 bool exchanged; 5568 bool exchanged;
5529 5569
@@ -5540,12 +5580,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5540 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 5580 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
5541 goto emul_write; 5581 goto emul_write;
5542 5582
5543 page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); 5583 if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
5544 if (is_error_page(page))
5545 goto emul_write; 5584 goto emul_write;
5546 5585
5547 kaddr = kmap_atomic(page); 5586 kaddr = map.hva + offset_in_page(gpa);
5548 kaddr += offset_in_page(gpa); 5587
5549 switch (bytes) { 5588 switch (bytes) {
5550 case 1: 5589 case 1:
5551 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); 5590 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5562,13 +5601,12 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
5562 default: 5601 default:
5563 BUG(); 5602 BUG();
5564 } 5603 }
5565 kunmap_atomic(kaddr); 5604
5566 kvm_release_page_dirty(page); 5605 kvm_vcpu_unmap(vcpu, &map, true);
5567 5606
5568 if (!exchanged) 5607 if (!exchanged)
5569 return X86EMUL_CMPXCHG_FAILED; 5608 return X86EMUL_CMPXCHG_FAILED;
5570 5609
5571 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
5572 kvm_page_track_write(vcpu, gpa, new, bytes); 5610 kvm_page_track_write(vcpu, gpa, new, bytes);
5573 5611
5574 return X86EMUL_CONTINUE; 5612 return X86EMUL_CONTINUE;
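
emulator_cmpxchg_emulated() now maps the guest page with kvm_vcpu_map()/kvm_vcpu_unmap() and performs the compare-exchange at map.hva + offset_in_page(gpa). A user-space sketch of the size-dispatched compare-exchange itself, using compiler builtins in place of the kernel's CMPXCHG_TYPE macro (illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Dispatch on operand size, like the 1/2/4/8-byte cases in the hunk above. */
static bool cmpxchg_sized(void *kaddr, const void *old, const void *new, unsigned bytes)
{
        switch (bytes) {
        case 1: return __sync_bool_compare_and_swap((uint8_t *)kaddr,
                                                    *(const uint8_t *)old,
                                                    *(const uint8_t *)new);
        case 2: return __sync_bool_compare_and_swap((uint16_t *)kaddr,
                                                    *(const uint16_t *)old,
                                                    *(const uint16_t *)new);
        case 4: return __sync_bool_compare_and_swap((uint32_t *)kaddr,
                                                    *(const uint32_t *)old,
                                                    *(const uint32_t *)new);
        case 8: return __sync_bool_compare_and_swap((uint64_t *)kaddr,
                                                    *(const uint64_t *)old,
                                                    *(const uint64_t *)new);
        default: return false;
        }
}

int main(void)
{
        uint64_t page_word = 0x1111;    /* stands in for the mapped guest word */
        uint64_t old = 0x1111, new = 0x2222;

        bool exchanged = cmpxchg_sized(&page_word, &old, &new, sizeof(page_word));
        printf("exchanged=%d value=%#llx\n", exchanged, (unsigned long long)page_word);
        return 0;
}
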
@@ -6558,7 +6596,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
6558static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, 6596static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
6559 unsigned short port) 6597 unsigned short port)
6560{ 6598{
6561 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 6599 unsigned long val = kvm_rax_read(vcpu);
6562 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, 6600 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
6563 size, port, &val, 1); 6601 size, port, &val, 1);
6564 if (ret) 6602 if (ret)
@@ -6593,8 +6631,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
6593 } 6631 }
6594 6632
6595 /* For size less than 4 we merge, else we zero extend */ 6633 /* For size less than 4 we merge, else we zero extend */
6596 val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) 6634 val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
6597 : 0;
6598 6635
6599 /* 6636 /*
6600 * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform 6637 * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
@@ -6602,7 +6639,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
6602 */ 6639 */
6603 emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, 6640 emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
6604 vcpu->arch.pio.port, &val, 1); 6641 vcpu->arch.pio.port, &val, 1);
6605 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 6642 kvm_rax_write(vcpu, val);
6606 6643
6607 return kvm_skip_emulated_instruction(vcpu); 6644 return kvm_skip_emulated_instruction(vcpu);
6608} 6645}
@@ -6614,12 +6651,12 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
6614 int ret; 6651 int ret;
6615 6652
6616 /* For size less than 4 we merge, else we zero extend */ 6653 /* For size less than 4 we merge, else we zero extend */
6617 val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; 6654 val = (size < 4) ? kvm_rax_read(vcpu) : 0;
6618 6655
6619 ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, 6656 ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
6620 &val, 1); 6657 &val, 1);
6621 if (ret) { 6658 if (ret) {
6622 kvm_register_write(vcpu, VCPU_REGS_RAX, val); 6659 kvm_rax_write(vcpu, val);
6623 return ret; 6660 return ret;
6624 } 6661 }
6625 6662
@@ -6854,10 +6891,20 @@ static unsigned long kvm_get_guest_ip(void)
6854 return ip; 6891 return ip;
6855} 6892}
6856 6893
6894static void kvm_handle_intel_pt_intr(void)
6895{
6896 struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
6897
6898 kvm_make_request(KVM_REQ_PMI, vcpu);
6899 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
6900 (unsigned long *)&vcpu->arch.pmu.global_status);
6901}
6902
6857static struct perf_guest_info_callbacks kvm_guest_cbs = { 6903static struct perf_guest_info_callbacks kvm_guest_cbs = {
6858 .is_in_guest = kvm_is_in_guest, 6904 .is_in_guest = kvm_is_in_guest,
6859 .is_user_mode = kvm_is_user_mode, 6905 .is_user_mode = kvm_is_user_mode,
6860 .get_guest_ip = kvm_get_guest_ip, 6906 .get_guest_ip = kvm_get_guest_ip,
6907 .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
6861}; 6908};
6862 6909
6863static void kvm_set_mmio_spte_mask(void) 6910static void kvm_set_mmio_spte_mask(void)
@@ -7133,11 +7180,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
7133 if (kvm_hv_hypercall_enabled(vcpu->kvm)) 7180 if (kvm_hv_hypercall_enabled(vcpu->kvm))
7134 return kvm_hv_hypercall(vcpu); 7181 return kvm_hv_hypercall(vcpu);
7135 7182
7136 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 7183 nr = kvm_rax_read(vcpu);
7137 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 7184 a0 = kvm_rbx_read(vcpu);
7138 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 7185 a1 = kvm_rcx_read(vcpu);
7139 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 7186 a2 = kvm_rdx_read(vcpu);
7140 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 7187 a3 = kvm_rsi_read(vcpu);
7141 7188
7142 trace_kvm_hypercall(nr, a0, a1, a2, a3); 7189 trace_kvm_hypercall(nr, a0, a1, a2, a3);
7143 7190
@@ -7178,7 +7225,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
7178out: 7225out:
7179 if (!op_64_bit) 7226 if (!op_64_bit)
7180 ret = (u32)ret; 7227 ret = (u32)ret;
7181 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 7228 kvm_rax_write(vcpu, ret);
7182 7229
7183 ++vcpu->stat.hypercalls; 7230 ++vcpu->stat.hypercalls;
7184 return kvm_skip_emulated_instruction(vcpu); 7231 return kvm_skip_emulated_instruction(vcpu);
@@ -8280,23 +8327,23 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
8280 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); 8327 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
8281 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 8328 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
8282 } 8329 }
8283 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 8330 regs->rax = kvm_rax_read(vcpu);
8284 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 8331 regs->rbx = kvm_rbx_read(vcpu);
8285 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 8332 regs->rcx = kvm_rcx_read(vcpu);
8286 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); 8333 regs->rdx = kvm_rdx_read(vcpu);
8287 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); 8334 regs->rsi = kvm_rsi_read(vcpu);
8288 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); 8335 regs->rdi = kvm_rdi_read(vcpu);
8289 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 8336 regs->rsp = kvm_rsp_read(vcpu);
8290 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); 8337 regs->rbp = kvm_rbp_read(vcpu);
8291#ifdef CONFIG_X86_64 8338#ifdef CONFIG_X86_64
8292 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); 8339 regs->r8 = kvm_r8_read(vcpu);
8293 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); 8340 regs->r9 = kvm_r9_read(vcpu);
8294 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); 8341 regs->r10 = kvm_r10_read(vcpu);
8295 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); 8342 regs->r11 = kvm_r11_read(vcpu);
8296 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); 8343 regs->r12 = kvm_r12_read(vcpu);
8297 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); 8344 regs->r13 = kvm_r13_read(vcpu);
8298 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); 8345 regs->r14 = kvm_r14_read(vcpu);
8299 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); 8346 regs->r15 = kvm_r15_read(vcpu);
8300#endif 8347#endif
8301 8348
8302 regs->rip = kvm_rip_read(vcpu); 8349 regs->rip = kvm_rip_read(vcpu);
@@ -8316,23 +8363,23 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
8316 vcpu->arch.emulate_regs_need_sync_from_vcpu = true; 8363 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
8317 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 8364 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
8318 8365
8319 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 8366 kvm_rax_write(vcpu, regs->rax);
8320 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 8367 kvm_rbx_write(vcpu, regs->rbx);
8321 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 8368 kvm_rcx_write(vcpu, regs->rcx);
8322 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); 8369 kvm_rdx_write(vcpu, regs->rdx);
8323 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); 8370 kvm_rsi_write(vcpu, regs->rsi);
8324 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); 8371 kvm_rdi_write(vcpu, regs->rdi);
8325 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); 8372 kvm_rsp_write(vcpu, regs->rsp);
8326 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); 8373 kvm_rbp_write(vcpu, regs->rbp);
8327#ifdef CONFIG_X86_64 8374#ifdef CONFIG_X86_64
8328 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); 8375 kvm_r8_write(vcpu, regs->r8);
8329 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); 8376 kvm_r9_write(vcpu, regs->r9);
8330 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); 8377 kvm_r10_write(vcpu, regs->r10);
8331 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); 8378 kvm_r11_write(vcpu, regs->r11);
8332 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); 8379 kvm_r12_write(vcpu, regs->r12);
8333 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 8380 kvm_r13_write(vcpu, regs->r13);
8334 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 8381 kvm_r14_write(vcpu, regs->r14);
8335 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 8382 kvm_r15_write(vcpu, regs->r15);
8336#endif 8383#endif
8337 8384
8338 kvm_rip_write(vcpu, regs->rip); 8385 kvm_rip_write(vcpu, regs->rip);
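
__get_regs()/__set_regs() now use dedicated per-register helpers (kvm_rax_read(), kvm_rbx_write(), ...) instead of the generic kvm_register_read()/kvm_register_write(). Those helpers are presumably stamped out by a token-pasting macro in kvm_cache_regs.h; the sketch below shows how such accessors can be generated over an array-based register file (names and macro are illustrative, not copied from the patch):

#include <stdint.h>
#include <stdio.h>

enum { REGS_RAX, REGS_RBX, REGS_RCX, NR_REGS };

struct vcpu { uint64_t regs[NR_REGS]; };

/* Illustrative token-pasting macro: builds kvm_<reg>_read/write wrappers. */
#define BUILD_GPR_ACCESSORS(lname, uname)                               \
static inline uint64_t kvm_##lname##_read(struct vcpu *v)               \
{                                                                       \
        return v->regs[REGS_##uname];                                   \
}                                                                       \
static inline void kvm_##lname##_write(struct vcpu *v, uint64_t val)    \
{                                                                       \
        v->regs[REGS_##uname] = val;                                    \
}

BUILD_GPR_ACCESSORS(rax, RAX)
BUILD_GPR_ACCESSORS(rbx, RBX)
BUILD_GPR_ACCESSORS(rcx, RCX)

int main(void)
{
        struct vcpu v = { { 0 } };

        kvm_rcx_write(&v, 0x42);
        printf("rcx=%#llx\n", (unsigned long long)kvm_rcx_read(&v));
        return 0;
}
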
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 534d3f28bb01..a470ff0868c5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -345,6 +345,16 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
345 __this_cpu_write(current_vcpu, NULL); 345 __this_cpu_write(current_vcpu, NULL);
346} 346}
347 347
348
349static inline bool kvm_pat_valid(u64 data)
350{
351 if (data & 0xF8F8F8F8F8F8F8F8ull)
352 return false;
353 /* 0, 1, 4, 5, 6, 7 are valid values. */
354 return (data | ((data & 0x0202020202020202ull) << 1)) == data;
355}
356
348void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); 357void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
349void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); 358void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
359
350#endif 360#endif
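
The new kvm_pat_valid() helper rejects any PAT entry byte outside {0, 1, 4, 5, 6, 7} without looping: the first mask catches bits 3-7 of every byte, and the OR trick flags the remaining invalid encodings 2 and 3 (bit 1 set while bit 2 is clear). A standalone check of the same expression over every single-byte value (illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool kvm_pat_valid(uint64_t data)
{
        if (data & 0xF8F8F8F8F8F8F8F8ULL)
                return false;
        /* After the mask each byte is 0..7; only 2 and 3 remain invalid. */
        return (data | ((data & 0x0202020202020202ULL) << 1)) == data;
}

int main(void)
{
        /* Exercise every possible value of a single PAT entry byte. */
        for (unsigned v = 0; v < 256; v++)
                if (kvm_pat_valid(v))
                        printf("%u is a valid PAT entry\n", v);  /* prints 0 1 4 5 6 7 */
        return 0;
}
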