-rw-r--r--  Documentation/virtual/kvm/api.txt | 13
-rw-r--r--  Documentation/virtual/kvm/devices/arm-vgic-its.txt | 20
-rw-r--r--  Documentation/virtual/kvm/devices/s390_flic.txt | 5
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 2
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 38
-rw-r--r--  arch/arm/include/asm/kvm_hyp.h | 4
-rw-r--r--  arch/arm/include/uapi/asm/kvm.h | 7
-rw-r--r--  arch/arm/kvm/emulate.c | 137
-rw-r--r--  arch/arm/kvm/hyp/switch.c | 7
-rw-r--r--  arch/arm64/include/asm/arch_timer.h | 8
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 2
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 5
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 4
-rw-r--r--  arch/arm64/include/asm/timex.h | 2
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h | 7
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 6
-rw-r--r--  arch/arm64/kvm/inject_fault.c | 88
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 41
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 3
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 140
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 13
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 6
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 3
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 128
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_radix.c | 51
-rw-r--r--  arch/powerpc/kvm/book3s_64_slb.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 347
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 117
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 65
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 197
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 16
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c | 2
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c | 2
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 3
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 25
-rw-r--r--  arch/s390/kvm/interrupt.c | 26
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 1
-rw-r--r--  arch/s390/kvm/vsie.c | 50
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 8
-rw-r--r--  arch/x86/include/asm/vmx.h | 4
-rw-r--r--  arch/x86/kvm/emulate.c | 9
-rw-r--r--  arch/x86/kvm/lapic.c | 91
-rw-r--r--  arch/x86/kvm/mmu.c | 115
-rw-r--r--  arch/x86/kvm/mmu.h | 3
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 18
-rw-r--r--  arch/x86/kvm/svm.c | 241
-rw-r--r--  arch/x86/kvm/vmx.c | 208
-rw-r--r--  arch/x86/kvm/x86.c | 94
-rw-r--r--  drivers/clocksource/arm_arch_timer.c | 35
-rw-r--r--  drivers/irqchip/irq-gic-v3.c | 8
-rw-r--r--  drivers/irqchip/irq-gic.c | 6
-rw-r--r--  include/kvm/arm_arch_timer.h | 26
-rw-r--r--  include/linux/kvm_host.h | 1
-rw-r--r--  include/uapi/linux/kvm.h | 1
-rwxr-xr-x  tools/kvm/kvm_stat/kvm_stat | 30
-rw-r--r--  virt/kvm/arm/aarch32.c | 97
-rw-r--r--  virt/kvm/arm/arch_timer.c | 452
-rw-r--r--  virt/kvm/arm/arm.c | 45
-rw-r--r--  virt/kvm/arm/hyp/timer-sr.c | 74
-rw-r--r--  virt/kvm/arm/vgic/vgic-its.c | 199
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v2.c | 22
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v3.c | 17
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.c | 44
-rw-r--r--  virt/kvm/arm/vgic/vgic-v2.c | 5
-rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c | 12
-rw-r--r--  virt/kvm/arm/vgic/vgic.c | 62
-rw-r--r--  virt/kvm/arm/vgic/vgic.h | 3
-rw-r--r--  virt/kvm/kvm_main.c | 6
70 files changed, 2264 insertions, 1270 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e63a35fafef0..f670e4b9e7f3 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1124,10 +1124,14 @@ guest physical address space and must not conflict with any memory slot
 or any mmio address. The guest may malfunction if it accesses this memory
 region.
 
+Setting the address to 0 will result in resetting the address to its default
+(0xfffbc000).
+
 This ioctl is required on Intel-based hosts. This is needed on Intel hardware
 because of a quirk in the virtualization implementation (see the internals
 documentation when it pops into existence).
 
+Fails if any VCPU has already been created.
 
 4.41 KVM_SET_BOOT_CPU_ID
 
@@ -4347,3 +4351,12 @@ This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its
 value is used to denote the target vcpu for a SynIC interrupt. For
 compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this
 capability is absent, userspace can still query this msr's value.
+
+8.13 KVM_CAP_S390_AIS_MIGRATION
+
+Architectures: s390
+Parameters: none
+
+This capability indicates if the flic device will be able to get/set the
+AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
+to discover this without having to create a flic device.
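
The first hunk above appears to belong to the KVM_SET_IDENTITY_MAP_ADDR section of api.txt (the 0xfffbc000 default and the 4.41 heading that follows suggest it), and the new text spells out two constraints: writing 0 restores the default address, and the ioctl must be issued before any VCPU exists. Below is a minimal userspace sketch of that ordering; the surrounding VM setup and error handling are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: restore the default identity map address, then create the VCPU. */
static int create_vcpu_with_default_ident_map(int vm_fd)
{
	uint64_t ident_addr = 0;	/* 0 now means "reset to 0xfffbc000" */

	/* Must happen before KVM_CREATE_VCPU, per the documentation above. */
	if (ioctl(vm_fd, KVM_SET_IDENTITY_MAP_ADDR, &ident_addr) < 0)
		return -1;

	return ioctl(vm_fd, KVM_CREATE_VCPU, 0);
}
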
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
index eb06beb75960..8d5830eab26a 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic-its.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic-its.txt
@@ -33,6 +33,10 @@ Groups:
       request the initialization of the ITS, no additional parameter in
       kvm_device_attr.addr.
 
+    KVM_DEV_ARM_ITS_CTRL_RESET
+      reset the ITS, no additional parameter in kvm_device_attr.addr.
+      See "ITS Reset State" section.
+
     KVM_DEV_ARM_ITS_SAVE_TABLES
       save the ITS table data into guest RAM, at the location provisioned
       by the guest in corresponding registers/table entries.
@@ -157,3 +161,19 @@ Then vcpus can be started.
 - pINTID is the physical LPI ID; if zero, it means the entry is not valid
   and other fields are not meaningful.
 - ICID is the collection ID
+
+ ITS Reset State:
+ ----------------
+
+RESET returns the ITS to the same state that it was when first created and
+initialized. When the RESET command returns, the following things are
+guaranteed:
+
+- The ITS is not enabled and quiescent
+  GITS_CTLR.Enabled = 0 .Quiescent=1
+- There is no internally cached state
+- No collection or device table are used
+  GITS_BASER<n>.Valid = 0
+- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0
+- The ABI version is unchanged and remains the one set when the ITS
+  device was first created.
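
A userspace sketch of driving the reset attribute documented above; it assumes the ITS device fd was obtained earlier via KVM_CREATE_DEVICE and that the attribute sits in the same KVM_DEV_ARM_VGIC_GRP_CTRL group that already carries KVM_DEV_ARM_VGIC_CTRL_INIT and the table save/restore controls.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Put an already-initialized ITS back into its "ITS Reset State". */
static int its_reset(int its_dev_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_ITS_CTRL_RESET,
		.addr  = 0,	/* no additional parameter, per the text above */
	};

	return ioctl(its_dev_fd, KVM_SET_DEVICE_ATTR, &attr);
}
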
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index 2f1cbf1301d2..a4e20a090174 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -151,8 +151,13 @@ struct kvm_s390_ais_all {
 to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and
 nimm bit presents AIS mode for a ISC.
 
+KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION.
+
 Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
 FLIC with an unknown group or attribute gives the error code EINVAL (instead of
 ENXIO, as specified in the API documentation). It is not possible to conclude
 that a FLIC operation is unavailable based on the error code resulting from a
 usage attempt.
+
+Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero
+schid is specified.
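
As the new sentence above states, availability of KVM_DEV_FLIC_AISM_ALL can be established from the capability alone, without first creating a flic device. A small sketch; the surrounding migration logic is an assumption for illustration, only the constant names come from the documentation.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns nonzero when AIS state can be migrated via KVM_DEV_FLIC_AISM_ALL. */
static int ais_migration_supported(int vm_fd)
{
	return ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_AIS_MIGRATION) > 0;
}
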
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 14d68a4d826f..36dd2962a42d 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -68,6 +68,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
 
+extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern void __init_stage2_translation(void);
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 98089ffd91bb..3d22eb87f919 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -25,7 +25,22 @@
 #include <asm/kvm_arm.h>
 #include <asm/cputype.h>
 
+/* arm64 compatibility macros */
+#define COMPAT_PSR_MODE_ABT	ABT_MODE
+#define COMPAT_PSR_MODE_UND	UND_MODE
+#define COMPAT_PSR_T_BIT	PSR_T_BIT
+#define COMPAT_PSR_I_BIT	PSR_I_BIT
+#define COMPAT_PSR_A_BIT	PSR_A_BIT
+#define COMPAT_PSR_E_BIT	PSR_E_BIT
+#define COMPAT_PSR_IT_MASK	PSR_IT_MASK
+
 unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
+
+static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num)
+{
+	return vcpu_reg(vcpu, reg_num);
+}
+
 unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
 
 static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
@@ -42,10 +57,25 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
 
 bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
 void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
-void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_undef32(struct kvm_vcpu *vcpu);
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_vabt(struct kvm_vcpu *vcpu);
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+
+static inline void kvm_inject_undefined(struct kvm_vcpu *vcpu)
+{
+	kvm_inject_undef32(vcpu);
+}
+
+static inline void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+	kvm_inject_dabt32(vcpu, addr);
+}
+
+static inline void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+	kvm_inject_pabt32(vcpu, addr);
+}
 
 static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
 {
@@ -203,7 +233,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
 {
-	switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+	switch (kvm_vcpu_trap_get_fault(vcpu)) {
 	case FSC_SEA:
 	case FSC_SEA_TTW0:
 	case FSC_SEA_TTW1:
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 14b5903f0224..ab20ffa8b9e7 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -98,8 +98,8 @@
 #define cntvoff_el2	CNTVOFF
 #define cnthctl_el2	CNTHCTL
 
-void __timer_save_state(struct kvm_vcpu *vcpu);
-void __timer_restore_state(struct kvm_vcpu *vcpu);
+void __timer_enable_traps(struct kvm_vcpu *vcpu);
+void __timer_disable_traps(struct kvm_vcpu *vcpu);
 
 void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 1f57bbe82b6f..6edd177bb1c7 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -152,6 +152,12 @@ struct kvm_arch_memory_slot {
 	(__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
 #define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)
 
+/* PL1 Physical Timer Registers */
+#define KVM_REG_ARM_PTIMER_CTL	ARM_CP15_REG32(0, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CNT	ARM_CP15_REG64(0, 14)
+#define KVM_REG_ARM_PTIMER_CVAL	ARM_CP15_REG64(2, 14)
+
+/* Virtual Timer Registers */
 #define KVM_REG_ARM_TIMER_CTL	ARM_CP15_REG32(0, 14, 3, 1)
 #define KVM_REG_ARM_TIMER_CNT	ARM_CP15_REG64(1, 14)
 #define KVM_REG_ARM_TIMER_CVAL	ARM_CP15_REG64(3, 14)
@@ -216,6 +222,7 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_ITS_SAVE_TABLES	1
 #define KVM_DEV_ARM_ITS_RESTORE_TABLES	2
 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define KVM_DEV_ARM_ITS_CTRL_RESET	4
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT	24
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index 30a13647c54c..cdff963f133a 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -165,143 +165,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
  * Inject exceptions into the guest
  */
 
-static u32 exc_vector_base(struct kvm_vcpu *vcpu)
-{
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-	u32 vbar = vcpu_cp15(vcpu, c12_VBAR);
-
-	if (sctlr & SCTLR_V)
-		return 0xffff0000;
-	else /* always have security exceptions */
-		return vbar;
-}
-
-/*
- * Switch to an exception mode, updating both CPSR and SPSR. Follow
- * the logic described in AArch32.EnterMode() from the ARMv8 ARM.
- */
-static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode)
-{
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
-	*vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode;
-
-	switch (mode) {
-	case FIQ_MODE:
-		*vcpu_cpsr(vcpu) |= PSR_F_BIT;
-		/* Fall through */
-	case ABT_MODE:
-	case IRQ_MODE:
-		*vcpu_cpsr(vcpu) |= PSR_A_BIT;
-		/* Fall through */
-	default:
-		*vcpu_cpsr(vcpu) |= PSR_I_BIT;
-	}
-
-	*vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
-	if (sctlr & SCTLR_TE)
-		*vcpu_cpsr(vcpu) |= PSR_T_BIT;
-	if (sctlr & SCTLR_EE)
-		*vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
-	/* Note: These now point to the mode banked copies */
-	*vcpu_spsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_inject_undefined - inject an undefined exception into the guest
- * @vcpu: The VCPU to receive the undefined exception
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- *
- * Modelled after TakeUndefInstrException() pseudocode.
- */
-void kvm_inject_undefined(struct kvm_vcpu *vcpu)
-{
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	bool is_thumb = (cpsr & PSR_T_BIT);
-	u32 vect_offset = 4;
-	u32 return_offset = (is_thumb) ? 2 : 4;
-
-	kvm_update_psr(vcpu, UND_MODE);
-	*vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-	/* Branch to exception vector */
-	*vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
-{
-	u32 vect_offset;
-	u32 return_offset = (is_pabt) ? 4 : 8;
-	bool is_lpae;
-
-	kvm_update_psr(vcpu, ABT_MODE);
-	*vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-	if (is_pabt)
-		vect_offset = 12;
-	else
-		vect_offset = 16;
-
-	/* Branch to exception vector */
-	*vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
-
-	if (is_pabt) {
-		/* Set IFAR and IFSR */
-		vcpu_cp15(vcpu, c6_IFAR) = addr;
-		is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-		/* Always give debug fault for now - should give guest a clue */
-		if (is_lpae)
-			vcpu_cp15(vcpu, c5_IFSR) = 1 << 9 | 0x22;
-		else
-			vcpu_cp15(vcpu, c5_IFSR) = 2;
-	} else { /* !iabt */
-		/* Set DFAR and DFSR */
-		vcpu_cp15(vcpu, c6_DFAR) = addr;
-		is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-		/* Always give debug fault for now - should give guest a clue */
-		if (is_lpae)
-			vcpu_cp15(vcpu, c5_DFSR) = 1 << 9 | 0x22;
-		else
-			vcpu_cp15(vcpu, c5_DFSR) = 2;
-	}
-
-}
-
-/**
- * kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt(vcpu, false, addr);
-}
-
-/**
- * kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt(vcpu, true, addr);
-}
-
 /**
  * kvm_inject_vabt - inject an async abort / SError into the guest
  * @vcpu: The VCPU to receive the exception
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index ebd2dd46adf7..330c9ce34ba5 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -174,7 +174,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	__activate_vm(vcpu);
 
 	__vgic_restore_state(vcpu);
-	__timer_restore_state(vcpu);
+	__timer_enable_traps(vcpu);
 
 	__sysreg_restore_state(guest_ctxt);
 	__banked_restore_state(guest_ctxt);
@@ -191,7 +191,8 @@ again:
 
 	__banked_save_state(guest_ctxt);
 	__sysreg_save_state(guest_ctxt);
-	__timer_save_state(vcpu);
+	__timer_disable_traps(vcpu);
+
 	__vgic_save_state(vcpu);
 
 	__deactivate_traps(vcpu);
@@ -237,7 +238,7 @@ void __hyp_text __noreturn __hyp_panic(int cause)
 
 	vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR);
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
-	__timer_save_state(vcpu);
+	__timer_disable_traps(vcpu);
 	__deactivate_traps(vcpu);
 	__deactivate_vm(vcpu);
 	__banked_restore_state(host_ctxt);
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index bdedd8f748d1..f2a234d6516c 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -52,6 +52,7 @@ struct arch_timer_erratum_workaround {
 	const char *desc;
 	u32 (*read_cntp_tval_el0)(void);
 	u32 (*read_cntv_tval_el0)(void);
+	u64 (*read_cntpct_el0)(void);
 	u64 (*read_cntvct_el0)(void);
 	int (*set_next_event_phys)(unsigned long, struct clock_event_device *);
 	int (*set_next_event_virt)(unsigned long, struct clock_event_device *);
@@ -149,11 +150,8 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl)
 
 static inline u64 arch_counter_get_cntpct(void)
 {
-	/*
-	 * AArch64 kernel and user space mandate the use of CNTVCT.
-	 */
-	BUG();
-	return 0;
+	isb();
+	return arch_timer_reg_read_stable(cntpct_el0);
 }
 
 static inline u64 arch_counter_get_cntvct(void)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 26a64d0f9ab9..ab4d0a926043 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -55,6 +55,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
 
+extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index e5df3fce0008..5f28dfa14cee 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -41,6 +41,9 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_vabt(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_undef32(struct kvm_vcpu *vcpu);
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
 
 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 {
@@ -237,7 +240,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
 
 static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
 {
-	switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+	switch (kvm_vcpu_trap_get_fault(vcpu)) {
 	case FSC_SEA:
 	case FSC_SEA_TTW0:
 	case FSC_SEA_TTW1:
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 4572a9b560fa..08d3bb66c8b7 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -129,8 +129,8 @@ void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
 void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
-void __timer_save_state(struct kvm_vcpu *vcpu);
-void __timer_restore_state(struct kvm_vcpu *vcpu);
+void __timer_enable_traps(struct kvm_vcpu *vcpu);
+void __timer_disable_traps(struct kvm_vcpu *vcpu);
 
 void __sysreg_save_host_state(struct kvm_cpu_context *ctxt);
 void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt);
diff --git a/arch/arm64/include/asm/timex.h b/arch/arm64/include/asm/timex.h
index 81a076eb37fa..9ad60bae5c8d 100644
--- a/arch/arm64/include/asm/timex.h
+++ b/arch/arm64/include/asm/timex.h
@@ -22,7 +22,7 @@
  * Use the current timer as a cycle counter since this is what we use for
  * the delay loop.
  */
-#define get_cycles()	arch_counter_get_cntvct()
+#define get_cycles()	arch_timer_read_counter()
 
 #include <asm-generic/timex.h>
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 51149ec75fe4..9abbf3044654 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -196,6 +196,12 @@ struct kvm_arch_memory_slot {
 
 #define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)
 
+/* Physical Timer EL0 Registers */
+#define KVM_REG_ARM_PTIMER_CTL	ARM64_SYS_REG(3, 3, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CVAL	ARM64_SYS_REG(3, 3, 14, 2, 2)
+#define KVM_REG_ARM_PTIMER_CNT	ARM64_SYS_REG(3, 3, 14, 0, 1)
+
+/* EL0 Virtual Timer Registers */
 #define KVM_REG_ARM_TIMER_CTL	ARM64_SYS_REG(3, 3, 14, 3, 1)
 #define KVM_REG_ARM_TIMER_CNT	ARM64_SYS_REG(3, 3, 14, 3, 2)
 #define KVM_REG_ARM_TIMER_CVAL	ARM64_SYS_REG(3, 3, 14, 0, 2)
@@ -228,6 +234,7 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_ITS_SAVE_TABLES	1
 #define KVM_DEV_ARM_ITS_RESTORE_TABLES	2
 #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define KVM_DEV_ARM_ITS_CTRL_RESET	4
 
 /* Device Control API on vcpu fd */
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
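
The KVM_REG_ARM_PTIMER_* identifiers added above are plain ONE_REG IDs, so the EL1 physical timer state becomes reachable from userspace through the existing KVM_GET_ONE_REG/KVM_SET_ONE_REG path. A sketch of reading the guest's physical counter; the vcpu fd plumbing and error handling are assumed for illustration.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Read the guest view of CNTPCT through the new ONE_REG identifier. */
static int read_guest_ptimer_cnt(int vcpu_fd, uint64_t *cnt)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM_PTIMER_CNT,
		.addr = (uintptr_t)cnt,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}
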
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 951f3ebaff26..525c01f48867 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -304,7 +304,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	__activate_vm(vcpu);
 
 	__vgic_restore_state(vcpu);
-	__timer_restore_state(vcpu);
+	__timer_enable_traps(vcpu);
 
 	/*
 	 * We must restore the 32-bit state before the sysregs, thanks
@@ -374,7 +374,7 @@ again:
 
 	__sysreg_save_guest_state(guest_ctxt);
 	__sysreg32_save_state(vcpu);
-	__timer_save_state(vcpu);
+	__timer_disable_traps(vcpu);
 	__vgic_save_state(vcpu);
 
 	__deactivate_traps(vcpu);
@@ -442,7 +442,7 @@ void __hyp_text __noreturn __hyp_panic(void)
 
 	vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
-	__timer_save_state(vcpu);
+	__timer_disable_traps(vcpu);
 	__deactivate_traps(vcpu);
 	__deactivate_vm(vcpu);
 	__sysreg_restore_host_state(host_ctxt);
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 3556715a774e..8ecbcb40e317 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -33,88 +33,6 @@
 #define LOWER_EL_AArch64_VECTOR	0x400
 #define LOWER_EL_AArch32_VECTOR	0x600
 
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
-	[0] = { 0, 0 },		/* Reset, unused */
-	[1] = { 4, 2 },		/* Undefined */
-	[2] = { 0, 0 },		/* SVC, unused */
-	[3] = { 4, 4 },		/* Prefetch abort */
-	[4] = { 8, 8 },		/* Data abort */
-	[5] = { 0, 0 },		/* HVC, unused */
-	[6] = { 4, 4 },		/* IRQ, unused */
-	[7] = { 4, 4 },		/* FIQ, unused */
-};
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
-	unsigned long cpsr;
-	unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
-	bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT);
-	u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
-	cpsr = mode | COMPAT_PSR_I_BIT;
-
-	if (sctlr & (1 << 30))
-		cpsr |= COMPAT_PSR_T_BIT;
-	if (sctlr & (1 << 25))
-		cpsr |= COMPAT_PSR_E_BIT;
-
-	*vcpu_cpsr(vcpu) = cpsr;
-
-	/* Note: These now point to the banked copies */
-	*vcpu_spsr(vcpu) = new_spsr_value;
-	*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-	/* Branch to exception vector */
-	if (sctlr & (1 << 13))
-		vect_offset += 0xffff0000;
-	else /* always have security exceptions */
-		vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
-	*vcpu_pc(vcpu) = vect_offset;
-}
-
-static void inject_undef32(struct kvm_vcpu *vcpu)
-{
-	prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
-			 unsigned long addr)
-{
-	u32 vect_offset;
-	u32 *far, *fsr;
-	bool is_lpae;
-
-	if (is_pabt) {
-		vect_offset = 12;
-		far = &vcpu_cp15(vcpu, c6_IFAR);
-		fsr = &vcpu_cp15(vcpu, c5_IFSR);
-	} else { /* !iabt */
-		vect_offset = 16;
-		far = &vcpu_cp15(vcpu, c6_DFAR);
-		fsr = &vcpu_cp15(vcpu, c5_DFSR);
-	}
-
-	prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset);
-
-	*far = addr;
-
-	/* Give the guest an IMPLEMENTATION DEFINED exception */
-	is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-	if (is_lpae)
-		*fsr = 1 << 9 | 0x34;
-	else
-		*fsr = 0x14;
-}
-
 enum exception_type {
 	except_type_sync = 0,
 	except_type_irq = 0x80,
@@ -211,7 +129,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 	if (!(vcpu->arch.hcr_el2 & HCR_RW))
-		inject_abt32(vcpu, false, addr);
+		kvm_inject_dabt32(vcpu, addr);
 	else
 		inject_abt64(vcpu, false, addr);
 }
@@ -227,7 +145,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 	if (!(vcpu->arch.hcr_el2 & HCR_RW))
-		inject_abt32(vcpu, true, addr);
+		kvm_inject_pabt32(vcpu, addr);
 	else
 		inject_abt64(vcpu, true, addr);
 }
@@ -241,7 +159,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
 void kvm_inject_undefined(struct kvm_vcpu *vcpu)
 {
 	if (!(vcpu->arch.hcr_el2 & HCR_RW))
-		inject_undef32(vcpu);
+		kvm_inject_undef32(vcpu);
 	else
 		inject_undef64(vcpu);
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a0ee9b05e3d4..1830ebc227d1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -842,13 +842,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu,
 		struct sys_reg_params *p,
 		const struct sys_reg_desc *r)
 {
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 	u64 now = kvm_phys_timer_read();
+	u64 cval;
 
-	if (p->is_write)
-		ptimer->cnt_cval = p->regval + now;
-	else
-		p->regval = ptimer->cnt_cval - now;
+	if (p->is_write) {
+		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL,
+				      p->regval + now);
+	} else {
+		cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+		p->regval = cval - now;
+	}
 
 	return true;
 }
@@ -857,24 +860,10 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
 		struct sys_reg_params *p,
 		const struct sys_reg_desc *r)
 {
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	if (p->is_write) {
-		/* ISTATUS bit is read-only */
-		ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
-	} else {
-		u64 now = kvm_phys_timer_read();
-
-		p->regval = ptimer->cnt_ctl;
-		/*
-		 * Set ISTATUS bit if it's expired.
-		 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
-		 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
-		 * regardless of ENABLE bit for our implementation convenience.
-		 */
-		if (ptimer->cnt_cval <= now)
-			p->regval |= ARCH_TIMER_CTRL_IT_STAT;
-	}
+	if (p->is_write)
+		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
+	else
+		p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
 
 	return true;
 }
@@ -883,12 +872,10 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
 		struct sys_reg_params *p,
 		const struct sys_reg_desc *r)
 {
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
 	if (p->is_write)
-		ptimer->cnt_cval = p->regval;
+		kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval);
 	else
-		p->regval = ptimer->cnt_cval;
+		p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
 
 	return true;
 }
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b8d5b8e35244..9a667007bff8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
 			bool writing, bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 			unsigned long *rmap, long pte_index, int realmode);
-extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
+extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+			unsigned long gfn, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 			unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d55c7f881ce7..735cfa35298a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <linux/string.h>
+#include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
@@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
 	hpte[0] = cpu_to_be64(hpte_v);
 }
 
+/*
+ * These functions encode knowledge of the POWER7/8/9 hardware
+ * interpretations of the HPTE LP (large page size) field.
+ */
+static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
+{
+	unsigned int lphi;
+
+	if (!(h & HPTE_V_LARGE))
+		return 12;	/* 4kB */
+	lphi = (l >> 16) & 0xf;
+	switch ((l >> 12) & 0xf) {
+	case 0:
+		return !lphi ? 24 : -1;		/* 16MB */
+		break;
+	case 1:
+		return 16;			/* 64kB */
+		break;
+	case 3:
+		return !lphi ? 34 : -1;		/* 16GB */
+		break;
+	case 7:
+		return (16 << 8) + 12;		/* 64kB in 4kB */
+		break;
+	case 8:
+		if (!lphi)
+			return (24 << 8) + 16;	/* 16MB in 64kkB */
+		if (lphi == 3)
+			return (24 << 8) + 12;	/* 16MB in 4kB */
+		break;
+	}
+	return -1;
+}
+
+static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
+{
+	return kvmppc_hpte_page_shifts(h, l) & 0xff;
+}
+
+static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l)
+{
+	int tmp = kvmppc_hpte_page_shifts(h, l);
+
+	if (tmp >= 0x100)
+		tmp >>= 8;
+	return tmp;
+}
+
+static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
+{
+	return 1ul << kvmppc_hpte_actual_page_shift(v, r);
+}
+
+static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
+{
+	switch (base_shift) {
+	case 12:
+		switch (actual_shift) {
+		case 12:
+			return 0;
+		case 16:
+			return 7;
+		case 24:
+			return 0x38;
+		}
+		break;
+	case 16:
+		switch (actual_shift) {
+		case 16:
+			return 1;
+		case 24:
+			return 8;
+		}
+		break;
+	case 24:
+		return 0;
+	}
+	return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
-	unsigned int penc;
+	int a_pgshift, b_pgshift;
 	unsigned long rb = 0, va_low, sllp;
-	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
-	if (v & HPTE_V_LARGE) {
-		i = hpte_page_sizes[lp];
-		b_psize = i & 0xf;
-		a_psize = i >> 4;
+	b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r);
+	if (a_pgshift >= 0x100) {
+		b_pgshift &= 0xff;
+		a_pgshift >>= 8;
 	}
 
 	/*
@@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
 
-	switch (b_psize) {
-	case MMU_PAGE_4K:
-		sllp = get_sllp_encoding(a_psize);
-		rb |= sllp << 5;	/*  AP field */
+	if (b_pgshift == 12) {
+		if (a_pgshift > 12) {
+			sllp = (a_pgshift == 16) ? 5 : 4;
+			rb |= sllp << 5;	/*  AP field */
+		}
 		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
-		break;
-	default:
-	{
+	} else {
 		int aval_shift;
 		/*
 		 * remaining bits of AVA/LP fields
 		 * Also contain the rr bits of LP
 		 */
-		rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000;
+		rb |= (va_low << b_pgshift) & 0x7ff000;
 		/*
 		 * Now clear not needed LP bits based on actual psize
 		 */
-		rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
+		rb &= ~((1ul << a_pgshift) - 1);
 		/*
 		 * AVAL field 58..77 - base_page_shift bits of va
 		 * we have space for 58..64 bits, Missing bits should
 		 * be zero filled. +1 is to take care of L bit shift
 		 */
-		aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
+		aval_shift = 64 - (77 - b_pgshift) + 1;
 		rb |= ((va_low << aval_shift) & 0xfe);
 
 		rb |= 1;		/* L field */
-		penc = mmu_psize_defs[b_psize].penc[a_psize];
-		rb |= penc << 12;	/* LP field */
-		break;
-	}
+		rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
 	}
 	rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;	/* B field */
 	return rb;
@@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
 	return (1UL << (hpt->order - 7)) - 1;
 }
 
+/* Set bits in a dirty bitmap, which is in LE format */
+static inline void set_dirty_bits(unsigned long *map, unsigned long i,
+				  unsigned long npages)
+{
+
+	if (npages >= 8)
+		memset((char *)map + i / 8, 0xff, npages / 8);
+	else
+		for (; npages; ++i, --npages)
+			__set_bit_le(i, map);
+}
+
+static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i,
+					 unsigned long npages)
+{
+	if (npages >= 8)
+		memset((char *)map + i / 8, 0xff, npages / 8);
+	else
+		for (; npages; ++i, --npages)
+			set_bit_le(i, map);
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
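
The helpers added above fold two page sizes into one return value: when the base and actual page shifts differ, kvmppc_hpte_page_shifts() packs the actual shift into bits 8-15 and the base shift into bits 0-7 (for example (24 << 8) + 16 for a 16MB page mapped with a 64kB base page size), and returns the single shift otherwise. A short illustration built only on the accessors from this hunk; the sample HPTE values are hypothetical.

/* For an HPTE encoding "16MB actual, 64kB base", the helpers decode as: */
static void hpte_size_example(unsigned long hpte_v, unsigned long hpte_r)
{
	int base   = kvmppc_hpte_base_page_shift(hpte_v, hpte_r);	/* 16 */
	int actual = kvmppc_hpte_actual_page_shift(hpte_v, hpte_r);	/* 24 */
	unsigned long bytes = kvmppc_actual_pgsz(hpte_v, hpte_r);	/* 1ul << 24 */

	(void)base; (void)actual; (void)bytes;
}
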
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 7cea76f11c26..ab386af2904f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,6 +82,16 @@ struct kvm_split_mode {
 	u8		do_nap;
 	u8		napped[MAX_SMT_THREADS];
 	struct kvmppc_vcore *vc[MAX_SUBCORES];
+	/* Bits for changing lpcr on P9 */
+	unsigned long	lpcr_req;
+	unsigned long	lpidr_req;
+	unsigned long	host_lpcr;
+	u32		do_set;
+	u32		do_restore;
+	union {
+		u32	allphases;
+		u8	phase[4];
+	} lpcr_sync;
 };
 
 /*
@@ -107,7 +117,8 @@ struct kvmppc_host_state {
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
-	u8 ptid;
+	u8 ptid;		/* thread number within subcore when split */
+	u8 tid;			/* thread number within whole core */
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	void __iomem *xics_phys;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e372ed871c51..3aa5b577cd60 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -235,10 +235,7 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT	63
 #define KVMPPC_RMAP_RC_SHIFT	32
-#define KVMPPC_RMAP_CHG_SHIFT	48
 #define KVMPPC_RMAP_REFERENCED	(HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHANGED	(HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHG_ORDER	(0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT	0x100000000ul
 #define KVMPPC_RMAP_INDEX	0xfffffffful
 
@@ -276,7 +273,7 @@ struct kvm_arch {
 	int tlbie_lock;
 	unsigned long lpcr;
 	unsigned long vrma_slb_v;
-	int hpte_setup_done;
+	int mmu_ready;
 	atomic_t vcpus_running;
 	u32 online_vcores;
 	atomic_t hpte_mod_interest;
@@ -284,6 +281,7 @@ struct kvm_arch {
 	cpumask_t cpu_in_guest;
 	u8 radix;
 	u8 fwnmi_enabled;
+	bool threads_indep;
 	pgd_t *pgtable;
 	u64 process_table;
 	struct dentry *debugfs_dir;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ba5fadd6f3c9..96753f3aac6d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
 extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
 extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
 extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
+extern void kvmppc_rmap_reset(struct kvm *kvm);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 			struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
 			struct iommu_group *grp);
 extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
 			struct iommu_group *grp);
+extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);
+extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9aace433491a..6b958414b4e0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -642,6 +642,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_PTID, ptid);
+	HSTATE_FIELD(HSTATE_TID, tid);
 	HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
 	HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
 	HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
 	OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
 	OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
 	OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+	OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+	OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 59247af5fd45..235319c2574e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -73,8 +73,6 @@ struct kvm_resize_hpt {
73 struct kvm_hpt_info hpt; 73 struct kvm_hpt_info hpt;
74}; 74};
75 75
76static void kvmppc_rmap_reset(struct kvm *kvm);
77
78int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) 76int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
79{ 77{
80 unsigned long hpt = 0; 78 unsigned long hpt = 0;
@@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
106 /* Allocate reverse map array */ 104 /* Allocate reverse map array */
107 rev = vmalloc(sizeof(struct revmap_entry) * npte); 105 rev = vmalloc(sizeof(struct revmap_entry) * npte);
108 if (!rev) { 106 if (!rev) {
109 pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
110 if (cma) 107 if (cma)
111 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); 108 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
112 else 109 else
@@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
137 long err = -EBUSY; 134 long err = -EBUSY;
138 struct kvm_hpt_info info; 135 struct kvm_hpt_info info;
139 136
140 if (kvm_is_radix(kvm))
141 return -EINVAL;
142
143 mutex_lock(&kvm->lock); 137 mutex_lock(&kvm->lock);
144 if (kvm->arch.hpte_setup_done) { 138 if (kvm->arch.mmu_ready) {
145 kvm->arch.hpte_setup_done = 0; 139 kvm->arch.mmu_ready = 0;
146 /* order hpte_setup_done vs. vcpus_running */ 140 /* order mmu_ready vs. vcpus_running */
147 smp_mb(); 141 smp_mb();
148 if (atomic_read(&kvm->arch.vcpus_running)) { 142 if (atomic_read(&kvm->arch.vcpus_running)) {
149 kvm->arch.hpte_setup_done = 1; 143 kvm->arch.mmu_ready = 1;
150 goto out; 144 goto out;
151 } 145 }
152 } 146 }
147 if (kvm_is_radix(kvm)) {
148 err = kvmppc_switch_mmu_to_hpt(kvm);
149 if (err)
150 goto out;
151 }
152
153 if (kvm->arch.hpt.order == order) { 153 if (kvm->arch.hpt.order == order) {
154 /* We already have a suitable HPT */ 154 /* We already have a suitable HPT */
155 155
@@ -183,6 +183,7 @@ out:
183void kvmppc_free_hpt(struct kvm_hpt_info *info) 183void kvmppc_free_hpt(struct kvm_hpt_info *info)
184{ 184{
185 vfree(info->rev); 185 vfree(info->rev);
186 info->rev = NULL;
186 if (info->cma) 187 if (info->cma)
187 kvm_free_hpt_cma(virt_to_page(info->virt), 188 kvm_free_hpt_cma(virt_to_page(info->virt),
188 1 << (info->order - PAGE_SHIFT)); 189 1 << (info->order - PAGE_SHIFT));
@@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
334{ 335{
335 unsigned long ra_mask; 336 unsigned long ra_mask;
336 337
337 ra_mask = hpte_page_size(v, r) - 1; 338 ra_mask = kvmppc_actual_pgsz(v, r) - 1;
338 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); 339 return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
339} 340}
340 341
@@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
350 int index; 351 int index;
351 int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); 352 int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
352 353
354 if (kvm_is_radix(vcpu->kvm))
355 return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
356
353 /* Get SLB entry */ 357 /* Get SLB entry */
354 if (virtmode) { 358 if (virtmode) {
355 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); 359 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
505 mmio_update = atomic64_read(&kvm->arch.mmio_update); 509 mmio_update = atomic64_read(&kvm->arch.mmio_update);
506 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { 510 if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
507 r = vcpu->arch.pgfault_cache->rpte; 511 r = vcpu->arch.pgfault_cache->rpte;
508 psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); 512 psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
513 r);
509 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 514 gpa_base = r & HPTE_R_RPN & ~(psize - 1);
510 gfn_base = gpa_base >> PAGE_SHIFT; 515 gfn_base = gpa_base >> PAGE_SHIFT;
511 gpa = gpa_base | (ea & (psize - 1)); 516 gpa = gpa_base | (ea & (psize - 1));
@@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
534 return RESUME_GUEST; 539 return RESUME_GUEST;
535 540
536 /* Translate the logical address and get the page */ 541 /* Translate the logical address and get the page */
537 psize = hpte_page_size(hpte[0], r); 542 psize = kvmppc_actual_pgsz(hpte[0], r);
538 gpa_base = r & HPTE_R_RPN & ~(psize - 1); 543 gpa_base = r & HPTE_R_RPN & ~(psize - 1);
539 gfn_base = gpa_base >> PAGE_SHIFT; 544 gfn_base = gpa_base >> PAGE_SHIFT;
540 gpa = gpa_base | (ea & (psize - 1)); 545 gpa = gpa_base | (ea & (psize - 1));
@@ -650,10 +655,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
650 /* 655 /*
651 * If the HPT is being resized, don't update the HPTE, 656 * If the HPT is being resized, don't update the HPTE,
652 * instead let the guest retry after the resize operation is complete. 657 * instead let the guest retry after the resize operation is complete.
653 * The synchronization for hpte_setup_done test vs. set is provided 658 * The synchronization for mmu_ready test vs. set is provided
654 * by the HPTE lock. 659 * by the HPTE lock.
655 */ 660 */
656 if (!kvm->arch.hpte_setup_done) 661 if (!kvm->arch.mmu_ready)
657 goto out_unlock; 662 goto out_unlock;
658 663
659 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || 664 if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
@@ -720,7 +725,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
720 goto out_put; 725 goto out_put;
721} 726}
722 727
723static void kvmppc_rmap_reset(struct kvm *kvm) 728void kvmppc_rmap_reset(struct kvm *kvm)
724{ 729{
725 struct kvm_memslots *slots; 730 struct kvm_memslots *slots;
726 struct kvm_memory_slot *memslot; 731 struct kvm_memory_slot *memslot;
@@ -786,6 +791,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
786 791
787/* Must be called with both HPTE and rmap locked */ 792/* Must be called with both HPTE and rmap locked */
788static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, 793static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
794 struct kvm_memory_slot *memslot,
789 unsigned long *rmapp, unsigned long gfn) 795 unsigned long *rmapp, unsigned long gfn)
790{ 796{
791 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); 797 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
@@ -808,7 +814,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
808 814
809 /* Now check and modify the HPTE */ 815 /* Now check and modify the HPTE */
810 ptel = rev[i].guest_rpte; 816 ptel = rev[i].guest_rpte;
811 psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); 817 psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
812 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && 818 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
813 hpte_rpn(ptel, psize) == gfn) { 819 hpte_rpn(ptel, psize) == gfn) {
814 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); 820 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
@@ -817,8 +823,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
817 /* Harvest R and C */ 823 /* Harvest R and C */
818 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); 824 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
819 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 825 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
820 if (rcbits & HPTE_R_C) 826 if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
821 kvmppc_update_rmap_change(rmapp, psize); 827 kvmppc_update_dirty_map(memslot, gfn, psize);
822 if (rcbits & ~rev[i].guest_rpte) { 828 if (rcbits & ~rev[i].guest_rpte) {
823 rev[i].guest_rpte = ptel | rcbits; 829 rev[i].guest_rpte = ptel | rcbits;
824 note_hpte_modification(kvm, &rev[i]); 830 note_hpte_modification(kvm, &rev[i]);
@@ -856,7 +862,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
856 continue; 862 continue;
857 } 863 }
858 864
859 kvmppc_unmap_hpte(kvm, i, rmapp, gfn); 865 kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
860 unlock_rmap(rmapp); 866 unlock_rmap(rmapp);
861 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 867 __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
862 } 868 }
@@ -1039,14 +1045,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1039 1045
1040 retry: 1046 retry:
1041 lock_rmap(rmapp); 1047 lock_rmap(rmapp);
1042 if (*rmapp & KVMPPC_RMAP_CHANGED) {
1043 long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
1044 >> KVMPPC_RMAP_CHG_SHIFT;
1045 *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
1046 npages_dirty = 1;
1047 if (change_order > PAGE_SHIFT)
1048 npages_dirty = 1ul << (change_order - PAGE_SHIFT);
1049 }
1050 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { 1048 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
1051 unlock_rmap(rmapp); 1049 unlock_rmap(rmapp);
1052 return npages_dirty; 1050 return npages_dirty;
@@ -1102,7 +1100,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1102 rev[i].guest_rpte |= HPTE_R_C; 1100 rev[i].guest_rpte |= HPTE_R_C;
1103 note_hpte_modification(kvm, &rev[i]); 1101 note_hpte_modification(kvm, &rev[i]);
1104 } 1102 }
1105 n = hpte_page_size(v, r); 1103 n = kvmppc_actual_pgsz(v, r);
1106 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; 1104 n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
1107 if (n > npages_dirty) 1105 if (n > npages_dirty)
1108 npages_dirty = n; 1106 npages_dirty = n;
@@ -1138,7 +1136,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1138long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, 1136long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
1139 struct kvm_memory_slot *memslot, unsigned long *map) 1137 struct kvm_memory_slot *memslot, unsigned long *map)
1140{ 1138{
1141 unsigned long i, j; 1139 unsigned long i;
1142 unsigned long *rmapp; 1140 unsigned long *rmapp;
1143 1141
1144 preempt_disable(); 1142 preempt_disable();
@@ -1150,9 +1148,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
1150 * since we always put huge-page HPTEs in the rmap chain 1148 * since we always put huge-page HPTEs in the rmap chain
1151 * corresponding to their page base address. 1149 * corresponding to their page base address.
1152 */ 1150 */
1153 if (npages && map) 1151 if (npages)
1154 for (j = i; npages; ++j, --npages) 1152 set_dirty_bits(map, i, npages);
1155 __set_bit_le(j, map);
1156 ++rmapp; 1153 ++rmapp;
1157 } 1154 }
1158 preempt_enable(); 1155 preempt_enable();
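
The per-page __set_bit_le() loop above is replaced by a single call to set_dirty_bits(), a bulk helper introduced elsewhere in this series. As a rough user-space sketch of what such a bulk bit-setter does (the *_sketch names are hypothetical, not the kernel implementation):

#include <string.h>

static void set_bit_le_sketch(unsigned long n, unsigned long *map)
{
	unsigned char *p = (unsigned char *)map;

	p[n / 8] |= 1u << (n % 8);		/* little-endian bit numbering */
}

static void set_dirty_bits_sketch(unsigned long *map, unsigned long i,
				  unsigned long npages)
{
	/* fill whole bytes with memset when the range is byte-aligned */
	if (npages >= 8 && (i % 8) == 0) {
		memset((unsigned char *)map + i / 8, 0xff, npages / 8);
		i += (npages / 8) * 8;
		npages %= 8;
	}
	/* set any remaining bits one at a time */
	for (; npages; ++i, --npages)
		set_bit_le_sketch(i, map);
}
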
@@ -1196,7 +1193,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1196 struct page *page = virt_to_page(va); 1193 struct page *page = virt_to_page(va);
1197 struct kvm_memory_slot *memslot; 1194 struct kvm_memory_slot *memslot;
1198 unsigned long gfn; 1195 unsigned long gfn;
1199 unsigned long *rmap;
1200 int srcu_idx; 1196 int srcu_idx;
1201 1197
1202 put_page(page); 1198 put_page(page);
@@ -1204,20 +1200,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1204 if (!dirty) 1200 if (!dirty)
1205 return; 1201 return;
1206 1202
1207 /* We need to mark this page dirty in the rmap chain */ 1203 /* We need to mark this page dirty in the memslot dirty_bitmap, if any */
1208 gfn = gpa >> PAGE_SHIFT; 1204 gfn = gpa >> PAGE_SHIFT;
1209 srcu_idx = srcu_read_lock(&kvm->srcu); 1205 srcu_idx = srcu_read_lock(&kvm->srcu);
1210 memslot = gfn_to_memslot(kvm, gfn); 1206 memslot = gfn_to_memslot(kvm, gfn);
1211 if (memslot) { 1207 if (memslot && memslot->dirty_bitmap)
1212 if (!kvm_is_radix(kvm)) { 1208 set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
1213 rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
1214 lock_rmap(rmap);
1215 *rmap |= KVMPPC_RMAP_CHANGED;
1216 unlock_rmap(rmap);
1217 } else if (memslot->dirty_bitmap) {
1218 mark_page_dirty(kvm, gfn);
1219 }
1220 }
1221 srcu_read_unlock(&kvm->srcu, srcu_idx); 1209 srcu_read_unlock(&kvm->srcu, srcu_idx);
1222} 1210}
1223 1211
@@ -1277,7 +1265,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1277 guest_rpte = rev->guest_rpte; 1265 guest_rpte = rev->guest_rpte;
1278 1266
1279 ret = -EIO; 1267 ret = -EIO;
1280 apsize = hpte_page_size(vpte, guest_rpte); 1268 apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
1281 if (!apsize) 1269 if (!apsize)
1282 goto out; 1270 goto out;
1283 1271
@@ -1292,7 +1280,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1292 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 1280 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1293 1281
1294 lock_rmap(rmapp); 1282 lock_rmap(rmapp);
1295 kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); 1283 kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
1296 unlock_rmap(rmapp); 1284 unlock_rmap(rmapp);
1297 } 1285 }
1298 1286
@@ -1465,7 +1453,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
1465 struct kvm_resize_hpt *resize; 1453 struct kvm_resize_hpt *resize;
1466 int ret; 1454 int ret;
1467 1455
1468 if (flags != 0) 1456 if (flags != 0 || kvm_is_radix(kvm))
1469 return -EINVAL; 1457 return -EINVAL;
1470 1458
1471 if (shift && ((shift < 18) || (shift > 46))) 1459 if (shift && ((shift < 18) || (shift > 46)))
@@ -1531,7 +1519,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1531 struct kvm_resize_hpt *resize; 1519 struct kvm_resize_hpt *resize;
1532 long ret; 1520 long ret;
1533 1521
1534 if (flags != 0) 1522 if (flags != 0 || kvm_is_radix(kvm))
1535 return -EINVAL; 1523 return -EINVAL;
1536 1524
1537 if (shift && ((shift < 18) || (shift > 46))) 1525 if (shift && ((shift < 18) || (shift > 46)))
@@ -1543,15 +1531,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1543 1531
1544 /* This shouldn't be possible */ 1532 /* This shouldn't be possible */
1545 ret = -EIO; 1533 ret = -EIO;
1546 if (WARN_ON(!kvm->arch.hpte_setup_done)) 1534 if (WARN_ON(!kvm->arch.mmu_ready))
1547 goto out_no_hpt; 1535 goto out_no_hpt;
1548 1536
1549 /* Stop VCPUs from running while we mess with the HPT */ 1537 /* Stop VCPUs from running while we mess with the HPT */
1550 kvm->arch.hpte_setup_done = 0; 1538 kvm->arch.mmu_ready = 0;
1551 smp_mb(); 1539 smp_mb();
1552 1540
1553 /* Boot all CPUs out of the guest so they re-read 1541 /* Boot all CPUs out of the guest so they re-read
1554 * hpte_setup_done */ 1542 * mmu_ready */
1555 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); 1543 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
1556 1544
1557 ret = -ENXIO; 1545 ret = -ENXIO;
@@ -1574,7 +1562,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1574 1562
1575out: 1563out:
1576 /* Let VCPUs run again */ 1564 /* Let VCPUs run again */
1577 kvm->arch.hpte_setup_done = 1; 1565 kvm->arch.mmu_ready = 1;
1578 smp_mb(); 1566 smp_mb();
1579out_no_hpt: 1567out_no_hpt:
1580 resize_hpt_release(kvm, resize); 1568 resize_hpt_release(kvm, resize);
@@ -1717,6 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1717 1705
1718 if (!access_ok(VERIFY_WRITE, buf, count)) 1706 if (!access_ok(VERIFY_WRITE, buf, count))
1719 return -EFAULT; 1707 return -EFAULT;
1708 if (kvm_is_radix(kvm))
1709 return 0;
1720 1710
1721 first_pass = ctx->first_pass; 1711 first_pass = ctx->first_pass;
1722 flags = ctx->flags; 1712 flags = ctx->flags;
@@ -1810,20 +1800,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1810 unsigned long tmp[2]; 1800 unsigned long tmp[2];
1811 ssize_t nb; 1801 ssize_t nb;
1812 long int err, ret; 1802 long int err, ret;
1813 int hpte_setup; 1803 int mmu_ready;
1814 1804
1815 if (!access_ok(VERIFY_READ, buf, count)) 1805 if (!access_ok(VERIFY_READ, buf, count))
1816 return -EFAULT; 1806 return -EFAULT;
1807 if (kvm_is_radix(kvm))
1808 return -EINVAL;
1817 1809
1818 /* lock out vcpus from running while we're doing this */ 1810 /* lock out vcpus from running while we're doing this */
1819 mutex_lock(&kvm->lock); 1811 mutex_lock(&kvm->lock);
1820 hpte_setup = kvm->arch.hpte_setup_done; 1812 mmu_ready = kvm->arch.mmu_ready;
1821 if (hpte_setup) { 1813 if (mmu_ready) {
1822 kvm->arch.hpte_setup_done = 0; /* temporarily */ 1814 kvm->arch.mmu_ready = 0; /* temporarily */
1823 /* order hpte_setup_done vs. vcpus_running */ 1815 /* order mmu_ready vs. vcpus_running */
1824 smp_mb(); 1816 smp_mb();
1825 if (atomic_read(&kvm->arch.vcpus_running)) { 1817 if (atomic_read(&kvm->arch.vcpus_running)) {
1826 kvm->arch.hpte_setup_done = 1; 1818 kvm->arch.mmu_ready = 1;
1827 mutex_unlock(&kvm->lock); 1819 mutex_unlock(&kvm->lock);
1828 return -EBUSY; 1820 return -EBUSY;
1829 } 1821 }
@@ -1876,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1876 "r=%lx\n", ret, i, v, r); 1868 "r=%lx\n", ret, i, v, r);
1877 goto out; 1869 goto out;
1878 } 1870 }
1879 if (!hpte_setup && is_vrma_hpte(v)) { 1871 if (!mmu_ready && is_vrma_hpte(v)) {
1880 unsigned long psize = hpte_base_page_size(v, r); 1872 unsigned long psize = hpte_base_page_size(v, r);
1881 unsigned long senc = slb_pgsize_encoding(psize); 1873 unsigned long senc = slb_pgsize_encoding(psize);
1882 unsigned long lpcr; 1874 unsigned long lpcr;
@@ -1885,7 +1877,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1885 (VRMA_VSID << SLB_VSID_SHIFT_1T); 1877 (VRMA_VSID << SLB_VSID_SHIFT_1T);
1886 lpcr = senc << (LPCR_VRMASD_SH - 4); 1878 lpcr = senc << (LPCR_VRMASD_SH - 4);
1887 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 1879 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
1888 hpte_setup = 1; 1880 mmu_ready = 1;
1889 } 1881 }
1890 ++i; 1882 ++i;
1891 hptp += 2; 1883 hptp += 2;
@@ -1901,9 +1893,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1901 } 1893 }
1902 1894
1903 out: 1895 out:
1904 /* Order HPTE updates vs. hpte_setup_done */ 1896 /* Order HPTE updates vs. mmu_ready */
1905 smp_wmb(); 1897 smp_wmb();
1906 kvm->arch.hpte_setup_done = hpte_setup; 1898 kvm->arch.mmu_ready = mmu_ready;
1907 mutex_unlock(&kvm->lock); 1899 mutex_unlock(&kvm->lock);
1908 1900
1909 if (err) 1901 if (err)
@@ -2012,6 +2004,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
2012 struct kvm *kvm; 2004 struct kvm *kvm;
2013 __be64 *hptp; 2005 __be64 *hptp;
2014 2006
2007 kvm = p->kvm;
2008 if (kvm_is_radix(kvm))
2009 return 0;
2010
2015 ret = mutex_lock_interruptible(&p->mutex); 2011 ret = mutex_lock_interruptible(&p->mutex);
2016 if (ret) 2012 if (ret)
2017 return ret; 2013 return ret;
@@ -2034,7 +2030,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
2034 } 2030 }
2035 } 2031 }
2036 2032
2037 kvm = p->kvm;
2038 i = p->hpt_index; 2033 i = p->hpt_index;
2039 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); 2034 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
2040 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); 2035 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
@@ -2109,10 +2104,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
2109 2104
2110 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ 2105 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
2111 2106
2112 if (kvm_is_radix(vcpu->kvm)) 2107 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
2113 mmu->xlate = kvmppc_mmu_radix_xlate;
2114 else
2115 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
2116 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; 2108 mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
2117 2109
2118 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; 2110 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
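
The hpte_setup_done -> mmu_ready conversion in this file (kvm_htab_write and the HPT-resize paths) keeps the same handshake against guest entry: the updater clears mmu_ready, issues a full barrier, then checks vcpus_running, while the vcpu-run path increments vcpus_running, issues a full barrier, then re-checks mmu_ready, so at least one side always observes the other. A minimal stand-alone sketch of that ordering, using C11 atomics in place of the kernel's smp_mb() (all names hypothetical):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int mmu_ready = 1;
static atomic_int vcpus_running;

/* updater side: HPT resize, kvm_htab_write, kvmhv_configure_mmu analogue */
static bool begin_mmu_update(void)
{
	atomic_store_explicit(&mmu_ready, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() analogue */
	if (atomic_load_explicit(&vcpus_running, memory_order_relaxed)) {
		/* a vcpu may already be entering the guest: back off */
		atomic_store_explicit(&mmu_ready, 1, memory_order_relaxed);
		return false;				/* caller returns -EBUSY */
	}
	return true;					/* safe to rewrite MMU state */
}

/* vcpu entry side */
static bool vcpu_may_enter_guest(void)
{
	atomic_fetch_add_explicit(&vcpus_running, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() analogue */
	return atomic_load_explicit(&mmu_ready, memory_order_relaxed) != 0;
}
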
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index c5d7435455f1..58618f644c56 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
474 return ret; 474 return ret;
475} 475}
476 476
477static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
478 unsigned long gfn, unsigned int order)
479{
480 unsigned long i, limit;
481 unsigned long *dp;
482
483 if (!memslot->dirty_bitmap)
484 return;
485 limit = 1ul << order;
486 if (limit < BITS_PER_LONG) {
487 for (i = 0; i < limit; ++i)
488 mark_page_dirty(kvm, gfn + i);
489 return;
490 }
491 dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
492 limit /= BITS_PER_LONG;
493 for (i = 0; i < limit; ++i)
494 *dp++ = ~0ul;
495}
496
497/* Called with kvm->lock held */ 477/* Called with kvm->lock held */
498int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, 478int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
499 unsigned long gfn) 479 unsigned long gfn)
@@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
508 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, 488 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
509 gpa, shift); 489 gpa, shift);
510 kvmppc_radix_tlbie_page(kvm, gpa, shift); 490 kvmppc_radix_tlbie_page(kvm, gpa, shift);
511 if (old & _PAGE_DIRTY) { 491 if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
512 if (!shift) 492 unsigned long npages = 1;
513 mark_page_dirty(kvm, gfn); 493 if (shift)
514 else 494 npages = 1ul << (shift - PAGE_SHIFT);
515 mark_pages_dirty(kvm, memslot, 495 kvmppc_update_dirty_map(memslot, gfn, npages);
516 gfn, shift - PAGE_SHIFT);
517 } 496 }
518 } 497 }
519 return 0; 498 return 0;
@@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
579 struct kvm_memory_slot *memslot, unsigned long *map) 558 struct kvm_memory_slot *memslot, unsigned long *map)
580{ 559{
581 unsigned long i, j; 560 unsigned long i, j;
582 unsigned long n, *p;
583 int npages; 561 int npages;
584 562
585 /*
586 * Radix accumulates dirty bits in the first half of the
587 * memslot's dirty_bitmap area, for when pages are paged
588 * out or modified by the host directly. Pick up these
589 * bits and add them to the map.
590 */
591 n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
592 p = memslot->dirty_bitmap;
593 for (i = 0; i < n; ++i)
594 map[i] |= xchg(&p[i], 0);
595
596 for (i = 0; i < memslot->npages; i = j) { 563 for (i = 0; i < memslot->npages; i = j) {
597 npages = kvm_radix_test_clear_dirty(kvm, memslot, i); 564 npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
598 565
@@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
604 * real address, if npages > 1 we can skip to i + npages. 571 * real address, if npages > 1 we can skip to i + npages.
605 */ 572 */
606 j = i + 1; 573 j = i + 1;
607 if (npages) 574 if (npages) {
608 for (j = i; npages; ++j, --npages) 575 set_dirty_bits(map, i, npages);
 609 			__set_bit_le(j, map); 576 		j = i + npages;
577 }
610 } 578 }
611 return 0; 579 return 0;
612} 580}
@@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
694 pgd_clear(pgd); 662 pgd_clear(pgd);
695 } 663 }
696 pgd_free(kvm->mm, kvm->arch.pgtable); 664 pgd_free(kvm->mm, kvm->arch.pgtable);
665 kvm->arch.pgtable = NULL;
697} 666}
698 667
699static void pte_ctor(void *addr) 668static void pte_ctor(void *addr)
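
In kvm_unmap_radix() above, the mapping shift is turned into a page count before kvmppc_update_dirty_map() is called; with the 64K PAGE_SHIFT used by these kernels, a 2MB radix leaf mapping (shift 21) covers 1 << (21 - 16) = 32 pages. A quick stand-alone check of that arithmetic (PAGE_SHIFT_SKETCH is an assumed value for illustration, not taken from the kernel headers):

#include <assert.h>

#define PAGE_SHIFT_SKETCH 16		/* 64K base pages */

int main(void)
{
	unsigned int shift = 21;	/* 2MB radix leaf mapping */
	unsigned long npages = 1ul << (shift - PAGE_SHIFT_SKETCH);

	assert(npages == 32);		/* 2MB / 64KB = 32 pages */
	return 0;
}
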
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 3589c4e3d49b..688722acd692 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -113,7 +113,7 @@ slb_do_enter:
113 113
114 /* Remove all SLB entries that are in use. */ 114 /* Remove all SLB entries that are in use. */
115 115
116 li r0, r0 116 li r0, 0
117 slbmte r0, r0 117 slbmte r0, r0
118 slbia 118 slbia
119 119
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 40e5857c4b1c..79ea3d9269db 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/kernel.h>
22#include <linux/err.h> 23#include <linux/err.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/preempt.h> 25#include <linux/preempt.h>
@@ -98,6 +99,10 @@ static int target_smt_mode;
98module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); 99module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
99MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); 100MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
100 101
102static bool indep_threads_mode = true;
103module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
104MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
105
101#ifdef CONFIG_KVM_XICS 106#ifdef CONFIG_KVM_XICS
102static struct kernel_param_ops module_param_ops = { 107static struct kernel_param_ops module_param_ops = {
103 .set = param_set_int, 108 .set = param_set_int,
@@ -115,6 +120,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
115 120
116static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 121static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
117static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 122static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
123static void kvmppc_setup_partition_table(struct kvm *kvm);
118 124
119static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, 125static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
120 int *ip) 126 int *ip)
@@ -1734,9 +1740,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1734 * MMU mode (radix or HPT), unfortunately, but since we only support 1740 * MMU mode (radix or HPT), unfortunately, but since we only support
1735 * HPT guests on a HPT host so far, that isn't an impediment yet. 1741 * HPT guests on a HPT host so far, that isn't an impediment yet.
1736 */ 1742 */
1737static int threads_per_vcore(void) 1743static int threads_per_vcore(struct kvm *kvm)
1738{ 1744{
1739 if (cpu_has_feature(CPU_FTR_ARCH_300)) 1745 if (kvm->arch.threads_indep)
1740 return 1; 1746 return 1;
1741 return threads_per_subcore; 1747 return threads_per_subcore;
1742} 1748}
@@ -1774,7 +1780,7 @@ static struct debugfs_timings_element {
1774 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, 1780 {"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
1775}; 1781};
1776 1782
1777#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) 1783#define N_TIMINGS (ARRAY_SIZE(timings))
1778 1784
1779struct debugfs_timings_state { 1785struct debugfs_timings_state {
1780 struct kvm_vcpu *vcpu; 1786 struct kvm_vcpu *vcpu;
@@ -2228,11 +2234,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
2228 kvmppc_ipi_thread(cpu); 2234 kvmppc_ipi_thread(cpu);
2229} 2235}
2230 2236
2231static void kvmppc_wait_for_nap(void) 2237static void kvmppc_wait_for_nap(int n_threads)
2232{ 2238{
2233 int cpu = smp_processor_id(); 2239 int cpu = smp_processor_id();
2234 int i, loops; 2240 int i, loops;
2235 int n_threads = threads_per_vcore();
2236 2241
2237 if (n_threads <= 1) 2242 if (n_threads <= 1)
2238 return; 2243 return;
@@ -2319,7 +2324,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
2319 2324
2320 vc->vcore_state = VCORE_PREEMPT; 2325 vc->vcore_state = VCORE_PREEMPT;
2321 vc->pcpu = smp_processor_id(); 2326 vc->pcpu = smp_processor_id();
2322 if (vc->num_threads < threads_per_vcore()) { 2327 if (vc->num_threads < threads_per_vcore(vc->kvm)) {
2323 spin_lock(&lp->lock); 2328 spin_lock(&lp->lock);
2324 list_add_tail(&vc->preempt_list, &lp->list); 2329 list_add_tail(&vc->preempt_list, &lp->list);
2325 spin_unlock(&lp->lock); 2330 spin_unlock(&lp->lock);
@@ -2357,7 +2362,7 @@ struct core_info {
2357 2362
2358/* 2363/*
2359 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 2364 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
2360 * respectively in 2-way micro-threading (split-core) mode. 2365 * respectively in 2-way micro-threading (split-core) mode on POWER8.
2361 */ 2366 */
2362static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; 2367static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
2363 2368
@@ -2373,7 +2378,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
2373 2378
2374static bool subcore_config_ok(int n_subcores, int n_threads) 2379static bool subcore_config_ok(int n_subcores, int n_threads)
2375{ 2380{
2376 /* Can only dynamically split if unsplit to begin with */ 2381 /*
2382 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
2383 * mode, with one thread per subcore.
2384 */
2385 if (cpu_has_feature(CPU_FTR_ARCH_300))
2386 return n_subcores <= 4 && n_threads == 1;
2387
2388 /* On POWER8, can only dynamically split if unsplit to begin with */
2377 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) 2389 if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
2378 return false; 2390 return false;
2379 if (n_subcores > MAX_SUBCORES) 2391 if (n_subcores > MAX_SUBCORES)
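
The new POWER9 branch accepts at most four subcores with exactly one thread each, while POWER8 keeps the old dynamic-split rules below. Restated as a stand-alone check (a rewrite for illustration, not the kernel function itself):

#include <assert.h>
#include <stdbool.h>

static bool p9_subcore_config_ok(int n_subcores, int n_threads)
{
	/* POWER9: up to 4 subcores, one thread per subcore */
	return n_subcores <= 4 && n_threads == 1;
}

int main(void)
{
	assert(p9_subcore_config_ok(3, 1));	/* three single-thread vcores: OK */
	assert(!p9_subcore_config_ok(2, 2));	/* SMT-2 subcores: rejected */
	assert(!p9_subcore_config_ok(5, 1));	/* more than four subcores: rejected */
	return 0;
}
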
@@ -2404,6 +2416,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2404 if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 2416 if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2405 return false; 2417 return false;
2406 2418
2419 /* POWER9 currently requires all threads to be in the same MMU mode */
2420 if (cpu_has_feature(CPU_FTR_ARCH_300) &&
2421 kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
2422 return false;
2423
2407 if (n_threads < cip->max_subcore_threads) 2424 if (n_threads < cip->max_subcore_threads)
2408 n_threads = cip->max_subcore_threads; 2425 n_threads = cip->max_subcore_threads;
2409 if (!subcore_config_ok(cip->n_subcores + 1, n_threads)) 2426 if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2632,6 +2649,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2632 int target_threads; 2649 int target_threads;
2633 int controlled_threads; 2650 int controlled_threads;
2634 int trap; 2651 int trap;
2652 bool is_power8;
2653 bool hpt_on_radix;
2635 2654
2636 /* 2655 /*
2637 * Remove from the list any threads that have a signal pending 2656 * Remove from the list any threads that have a signal pending
@@ -2654,15 +2673,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2654 * the number of threads per subcore, except on POWER9, 2673 * the number of threads per subcore, except on POWER9,
2655 * where it's 1 because the threads are (mostly) independent. 2674 * where it's 1 because the threads are (mostly) independent.
2656 */ 2675 */
2657 controlled_threads = threads_per_vcore(); 2676 controlled_threads = threads_per_vcore(vc->kvm);
2658 2677
2659 /* 2678 /*
2660 * Make sure we are running on primary threads, and that secondary 2679 * Make sure we are running on primary threads, and that secondary
2661 * threads are offline. Also check if the number of threads in this 2680 * threads are offline. Also check if the number of threads in this
2662 * guest are greater than the current system threads per guest. 2681 * guest are greater than the current system threads per guest.
2682 * On POWER9, we need to be not in independent-threads mode if
2683 * this is a HPT guest on a radix host.
2663 */ 2684 */
2664 if ((controlled_threads > 1) && 2685 hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
2665 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { 2686 if (((controlled_threads > 1) &&
2687 ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
2688 (hpt_on_radix && vc->kvm->arch.threads_indep)) {
2666 for_each_runnable_thread(i, vcpu, vc) { 2689 for_each_runnable_thread(i, vcpu, vc) {
2667 vcpu->arch.ret = -EBUSY; 2690 vcpu->arch.ret = -EBUSY;
2668 kvmppc_remove_runnable(vc, vcpu); 2691 kvmppc_remove_runnable(vc, vcpu);
@@ -2699,14 +2722,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2699 * Hard-disable interrupts, and check resched flag and signals. 2722 * Hard-disable interrupts, and check resched flag and signals.
2700 * If we need to reschedule or deliver a signal, clean up 2723 * If we need to reschedule or deliver a signal, clean up
2701 * and return without going into the guest(s). 2724 * and return without going into the guest(s).
2702 * If the hpte_setup_done flag has been cleared, don't go into the 2725 * If the mmu_ready flag has been cleared, don't go into the
2703 * guest because that means a HPT resize operation is in progress. 2726 * guest because that means a HPT resize operation is in progress.
2704 */ 2727 */
2705 local_irq_disable(); 2728 local_irq_disable();
2706 hard_irq_disable(); 2729 hard_irq_disable();
2707 if (lazy_irq_pending() || need_resched() || 2730 if (lazy_irq_pending() || need_resched() ||
2708 recheck_signals(&core_info) || 2731 recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
2709 (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) {
2710 local_irq_enable(); 2732 local_irq_enable();
2711 vc->vcore_state = VCORE_INACTIVE; 2733 vc->vcore_state = VCORE_INACTIVE;
2712 /* Unlock all except the primary vcore */ 2734 /* Unlock all except the primary vcore */
@@ -2728,32 +2750,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2728 cmd_bit = stat_bit = 0; 2750 cmd_bit = stat_bit = 0;
2729 split = core_info.n_subcores; 2751 split = core_info.n_subcores;
2730 sip = NULL; 2752 sip = NULL;
2731 if (split > 1) { 2753 is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
2732 /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ 2754 && !cpu_has_feature(CPU_FTR_ARCH_300);
2733 if (split == 2 && (dynamic_mt_modes & 2)) { 2755
2734 cmd_bit = HID0_POWER8_1TO2LPAR; 2756 if (split > 1 || hpt_on_radix) {
2735 stat_bit = HID0_POWER8_2LPARMODE;
2736 } else {
2737 split = 4;
2738 cmd_bit = HID0_POWER8_1TO4LPAR;
2739 stat_bit = HID0_POWER8_4LPARMODE;
2740 }
2741 subcore_size = MAX_SMT_THREADS / split;
2742 sip = &split_info; 2757 sip = &split_info;
2743 memset(&split_info, 0, sizeof(split_info)); 2758 memset(&split_info, 0, sizeof(split_info));
2744 split_info.rpr = mfspr(SPRN_RPR);
2745 split_info.pmmar = mfspr(SPRN_PMMAR);
2746 split_info.ldbar = mfspr(SPRN_LDBAR);
2747 split_info.subcore_size = subcore_size;
2748 for (sub = 0; sub < core_info.n_subcores; ++sub) 2759 for (sub = 0; sub < core_info.n_subcores; ++sub)
2749 split_info.vc[sub] = core_info.vc[sub]; 2760 split_info.vc[sub] = core_info.vc[sub];
2761
2762 if (is_power8) {
2763 if (split == 2 && (dynamic_mt_modes & 2)) {
2764 cmd_bit = HID0_POWER8_1TO2LPAR;
2765 stat_bit = HID0_POWER8_2LPARMODE;
2766 } else {
2767 split = 4;
2768 cmd_bit = HID0_POWER8_1TO4LPAR;
2769 stat_bit = HID0_POWER8_4LPARMODE;
2770 }
2771 subcore_size = MAX_SMT_THREADS / split;
2772 split_info.rpr = mfspr(SPRN_RPR);
2773 split_info.pmmar = mfspr(SPRN_PMMAR);
2774 split_info.ldbar = mfspr(SPRN_LDBAR);
2775 split_info.subcore_size = subcore_size;
2776 } else {
2777 split_info.subcore_size = 1;
2778 if (hpt_on_radix) {
2779 /* Use the split_info for LPCR/LPIDR changes */
2780 split_info.lpcr_req = vc->lpcr;
2781 split_info.lpidr_req = vc->kvm->arch.lpid;
2782 split_info.host_lpcr = vc->kvm->arch.host_lpcr;
2783 split_info.do_set = 1;
2784 }
2785 }
2786
2750 /* order writes to split_info before kvm_split_mode pointer */ 2787 /* order writes to split_info before kvm_split_mode pointer */
2751 smp_wmb(); 2788 smp_wmb();
2752 } 2789 }
2753 for (thr = 0; thr < controlled_threads; ++thr) 2790
2791 for (thr = 0; thr < controlled_threads; ++thr) {
2792 paca[pcpu + thr].kvm_hstate.tid = thr;
2793 paca[pcpu + thr].kvm_hstate.napping = 0;
2754 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2794 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
2795 }
2755 2796
2756 /* Initiate micro-threading (split-core) if required */ 2797 /* Initiate micro-threading (split-core) on POWER8 if required */
2757 if (cmd_bit) { 2798 if (cmd_bit) {
2758 unsigned long hid0 = mfspr(SPRN_HID0); 2799 unsigned long hid0 = mfspr(SPRN_HID0);
2759 2800
@@ -2772,7 +2813,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2772 /* Start all the threads */ 2813 /* Start all the threads */
2773 active = 0; 2814 active = 0;
2774 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2815 for (sub = 0; sub < core_info.n_subcores; ++sub) {
2775 thr = subcore_thread_map[sub]; 2816 thr = is_power8 ? subcore_thread_map[sub] : sub;
2776 thr0_done = false; 2817 thr0_done = false;
2777 active |= 1 << thr; 2818 active |= 1 << thr;
2778 pvc = core_info.vc[sub]; 2819 pvc = core_info.vc[sub];
@@ -2799,18 +2840,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2799 * the vcore pointer in the PACA of the secondaries. 2840 * the vcore pointer in the PACA of the secondaries.
2800 */ 2841 */
2801 smp_mb(); 2842 smp_mb();
2802 if (cmd_bit)
2803 split_info.do_nap = 1; /* ask secondaries to nap when done */
2804 2843
2805 /* 2844 /*
2806 * When doing micro-threading, poke the inactive threads as well. 2845 * When doing micro-threading, poke the inactive threads as well.
2807 * This gets them to the nap instruction after kvm_do_nap, 2846 * This gets them to the nap instruction after kvm_do_nap,
2808 * which reduces the time taken to unsplit later. 2847 * which reduces the time taken to unsplit later.
2848 * For POWER9 HPT guest on radix host, we need all the secondary
2849 * threads woken up so they can do the LPCR/LPIDR change.
2809 */ 2850 */
2810 if (split > 1) 2851 if (cmd_bit || hpt_on_radix) {
2852 split_info.do_nap = 1; /* ask secondaries to nap when done */
2811 for (thr = 1; thr < threads_per_subcore; ++thr) 2853 for (thr = 1; thr < threads_per_subcore; ++thr)
2812 if (!(active & (1 << thr))) 2854 if (!(active & (1 << thr)))
2813 kvmppc_ipi_thread(pcpu + thr); 2855 kvmppc_ipi_thread(pcpu + thr);
2856 }
2814 2857
2815 vc->vcore_state = VCORE_RUNNING; 2858 vc->vcore_state = VCORE_RUNNING;
2816 preempt_disable(); 2859 preempt_disable();
@@ -2844,10 +2887,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2844 vc->vcore_state = VCORE_EXITING; 2887 vc->vcore_state = VCORE_EXITING;
2845 2888
2846 /* wait for secondary threads to finish writing their state to memory */ 2889 /* wait for secondary threads to finish writing their state to memory */
2847 kvmppc_wait_for_nap(); 2890 kvmppc_wait_for_nap(controlled_threads);
2848 2891
2849 /* Return to whole-core mode if we split the core earlier */ 2892 /* Return to whole-core mode if we split the core earlier */
2850 if (split > 1) { 2893 if (cmd_bit) {
2851 unsigned long hid0 = mfspr(SPRN_HID0); 2894 unsigned long hid0 = mfspr(SPRN_HID0);
2852 unsigned long loops = 0; 2895 unsigned long loops = 0;
2853 2896
@@ -2863,8 +2906,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2863 cpu_relax(); 2906 cpu_relax();
2864 ++loops; 2907 ++loops;
2865 } 2908 }
2866 split_info.do_nap = 0; 2909 } else if (hpt_on_radix) {
2910 /* Wait for all threads to have seen final sync */
2911 for (thr = 1; thr < controlled_threads; ++thr) {
2912 while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
2913 HMT_low();
2914 barrier();
2915 }
2916 HMT_medium();
2917 }
2867 } 2918 }
2919 split_info.do_nap = 0;
2868 2920
2869 kvmppc_set_host_core(pcpu); 2921 kvmppc_set_host_core(pcpu);
2870 2922
@@ -3073,6 +3125,25 @@ out:
3073 trace_kvmppc_vcore_wakeup(do_sleep, block_ns); 3125 trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
3074} 3126}
3075 3127
3128static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
3129{
3130 int r = 0;
3131 struct kvm *kvm = vcpu->kvm;
3132
3133 mutex_lock(&kvm->lock);
3134 if (!kvm->arch.mmu_ready) {
3135 if (!kvm_is_radix(kvm))
3136 r = kvmppc_hv_setup_htab_rma(vcpu);
3137 if (!r) {
3138 if (cpu_has_feature(CPU_FTR_ARCH_300))
3139 kvmppc_setup_partition_table(kvm);
3140 kvm->arch.mmu_ready = 1;
3141 }
3142 }
3143 mutex_unlock(&kvm->lock);
3144 return r;
3145}
3146
3076static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3147static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3077{ 3148{
3078 int n_ceded, i, r; 3149 int n_ceded, i, r;
@@ -3129,15 +3200,15 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3129 3200
3130 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 3201 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
3131 !signal_pending(current)) { 3202 !signal_pending(current)) {
3132 /* See if the HPT and VRMA are ready to go */ 3203 /* See if the MMU is ready to go */
3133 if (!kvm_is_radix(vcpu->kvm) && 3204 if (!vcpu->kvm->arch.mmu_ready) {
3134 !vcpu->kvm->arch.hpte_setup_done) {
3135 spin_unlock(&vc->lock); 3205 spin_unlock(&vc->lock);
3136 r = kvmppc_hv_setup_htab_rma(vcpu); 3206 r = kvmhv_setup_mmu(vcpu);
3137 spin_lock(&vc->lock); 3207 spin_lock(&vc->lock);
3138 if (r) { 3208 if (r) {
3139 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3209 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3140 kvm_run->fail_entry.hardware_entry_failure_reason = 0; 3210 kvm_run->fail_entry.
3211 hardware_entry_failure_reason = 0;
3141 vcpu->arch.ret = r; 3212 vcpu->arch.ret = r;
3142 break; 3213 break;
3143 } 3214 }
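
kvmhv_setup_mmu() added above is a set-up-once pattern: the first vcpu to find mmu_ready clear performs the HPT/partition-table setup under kvm->lock, and later vcpus simply see mmu_ready set and skip it. A user-space sketch of the same shape, with a pthread mutex standing in for kvm->lock (all names hypothetical):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;
static bool mmu_ready;

static int setup_mmu_once(int (*do_setup)(void))
{
	int r = 0;

	pthread_mutex_lock(&vm_lock);
	if (!mmu_ready) {
		r = do_setup();		/* e.g. build the HPT and VRMA */
		if (!r)
			mmu_ready = true;
	}
	pthread_mutex_unlock(&vm_lock);
	return r;
}
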
@@ -3219,6 +3290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3219 unsigned long ebb_regs[3] = {}; /* shut up GCC */ 3290 unsigned long ebb_regs[3] = {}; /* shut up GCC */
3220 unsigned long user_tar = 0; 3291 unsigned long user_tar = 0;
3221 unsigned int user_vrsave; 3292 unsigned int user_vrsave;
3293 struct kvm *kvm;
3222 3294
3223 if (!vcpu->arch.sane) { 3295 if (!vcpu->arch.sane) {
3224 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3296 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3256,8 +3328,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3256 return -EINTR; 3328 return -EINTR;
3257 } 3329 }
3258 3330
3259 atomic_inc(&vcpu->kvm->arch.vcpus_running); 3331 kvm = vcpu->kvm;
3260 /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ 3332 atomic_inc(&kvm->arch.vcpus_running);
3333 /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
3261 smp_mb(); 3334 smp_mb();
3262 3335
3263 flush_all_to_thread(current); 3336 flush_all_to_thread(current);
@@ -3285,10 +3358,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3285 trace_kvm_hcall_exit(vcpu, r); 3358 trace_kvm_hcall_exit(vcpu, r);
3286 kvmppc_core_prepare_to_enter(vcpu); 3359 kvmppc_core_prepare_to_enter(vcpu);
3287 } else if (r == RESUME_PAGE_FAULT) { 3360 } else if (r == RESUME_PAGE_FAULT) {
3288 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 3361 srcu_idx = srcu_read_lock(&kvm->srcu);
3289 r = kvmppc_book3s_hv_page_fault(run, vcpu, 3362 r = kvmppc_book3s_hv_page_fault(run, vcpu,
3290 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 3363 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
3291 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 3364 srcu_read_unlock(&kvm->srcu, srcu_idx);
3292 } else if (r == RESUME_PASSTHROUGH) { 3365 } else if (r == RESUME_PASSTHROUGH) {
3293 if (WARN_ON(xive_enabled())) 3366 if (WARN_ON(xive_enabled()))
3294 r = H_SUCCESS; 3367 r = H_SUCCESS;
@@ -3308,27 +3381,26 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
3308 mtspr(SPRN_VRSAVE, user_vrsave); 3381 mtspr(SPRN_VRSAVE, user_vrsave);
3309 3382
3310 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 3383 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
3311 atomic_dec(&vcpu->kvm->arch.vcpus_running); 3384 atomic_dec(&kvm->arch.vcpus_running);
3312 return r; 3385 return r;
3313} 3386}
3314 3387
3315static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, 3388static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
3316 int linux_psize) 3389 int shift, int sllp)
3317{ 3390{
3318 struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; 3391 (*sps)->page_shift = shift;
3319 3392 (*sps)->slb_enc = sllp;
3320 if (!def->shift) 3393 (*sps)->enc[0].page_shift = shift;
3321 return; 3394 (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
3322 (*sps)->page_shift = def->shift;
3323 (*sps)->slb_enc = def->sllp;
3324 (*sps)->enc[0].page_shift = def->shift;
3325 (*sps)->enc[0].pte_enc = def->penc[linux_psize];
3326 /* 3395 /*
3327 * Add 16MB MPSS support if host supports it 3396 * Add 16MB MPSS support (may get filtered out by userspace)
3328 */ 3397 */
3329 if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { 3398 if (shift != 24) {
3330 (*sps)->enc[1].page_shift = 24; 3399 int penc = kvmppc_pgsize_lp_encoding(shift, 24);
3331 (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; 3400 if (penc != -1) {
3401 (*sps)->enc[1].page_shift = 24;
3402 (*sps)->enc[1].pte_enc = penc;
3403 }
3332 } 3404 }
3333 (*sps)++; 3405 (*sps)++;
3334} 3406}
@@ -3339,13 +3411,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
3339 struct kvm_ppc_one_seg_page_size *sps; 3411 struct kvm_ppc_one_seg_page_size *sps;
3340 3412
3341 /* 3413 /*
3342 * Since we don't yet support HPT guests on a radix host,
3343 * return an error if the host uses radix.
3344 */
3345 if (radix_enabled())
3346 return -EINVAL;
3347
3348 /*
3349 * POWER7, POWER8 and POWER9 all support 32 storage keys for data. 3414 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
3350 * POWER7 doesn't support keys for instruction accesses, 3415 * POWER7 doesn't support keys for instruction accesses,
3351 * POWER8 and POWER9 do. 3416 * POWER8 and POWER9 do.
@@ -3353,16 +3418,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
3353 info->data_keys = 32; 3418 info->data_keys = 32;
3354 info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0; 3419 info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
3355 3420
3356 info->flags = KVM_PPC_PAGE_SIZES_REAL; 3421 /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
3357 if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) 3422 info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
3358 info->flags |= KVM_PPC_1T_SEGMENTS; 3423 info->slb_size = 32;
3359 info->slb_size = mmu_slb_size;
3360 3424
3361 /* We only support these sizes for now, and no muti-size segments */ 3425 /* We only support these sizes for now, and no muti-size segments */
3362 sps = &info->sps[0]; 3426 sps = &info->sps[0];
3363 kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); 3427 kvmppc_add_seg_page_size(&sps, 12, 0);
3364 kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); 3428 kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
3365 kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); 3429 kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
3366 3430
3367 return 0; 3431 return 0;
3368} 3432}
@@ -3377,7 +3441,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
3377 struct kvm_memory_slot *memslot; 3441 struct kvm_memory_slot *memslot;
3378 int i, r; 3442 int i, r;
3379 unsigned long n; 3443 unsigned long n;
3380 unsigned long *buf; 3444 unsigned long *buf, *p;
3381 struct kvm_vcpu *vcpu; 3445 struct kvm_vcpu *vcpu;
3382 3446
3383 mutex_lock(&kvm->slots_lock); 3447 mutex_lock(&kvm->slots_lock);
@@ -3393,8 +3457,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
3393 goto out; 3457 goto out;
3394 3458
3395 /* 3459 /*
3396 * Use second half of bitmap area because radix accumulates 3460 * Use second half of bitmap area because both HPT and radix
3397 * bits in the first half. 3461 * accumulate bits in the first half.
3398 */ 3462 */
3399 n = kvm_dirty_bitmap_bytes(memslot); 3463 n = kvm_dirty_bitmap_bytes(memslot);
3400 buf = memslot->dirty_bitmap + n / sizeof(long); 3464 buf = memslot->dirty_bitmap + n / sizeof(long);
@@ -3407,6 +3471,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
3407 if (r) 3471 if (r)
3408 goto out; 3472 goto out;
3409 3473
3474 /*
3475 * We accumulate dirty bits in the first half of the
3476 * memslot's dirty_bitmap area, for when pages are paged
3477 * out or modified by the host directly. Pick up these
3478 * bits and add them to the map.
3479 */
3480 p = memslot->dirty_bitmap;
3481 for (i = 0; i < n / sizeof(long); ++i)
3482 buf[i] |= xchg(&p[i], 0);
3483
3410 /* Harvest dirty bits from VPA and DTL updates */ 3484 /* Harvest dirty bits from VPA and DTL updates */
3411 /* Note: we never modify the SLB shadow buffer areas */ 3485 /* Note: we never modify the SLB shadow buffer areas */
3412 kvm_for_each_vcpu(i, vcpu, kvm) { 3486 kvm_for_each_vcpu(i, vcpu, kvm) {
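
As the new comment says, dirty bits accumulate in the first half of the memslot's dirty_bitmap while the second half is the harvest buffer handed to kvm_get_dirty_log(); the xchg() both collects and clears the accumulated bits in one step. A user-space analogue of that harvest step (illustrative only):

#include <stdatomic.h>

/* accum: first half of the bitmap, written as pages are dirtied
 * buf:   second half, returned to userspace for this ioctl
 */
static void harvest_dirty(atomic_ulong *accum, unsigned long *buf,
			  unsigned long nlongs)
{
	unsigned long i;

	for (i = 0; i < nlongs; ++i)
		buf[i] |= atomic_exchange(&accum[i], 0);
}
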
@@ -3438,15 +3512,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
3438static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, 3512static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
3439 unsigned long npages) 3513 unsigned long npages)
3440{ 3514{
3441 /*
3442 * For now, if radix_enabled() then we only support radix guests,
3443 * and in that case we don't need the rmap array.
3444 */
3445 if (radix_enabled()) {
3446 slot->arch.rmap = NULL;
3447 return 0;
3448 }
3449
3450 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 3515 slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
3451 if (!slot->arch.rmap) 3516 if (!slot->arch.rmap)
3452 return -ENOMEM; 3517 return -ENOMEM;
@@ -3467,8 +3532,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
3467 const struct kvm_memory_slot *new) 3532 const struct kvm_memory_slot *new)
3468{ 3533{
3469 unsigned long npages = mem->memory_size >> PAGE_SHIFT; 3534 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
3470 struct kvm_memslots *slots;
3471 struct kvm_memory_slot *memslot;
3472 3535
3473 /* 3536 /*
3474 * If we are making a new memslot, it might make 3537 * If we are making a new memslot, it might make
@@ -3478,18 +3541,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
3478 */ 3541 */
3479 if (npages) 3542 if (npages)
3480 atomic64_inc(&kvm->arch.mmio_update); 3543 atomic64_inc(&kvm->arch.mmio_update);
3481
3482 if (npages && old->npages && !kvm_is_radix(kvm)) {
3483 /*
3484 * If modifying a memslot, reset all the rmap dirty bits.
3485 * If this is a new memslot, we don't need to do anything
3486 * since the rmap array starts out as all zeroes,
3487 * i.e. no pages are dirty.
3488 */
3489 slots = kvm_memslots(kvm);
3490 memslot = id_to_memslot(slots, mem->slot);
3491 kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
3492 }
3493} 3544}
3494 3545
3495/* 3546/*
@@ -3545,6 +3596,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
3545 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); 3596 mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
3546} 3597}
3547 3598
3599/*
3600 * Set up HPT (hashed page table) and RMA (real-mode area).
3601 * Must be called with kvm->lock held.
3602 */
3548static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) 3603static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3549{ 3604{
3550 int err = 0; 3605 int err = 0;
@@ -3556,10 +3611,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3556 unsigned long psize, porder; 3611 unsigned long psize, porder;
3557 int srcu_idx; 3612 int srcu_idx;
3558 3613
3559 mutex_lock(&kvm->lock);
3560 if (kvm->arch.hpte_setup_done)
3561 goto out; /* another vcpu beat us to it */
3562
3563 /* Allocate hashed page table (if not done already) and reset it */ 3614 /* Allocate hashed page table (if not done already) and reset it */
3564 if (!kvm->arch.hpt.virt) { 3615 if (!kvm->arch.hpt.virt) {
3565 int order = KVM_DEFAULT_HPT_ORDER; 3616 int order = KVM_DEFAULT_HPT_ORDER;
@@ -3618,18 +3669,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3618 /* the -4 is to account for senc values starting at 0x10 */ 3669 /* the -4 is to account for senc values starting at 0x10 */
3619 lpcr = senc << (LPCR_VRMASD_SH - 4); 3670 lpcr = senc << (LPCR_VRMASD_SH - 4);
3620 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); 3671 kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
3621 } else {
3622 kvmppc_setup_partition_table(kvm);
3623 } 3672 }
3624 3673
3625 /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ 3674 /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
3626 smp_wmb(); 3675 smp_wmb();
3627 kvm->arch.hpte_setup_done = 1;
3628 err = 0; 3676 err = 0;
3629 out_srcu: 3677 out_srcu:
3630 srcu_read_unlock(&kvm->srcu, srcu_idx); 3678 srcu_read_unlock(&kvm->srcu, srcu_idx);
3631 out: 3679 out:
3632 mutex_unlock(&kvm->lock);
3633 return err; 3680 return err;
3634 3681
3635 up_out: 3682 up_out:
@@ -3637,6 +3684,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3637 goto out_srcu; 3684 goto out_srcu;
3638} 3685}
3639 3686
3687/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
3688int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
3689{
3690 kvmppc_free_radix(kvm);
3691 kvmppc_update_lpcr(kvm, LPCR_VPM1,
3692 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
3693 kvmppc_rmap_reset(kvm);
3694 kvm->arch.radix = 0;
3695 kvm->arch.process_table = 0;
3696 return 0;
3697}
3698
3699/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
3700int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
3701{
3702 int err;
3703
3704 err = kvmppc_init_vm_radix(kvm);
3705 if (err)
3706 return err;
3707
3708 kvmppc_free_hpt(&kvm->arch.hpt);
3709 kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
3710 LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
3711 kvm->arch.radix = 1;
3712 return 0;
3713}
3714
3640#ifdef CONFIG_KVM_XICS 3715#ifdef CONFIG_KVM_XICS
3641/* 3716/*
3642 * Allocate a per-core structure for managing state about which cores are 3717 * Allocate a per-core structure for managing state about which cores are
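
Both switch functions adjust the guest LPCR through kvmppc_update_lpcr(kvm, value, mask), which (roughly) rewrites only the bits covered by the mask: switching to HPT sets VPM1 and clears UPRT/GTSE/HR, switching to radix does the opposite. A hypothetical one-liner for that style of masked update:

/* only bits in 'mask' change; within the mask they take 'value' */
static unsigned long update_masked(unsigned long cur,
				   unsigned long value, unsigned long mask)
{
	return (cur & ~mask) | (value & mask);
}
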
@@ -3780,10 +3855,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3780 } 3855 }
3781 3856
3782 /* 3857 /*
3783 * For now, if the host uses radix, the guest must be radix. 3858 * If the host uses radix, the guest starts out as radix.
3784 */ 3859 */
3785 if (radix_enabled()) { 3860 if (radix_enabled()) {
3786 kvm->arch.radix = 1; 3861 kvm->arch.radix = 1;
3862 kvm->arch.mmu_ready = 1;
3787 lpcr &= ~LPCR_VPM1; 3863 lpcr &= ~LPCR_VPM1;
3788 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; 3864 lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
3789 ret = kvmppc_init_vm_radix(kvm); 3865 ret = kvmppc_init_vm_radix(kvm);
@@ -3803,7 +3879,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3803 * Work out how many sets the TLB has, for the use of 3879 * Work out how many sets the TLB has, for the use of
3804 * the TLB invalidation loop in book3s_hv_rmhandlers.S. 3880 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
3805 */ 3881 */
3806 if (kvm_is_radix(kvm)) 3882 if (radix_enabled())
3807 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ 3883 kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
3808 else if (cpu_has_feature(CPU_FTR_ARCH_300)) 3884 else if (cpu_has_feature(CPU_FTR_ARCH_300))
3809 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ 3885 kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
@@ -3815,10 +3891,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3815 /* 3891 /*
3816 * Track that we now have a HV mode VM active. This blocks secondary 3892 * Track that we now have a HV mode VM active. This blocks secondary
3817 * CPU threads from coming online. 3893 * CPU threads from coming online.
3818 * On POWER9, we only need to do this for HPT guests on a radix 3894 * On POWER9, we only need to do this if the "indep_threads_mode"
3819 * host, which is not yet supported. 3895 * module parameter has been set to N.
3820 */ 3896 */
3821 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3897 if (cpu_has_feature(CPU_FTR_ARCH_300))
3898 kvm->arch.threads_indep = indep_threads_mode;
3899 if (!kvm->arch.threads_indep)
3822 kvm_hv_vm_activated(); 3900 kvm_hv_vm_activated();
3823 3901
3824 /* 3902 /*
@@ -3858,7 +3936,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
3858{ 3936{
3859 debugfs_remove_recursive(kvm->arch.debugfs_dir); 3937 debugfs_remove_recursive(kvm->arch.debugfs_dir);
3860 3938
3861 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 3939 if (!kvm->arch.threads_indep)
3862 kvm_hv_vm_deactivated(); 3940 kvm_hv_vm_deactivated();
3863 3941
3864 kvmppc_free_vcores(kvm); 3942 kvmppc_free_vcores(kvm);
@@ -4193,6 +4271,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4193{ 4271{
4194 unsigned long lpcr; 4272 unsigned long lpcr;
4195 int radix; 4273 int radix;
4274 int err;
4196 4275
4197 /* If not on a POWER9, reject it */ 4276 /* If not on a POWER9, reject it */
4198 if (!cpu_has_feature(CPU_FTR_ARCH_300)) 4277 if (!cpu_has_feature(CPU_FTR_ARCH_300))
@@ -4202,12 +4281,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4202 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) 4281 if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
4203 return -EINVAL; 4282 return -EINVAL;
4204 4283
4205 /* We can't change a guest to/from radix yet */
4206 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
4207 if (radix != kvm_is_radix(kvm))
4208 return -EINVAL;
4209
4210 /* GR (guest radix) bit in process_table field must match */ 4284 /* GR (guest radix) bit in process_table field must match */
4285 radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
4211 if (!!(cfg->process_table & PATB_GR) != radix) 4286 if (!!(cfg->process_table & PATB_GR) != radix)
4212 return -EINVAL; 4287 return -EINVAL;
4213 4288
@@ -4215,15 +4290,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
4215 if ((cfg->process_table & PRTS_MASK) > 24) 4290 if ((cfg->process_table & PRTS_MASK) > 24)
4216 return -EINVAL; 4291 return -EINVAL;
4217 4292
4293 /* We can change a guest to/from radix now, if the host is radix */
4294 if (radix && !radix_enabled())
4295 return -EINVAL;
4296
4218 mutex_lock(&kvm->lock); 4297 mutex_lock(&kvm->lock);
4298 if (radix != kvm_is_radix(kvm)) {
4299 if (kvm->arch.mmu_ready) {
4300 kvm->arch.mmu_ready = 0;
4301 /* order mmu_ready vs. vcpus_running */
4302 smp_mb();
4303 if (atomic_read(&kvm->arch.vcpus_running)) {
4304 kvm->arch.mmu_ready = 1;
4305 err = -EBUSY;
4306 goto out_unlock;
4307 }
4308 }
4309 if (radix)
4310 err = kvmppc_switch_mmu_to_radix(kvm);
4311 else
4312 err = kvmppc_switch_mmu_to_hpt(kvm);
4313 if (err)
4314 goto out_unlock;
4315 }
4316
4219 kvm->arch.process_table = cfg->process_table; 4317 kvm->arch.process_table = cfg->process_table;
4220 kvmppc_setup_partition_table(kvm); 4318 kvmppc_setup_partition_table(kvm);
4221 4319
4222 lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; 4320 lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
4223 kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); 4321 kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
4224 mutex_unlock(&kvm->lock); 4322 err = 0;
4225 4323
4226 return 0; 4324 out_unlock:
4325 mutex_unlock(&kvm->lock);
4326 return err;
4227} 4327}
4228 4328
4229static struct kvmppc_ops kvm_ops_hv = { 4329static struct kvmppc_ops kvm_ops_hv = {
@@ -4365,4 +4465,3 @@ module_exit(kvmppc_book3s_exit_hv);
4365MODULE_LICENSE("GPL"); 4465MODULE_LICENSE("GPL");
4366MODULE_ALIAS_MISCDEV(KVM_MINOR); 4466MODULE_ALIAS_MISCDEV(KVM_MINOR);
4367MODULE_ALIAS("devname:kvm"); 4467MODULE_ALIAS("devname:kvm");
4368
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 90644db9d38e..49a2c7825e04 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
278 struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; 278 struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
279 int ptid = local_paca->kvm_hstate.ptid; 279 int ptid = local_paca->kvm_hstate.ptid;
280 struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; 280 struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
281 int me, ee, i; 281 int me, ee, i, t;
282 int cpu0;
282 283
283 /* Set our bit in the threads-exiting-guest map in the 0xff00 284 /* Set our bit in the threads-exiting-guest map in the 0xff00
284 bits of vcore->entry_exit_map */ 285 bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
320 if ((ee >> 8) == 0) 321 if ((ee >> 8) == 0)
321 kvmhv_interrupt_vcore(vc, ee); 322 kvmhv_interrupt_vcore(vc, ee);
322 } 323 }
324
325 /*
326 * On POWER9 when running a HPT guest on a radix host (sip != NULL),
327 * we have to interrupt inactive CPU threads to get them to
328 * restore the host LPCR value.
329 */
330 if (sip->lpcr_req) {
331 if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
332 vc = local_paca->kvm_hstate.kvm_vcore;
333 cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
334 for (t = 1; t < threads_per_core; ++t) {
335 if (sip->napped[t])
336 kvmhv_rm_send_ipi(cpu0 + t);
337 }
338 }
339 }
323} 340}
324 341
325struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; 342struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -529,6 +546,8 @@ static inline bool is_rm(void)
529 546
530unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) 547unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
531{ 548{
549 if (!kvmppc_xics_enabled(vcpu))
550 return H_TOO_HARD;
532 if (xive_enabled()) { 551 if (xive_enabled()) {
533 if (is_rm()) 552 if (is_rm())
534 return xive_rm_h_xirr(vcpu); 553 return xive_rm_h_xirr(vcpu);
@@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
541 560
542unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) 561unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
543{ 562{
563 if (!kvmppc_xics_enabled(vcpu))
564 return H_TOO_HARD;
544 vcpu->arch.gpr[5] = get_tb(); 565 vcpu->arch.gpr[5] = get_tb();
545 if (xive_enabled()) { 566 if (xive_enabled()) {
546 if (is_rm()) 567 if (is_rm())
@@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
554 575
555unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) 576unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
556{ 577{
578 if (!kvmppc_xics_enabled(vcpu))
579 return H_TOO_HARD;
557 if (xive_enabled()) { 580 if (xive_enabled()) {
558 if (is_rm()) 581 if (is_rm())
559 return xive_rm_h_ipoll(vcpu, server); 582 return xive_rm_h_ipoll(vcpu, server);
@@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
567int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, 590int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
568 unsigned long mfrr) 591 unsigned long mfrr)
569{ 592{
593 if (!kvmppc_xics_enabled(vcpu))
594 return H_TOO_HARD;
570 if (xive_enabled()) { 595 if (xive_enabled()) {
571 if (is_rm()) 596 if (is_rm())
572 return xive_rm_h_ipi(vcpu, server, mfrr); 597 return xive_rm_h_ipi(vcpu, server, mfrr);
@@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
579 604
580int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) 605int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
581{ 606{
607 if (!kvmppc_xics_enabled(vcpu))
608 return H_TOO_HARD;
582 if (xive_enabled()) { 609 if (xive_enabled()) {
583 if (is_rm()) 610 if (is_rm())
584 return xive_rm_h_cppr(vcpu, cppr); 611 return xive_rm_h_cppr(vcpu, cppr);
@@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
591 618
592int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) 619int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
593{ 620{
621 if (!kvmppc_xics_enabled(vcpu))
622 return H_TOO_HARD;
594 if (xive_enabled()) { 623 if (xive_enabled()) {
595 if (is_rm()) 624 if (is_rm())
596 return xive_rm_h_eoi(vcpu, xirr); 625 return xive_rm_h_eoi(vcpu, xirr);
@@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
601 return xics_rm_h_eoi(vcpu, xirr); 630 return xics_rm_h_eoi(vcpu, xirr);
602} 631}
603#endif /* CONFIG_KVM_XICS */ 632#endif /* CONFIG_KVM_XICS */
633
634void kvmppc_bad_interrupt(struct pt_regs *regs)
635{
636 die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
637 panic("Bad KVM trap");
638}
639
640/*
641 * Functions used to switch LPCR HR and UPRT bits on all threads
642 * when entering and exiting HPT guests on a radix host.
643 */
644
645#define PHASE_REALMODE 1 /* in real mode */
646#define PHASE_SET_LPCR 2 /* have set LPCR */
647#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */
648#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */
649
650#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
651
652static void wait_for_sync(struct kvm_split_mode *sip, int phase)
653{
654 int thr = local_paca->kvm_hstate.tid;
655
656 sip->lpcr_sync.phase[thr] |= phase;
657 phase = ALL(phase);
658 while ((sip->lpcr_sync.allphases & phase) != phase) {
659 HMT_low();
660 barrier();
661 }
662 HMT_medium();
663}
664
665void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
666{
667 unsigned long rb, set;
668
669 /* wait for every other thread to get to real mode */
670 wait_for_sync(sip, PHASE_REALMODE);
671
672 /* Set LPCR and LPIDR */
673 mtspr(SPRN_LPCR, sip->lpcr_req);
674 mtspr(SPRN_LPID, sip->lpidr_req);
675 isync();
676
677 /* Invalidate the TLB on thread 0 */
678 if (local_paca->kvm_hstate.tid == 0) {
679 sip->do_set = 0;
680 asm volatile("ptesync" : : : "memory");
681 for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
682 rb = TLBIEL_INVAL_SET_LPID +
683 (set << TLBIEL_INVAL_SET_SHIFT);
684 asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
685 "r" (rb), "r" (0));
686 }
687 asm volatile("ptesync" : : : "memory");
688 }
689
690 /* indicate that we have done so and wait for others */
691 wait_for_sync(sip, PHASE_SET_LPCR);
692 /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
693 smp_rmb();
694}
695
696/*
697 * Called when a thread that has been in the guest needs
698 * to reload the host LPCR value - but only on POWER9 when
699 * running a HPT guest on a radix host.
700 */
701void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
702{
703 /* we're out of the guest... */
704 wait_for_sync(sip, PHASE_OUT_OF_GUEST);
705
706 mtspr(SPRN_LPID, 0);
707 mtspr(SPRN_LPCR, sip->host_lpcr);
708 isync();
709
710 if (local_paca->kvm_hstate.tid == 0) {
711 sip->do_restore = 0;
712 smp_wmb(); /* order store of do_restore vs. phase */
713 }
714
715 wait_for_sync(sip, PHASE_RESET_LPCR);
716 smp_mb();
717 local_paca->kvm_hstate.kvm_split_mode = NULL;
718}
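
The wait_for_sync()/ALL() pair added above packs one phase byte per hardware thread into a single word, so a thread can see at a glance whether every thread of the core has reached a given phase. Below is a minimal userspace sketch of that packing and test; the four-lane layout and the mark_phase()/all_reached() names are illustrative assumptions, not kernel API.

/* Sketch of the per-thread phase packing used by wait_for_sync().
 * Assumes a 4-thread core with one phase byte per thread; illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define PHASE_REALMODE     1
#define PHASE_SET_LPCR     2
#define PHASE_OUT_OF_GUEST 4
#define PHASE_RESET_LPCR   8

/* Replicate a phase bit into all four byte lanes, like the kernel's ALL(). */
#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))

static uint32_t allphases;      /* stands in for sip->lpcr_sync.allphases */

static void mark_phase(int tid, int phase)
{
        allphases |= (uint32_t)phase << (8 * tid);   /* this thread's byte lane */
}

static int all_reached(int phase)
{
        uint32_t mask = ALL(phase);
        return (allphases & mask) == mask;   /* true once every lane has the bit */
}

int main(void)
{
        for (int tid = 0; tid < 4; tid++) {
                mark_phase(tid, PHASE_REALMODE);
                printf("thread %d in real mode, all reached? %d\n",
                       tid, all_reached(PHASE_REALMODE));
        }
        return 0;
}

The real code additionally drops SMT priority with HMT_low() while polling and raises it again with HMT_medium() once the phase is complete.
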
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4efe364f1188..26c11f678fbf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
107} 107}
108EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 108EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
109 109
110/* Update the changed page order field of an rmap entry */ 110/* Update the dirty bitmap of a memslot */
111void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize) 111void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
112 unsigned long gfn, unsigned long psize)
112{ 113{
113 unsigned long order; 114 unsigned long npages;
114 115
115 if (!psize) 116 if (!psize || !memslot->dirty_bitmap)
116 return; 117 return;
117 order = ilog2(psize); 118 npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
118 order <<= KVMPPC_RMAP_CHG_SHIFT; 119 gfn -= memslot->base_gfn;
119 if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER)) 120 set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
120 *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order; 121}
122EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
123
124static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
125 unsigned long hpte_v, unsigned long hpte_gr)
126{
127 struct kvm_memory_slot *memslot;
128 unsigned long gfn;
129 unsigned long psize;
130
131 psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
132 gfn = hpte_rpn(hpte_gr, psize);
133 memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
134 if (memslot && memslot->dirty_bitmap)
135 kvmppc_update_dirty_map(memslot, gfn, psize);
121} 136}
122EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
123 137
124/* Returns a pointer to the revmap entry for the page mapped by a HPTE */ 138/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
125static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v, 139static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
126 unsigned long hpte_gr) 140 unsigned long hpte_gr,
141 struct kvm_memory_slot **memslotp,
142 unsigned long *gfnp)
127{ 143{
128 struct kvm_memory_slot *memslot; 144 struct kvm_memory_slot *memslot;
129 unsigned long *rmap; 145 unsigned long *rmap;
130 unsigned long gfn; 146 unsigned long gfn;
131 147
132 gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr)); 148 gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
133 memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); 149 memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
150 if (memslotp)
151 *memslotp = memslot;
152 if (gfnp)
153 *gfnp = gfn;
134 if (!memslot) 154 if (!memslot)
135 return NULL; 155 return NULL;
136 156
@@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
147 unsigned long ptel, head; 167 unsigned long ptel, head;
148 unsigned long *rmap; 168 unsigned long *rmap;
149 unsigned long rcbits; 169 unsigned long rcbits;
170 struct kvm_memory_slot *memslot;
171 unsigned long gfn;
150 172
151 rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); 173 rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
152 ptel = rev->guest_rpte |= rcbits; 174 ptel = rev->guest_rpte |= rcbits;
153 rmap = revmap_for_hpte(kvm, hpte_v, ptel); 175 rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
154 if (!rmap) 176 if (!rmap)
155 return; 177 return;
156 lock_rmap(rmap); 178 lock_rmap(rmap);
@@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
169 } 191 }
170 *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; 192 *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
171 if (rcbits & HPTE_R_C) 193 if (rcbits & HPTE_R_C)
172 kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r)); 194 kvmppc_update_dirty_map(memslot, gfn,
195 kvmppc_actual_pgsz(hpte_v, hpte_r));
173 unlock_rmap(rmap); 196 unlock_rmap(rmap);
174} 197}
175 198
@@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
193 216
194 if (kvm_is_radix(kvm)) 217 if (kvm_is_radix(kvm))
195 return H_FUNCTION; 218 return H_FUNCTION;
196 psize = hpte_page_size(pteh, ptel); 219 psize = kvmppc_actual_pgsz(pteh, ptel);
197 if (!psize) 220 if (!psize)
198 return H_PARAMETER; 221 return H_PARAMETER;
199 writing = hpte_is_writable(ptel); 222 writing = hpte_is_writable(ptel);
@@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
797 gr |= r & (HPTE_R_R | HPTE_R_C); 820 gr |= r & (HPTE_R_R | HPTE_R_C);
798 if (r & HPTE_R_R) { 821 if (r & HPTE_R_R) {
799 kvmppc_clear_ref_hpte(kvm, hpte, pte_index); 822 kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
800 rmap = revmap_for_hpte(kvm, v, gr); 823 rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
801 if (rmap) { 824 if (rmap) {
802 lock_rmap(rmap); 825 lock_rmap(rmap);
803 *rmap |= KVMPPC_RMAP_REFERENCED; 826 *rmap |= KVMPPC_RMAP_REFERENCED;
@@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
819 __be64 *hpte; 842 __be64 *hpte;
820 unsigned long v, r, gr; 843 unsigned long v, r, gr;
821 struct revmap_entry *rev; 844 struct revmap_entry *rev;
822 unsigned long *rmap;
823 long ret = H_NOT_FOUND; 845 long ret = H_NOT_FOUND;
824 846
825 if (kvm_is_radix(kvm)) 847 if (kvm_is_radix(kvm))
@@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
848 r = be64_to_cpu(hpte[1]); 870 r = be64_to_cpu(hpte[1]);
849 gr |= r & (HPTE_R_R | HPTE_R_C); 871 gr |= r & (HPTE_R_R | HPTE_R_C);
850 if (r & HPTE_R_C) { 872 if (r & HPTE_R_C) {
851 unsigned long psize = hpte_page_size(v, r);
852 hpte[1] = cpu_to_be64(r & ~HPTE_R_C); 873 hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
853 eieio(); 874 eieio();
854 rmap = revmap_for_hpte(kvm, v, gr); 875 kvmppc_set_dirty_from_hpte(kvm, v, gr);
855 if (rmap) {
856 lock_rmap(rmap);
857 *rmap |= KVMPPC_RMAP_CHANGED;
858 kvmppc_update_rmap_change(rmap, psize);
859 unlock_rmap(rmap);
860 }
861 } 876 }
862 } 877 }
863 vcpu->arch.gpr[4] = gr; 878 vcpu->arch.gpr[4] = gr;
@@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
1014 * Check the HPTE again, including base page size 1029 * Check the HPTE again, including base page size
1015 */ 1030 */
1016 if ((v & valid) && (v & mask) == val && 1031 if ((v & valid) && (v & mask) == val &&
1017 hpte_base_page_size(v, r) == (1ul << pshift)) 1032 kvmppc_hpte_base_page_shift(v, r) == pshift)
1018 /* Return with the HPTE still locked */ 1033 /* Return with the HPTE still locked */
1019 return (hash << 3) + (i >> 1); 1034 return (hash << 3) + (i >> 1);
1020 1035
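
kvmppc_update_dirty_map() replaces the old per-rmap change-order tracking with direct dirty-bitmap updates: the mapped size is rounded up to whole small pages and that many bits are set starting at the gfn's offset within the memslot. The sketch below redoes that arithmetic in userspace; set_bits() and update_dirty_map() are stand-ins for set_dirty_bits_atomic() and the kernel helper.

/* Sketch of the dirty-map arithmetic: psize bytes dirty at gfn, relative to
 * the memslot base, become a run of set bits in a bitmap of 4 KiB pages. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define BITS_PER_LONG (8 * sizeof(unsigned long))

static void set_bits(unsigned long *map, unsigned long start, unsigned long n)
{
        for (unsigned long i = start; i < start + n; i++)
                map[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);
}

static void update_dirty_map(unsigned long *map, unsigned long base_gfn,
                             unsigned long gfn, unsigned long psize)
{
        if (!psize)
                return;
        unsigned long npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;  /* round up */
        set_bits(map, gfn - base_gfn, npages);
}

int main(void)
{
        unsigned long dirty[4] = { 0 };

        /* A 64 KiB mapping dirtied at gfn 0x110 in a slot based at gfn 0x100. */
        update_dirty_map(dirty, 0x100, 0x110, 0x10000);
        printf("dirty[0] = %#lx\n", dirty[0]);   /* bits 16..31 set */
        return 0;
}
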
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 68bf0f14a962..2659844784b8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -31,6 +31,7 @@
31#include <asm/tm.h> 31#include <asm/tm.h>
32#include <asm/opal.h> 32#include <asm/opal.h>
33#include <asm/xive-regs.h> 33#include <asm/xive-regs.h>
34#include <asm/thread_info.h>
34 35
35/* Sign-extend HDEC if not on POWER9 */ 36/* Sign-extend HDEC if not on POWER9 */
36#define EXTEND_HDEC(reg) \ 37#define EXTEND_HDEC(reg) \
@@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
81 RFI 82 RFI
82 83
83kvmppc_call_hv_entry: 84kvmppc_call_hv_entry:
85BEGIN_FTR_SECTION
86 /* On P9, do LPCR setting, if necessary */
87 ld r3, HSTATE_SPLIT_MODE(r13)
88 cmpdi r3, 0
89 beq 46f
90 lwz r4, KVM_SPLIT_DO_SET(r3)
91 cmpwi r4, 0
92 beq 46f
93 bl kvmhv_p9_set_lpcr
94 nop
9546:
96END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
97
84 ld r4, HSTATE_KVM_VCPU(r13) 98 ld r4, HSTATE_KVM_VCPU(r13)
85 bl kvmppc_hv_entry 99 bl kvmppc_hv_entry
86 100
@@ -387,6 +401,7 @@ kvm_secondary_got_guest:
387 ld r6, HSTATE_SPLIT_MODE(r13) 401 ld r6, HSTATE_SPLIT_MODE(r13)
388 cmpdi r6, 0 402 cmpdi r6, 0
389 beq 63f 403 beq 63f
404BEGIN_FTR_SECTION
390 ld r0, KVM_SPLIT_RPR(r6) 405 ld r0, KVM_SPLIT_RPR(r6)
391 mtspr SPRN_RPR, r0 406 mtspr SPRN_RPR, r0
392 ld r0, KVM_SPLIT_PMMAR(r6) 407 ld r0, KVM_SPLIT_PMMAR(r6)
@@ -394,6 +409,15 @@ kvm_secondary_got_guest:
394 ld r0, KVM_SPLIT_LDBAR(r6) 409 ld r0, KVM_SPLIT_LDBAR(r6)
395 mtspr SPRN_LDBAR, r0 410 mtspr SPRN_LDBAR, r0
396 isync 411 isync
412FTR_SECTION_ELSE
413 /* On P9 we use the split_info for coordinating LPCR changes */
414 lwz r4, KVM_SPLIT_DO_SET(r6)
415 cmpwi r4, 0
416 beq 63f
417 mr r3, r6
418 bl kvmhv_p9_set_lpcr
419 nop
420ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
39763: 42163:
398 /* Order load of vcpu after load of vcore */ 422 /* Order load of vcpu after load of vcore */
399 lwsync 423 lwsync
@@ -464,6 +488,12 @@ kvm_no_guest:
464 ld r3, HSTATE_SPLIT_MODE(r13) 488 ld r3, HSTATE_SPLIT_MODE(r13)
465 cmpdi r3, 0 489 cmpdi r3, 0
466 beq kvm_no_guest 490 beq kvm_no_guest
491 lwz r0, KVM_SPLIT_DO_SET(r3)
492 cmpwi r0, 0
493 bne kvmhv_do_set
494 lwz r0, KVM_SPLIT_DO_RESTORE(r3)
495 cmpwi r0, 0
496 bne kvmhv_do_restore
467 lbz r0, KVM_SPLIT_DO_NAP(r3) 497 lbz r0, KVM_SPLIT_DO_NAP(r3)
468 cmpwi r0, 0 498 cmpwi r0, 0
469 beq kvm_no_guest 499 beq kvm_no_guest
@@ -476,6 +506,19 @@ kvm_no_guest:
476 stb r0, HSTATE_HWTHREAD_STATE(r13) 506 stb r0, HSTATE_HWTHREAD_STATE(r13)
477 b kvm_no_guest 507 b kvm_no_guest
478 508
509kvmhv_do_set:
510 /* Set LPCR, LPIDR etc. on P9 */
511 HMT_MEDIUM
512 bl kvmhv_p9_set_lpcr
513 nop
514 b kvm_no_guest
515
516kvmhv_do_restore:
517 HMT_MEDIUM
518 bl kvmhv_p9_restore_lpcr
519 nop
520 b kvm_no_guest
521
479/* 522/*
480 * Here the primary thread is trying to return the core to 523 * Here the primary thread is trying to return the core to
481 * whole-core mode, so we need to nap. 524 * whole-core mode, so we need to nap.
@@ -513,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
513 /* Set kvm_split_mode.napped[tid] = 1 */ 556 /* Set kvm_split_mode.napped[tid] = 1 */
514 ld r3, HSTATE_SPLIT_MODE(r13) 557 ld r3, HSTATE_SPLIT_MODE(r13)
515 li r0, 1 558 li r0, 1
516 lhz r4, PACAPACAINDEX(r13) 559 lbz r4, HSTATE_TID(r13)
517 clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */
518 addi r4, r4, KVM_SPLIT_NAPPED 560 addi r4, r4, KVM_SPLIT_NAPPED
519 stbx r0, r3, r4 561 stbx r0, r3, r4
520 /* Check the do_nap flag again after setting napped[] */ 562 /* Check the do_nap flag again after setting napped[] */
@@ -1911,10 +1953,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
191119: lis r8,0x7fff /* MAX_INT@h */ 195319: lis r8,0x7fff /* MAX_INT@h */
1912 mtspr SPRN_HDEC,r8 1954 mtspr SPRN_HDEC,r8
1913 1955
191416: ld r8,KVM_HOST_LPCR(r4) 195616:
1957BEGIN_FTR_SECTION
1958 /* On POWER9 with HPT-on-radix we need to wait for all other threads */
1959 ld r3, HSTATE_SPLIT_MODE(r13)
1960 cmpdi r3, 0
1961 beq 47f
1962 lwz r8, KVM_SPLIT_DO_RESTORE(r3)
1963 cmpwi r8, 0
1964 beq 47f
1965 stw r12, STACK_SLOT_TRAP(r1)
1966 bl kvmhv_p9_restore_lpcr
1967 nop
1968 lwz r12, STACK_SLOT_TRAP(r1)
1969 b 48f
197047:
1971END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1972 ld r8,KVM_HOST_LPCR(r4)
1915 mtspr SPRN_LPCR,r8 1973 mtspr SPRN_LPCR,r8
1916 isync 1974 isync
1917 197548:
1918 /* load host SLB entries */ 1976 /* load host SLB entries */
1919BEGIN_MMU_FTR_SECTION 1977BEGIN_MMU_FTR_SECTION
1920 b 0f 1978 b 0f
@@ -3133,10 +3191,139 @@ kvmppc_restore_tm:
3133/* 3191/*
3134 * We come here if we get any exception or interrupt while we are 3192 * We come here if we get any exception or interrupt while we are
3135 * executing host real mode code while in guest MMU context. 3193 * executing host real mode code while in guest MMU context.
3136 * For now just spin, but we should do something better. 3194 * r12 is (CR << 32) | vector
3195 * r13 points to our PACA
3196 * r12 is saved in HSTATE_SCRATCH0(r13)
3197 * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
3198 * r9 is saved in HSTATE_SCRATCH2(r13)
3199 * r13 is saved in HSPRG1
3200 * cfar is saved in HSTATE_CFAR(r13)
3201 * ppr is saved in HSTATE_PPR(r13)
3137 */ 3202 */
3138kvmppc_bad_host_intr: 3203kvmppc_bad_host_intr:
3204 /*
3205 * Switch to the emergency stack, but start half-way down in
3206 * case we were already on it.
3207 */
3208 mr r9, r1
3209 std r1, PACAR1(r13)
3210 ld r1, PACAEMERGSP(r13)
3211 subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
3212 std r9, 0(r1)
3213 std r0, GPR0(r1)
3214 std r9, GPR1(r1)
3215 std r2, GPR2(r1)
3216 SAVE_4GPRS(3, r1)
3217 SAVE_2GPRS(7, r1)
3218 srdi r0, r12, 32
3219 clrldi r12, r12, 32
3220 std r0, _CCR(r1)
3221 std r12, _TRAP(r1)
3222 andi. r0, r12, 2
3223 beq 1f
3224 mfspr r3, SPRN_HSRR0
3225 mfspr r4, SPRN_HSRR1
3226 mfspr r5, SPRN_HDAR
3227 mfspr r6, SPRN_HDSISR
3228 b 2f
32291: mfspr r3, SPRN_SRR0
3230 mfspr r4, SPRN_SRR1
3231 mfspr r5, SPRN_DAR
3232 mfspr r6, SPRN_DSISR
32332: std r3, _NIP(r1)
3234 std r4, _MSR(r1)
3235 std r5, _DAR(r1)
3236 std r6, _DSISR(r1)
3237 ld r9, HSTATE_SCRATCH2(r13)
3238 ld r12, HSTATE_SCRATCH0(r13)
3239 GET_SCRATCH0(r0)
3240 SAVE_4GPRS(9, r1)
3241 std r0, GPR13(r1)
3242 SAVE_NVGPRS(r1)
3243 ld r5, HSTATE_CFAR(r13)
3244 std r5, ORIG_GPR3(r1)
3245 mflr r3
3246#ifdef CONFIG_RELOCATABLE
3247 ld r4, HSTATE_SCRATCH1(r13)
3248#else
3249 mfctr r4
3250#endif
3251 mfxer r5
3252 lbz r6, PACASOFTIRQEN(r13)
3253 std r3, _LINK(r1)
3254 std r4, _CTR(r1)
3255 std r5, _XER(r1)
3256 std r6, SOFTE(r1)
3257 ld r2, PACATOC(r13)
3258 LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
3259 std r3, STACK_FRAME_OVERHEAD-16(r1)
3260
3261 /*
3262 * On POWER9 do a minimal restore of the MMU and call C code,
3263 * which will print a message and panic.
3264 * XXX On POWER7 and POWER8, we just spin here since we don't
3265 * know what the other threads are doing (and we don't want to
3266 * coordinate with them) - but at least we now have register state
3267 * in memory that we might be able to look at from another CPU.
3268 */
3269BEGIN_FTR_SECTION
3139 b . 3270 b .
3271END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
3272 ld r9, HSTATE_KVM_VCPU(r13)
3273 ld r10, VCPU_KVM(r9)
3274
3275 li r0, 0
3276 mtspr SPRN_AMR, r0
3277 mtspr SPRN_IAMR, r0
3278 mtspr SPRN_CIABR, r0
3279 mtspr SPRN_DAWRX, r0
3280
3281 /* Flush the ERAT on radix P9 DD1 guest exit */
3282BEGIN_FTR_SECTION
3283 PPC_INVALIDATE_ERAT
3284END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
3285
3286BEGIN_MMU_FTR_SECTION
3287 b 4f
3288END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
3289
3290 slbmte r0, r0
3291 slbia
3292 ptesync
3293 ld r8, PACA_SLBSHADOWPTR(r13)
3294 .rept SLB_NUM_BOLTED
3295 li r3, SLBSHADOW_SAVEAREA
3296 LDX_BE r5, r8, r3
3297 addi r3, r3, 8
3298 LDX_BE r6, r8, r3
3299 andis. r7, r5, SLB_ESID_V@h
3300 beq 3f
3301 slbmte r6, r5
33023: addi r8, r8, 16
3303 .endr
3304
33054: lwz r7, KVM_HOST_LPID(r10)
3306 mtspr SPRN_LPID, r7
3307 mtspr SPRN_PID, r0
3308 ld r8, KVM_HOST_LPCR(r10)
3309 mtspr SPRN_LPCR, r8
3310 isync
3311 li r0, KVM_GUEST_MODE_NONE
3312 stb r0, HSTATE_IN_GUEST(r13)
3313
3314 /*
3315 * Turn on the MMU and jump to C code
3316 */
3317 bcl 20, 31, .+4
33185: mflr r3
3319 addi r3, r3, 9f - 5b
3320 ld r4, PACAKMSR(r13)
3321 mtspr SPRN_SRR0, r3
3322 mtspr SPRN_SRR1, r4
3323 rfid
33249: addi r3, r1, STACK_FRAME_OVERHEAD
3325 bl kvmppc_bad_interrupt
3326 b 9b
3140 3327
3141/* 3328/*
3142 * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken 3329 * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
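
The rewritten kvmppc_bad_host_intr switches to the emergency stack but starts half-way down it, so a frame built here cannot collide with one that was already live on that stack, and then saves enough register state to call kvmppc_bad_interrupt() from C. The pointer arithmetic is sketched below; the THREAD_SIZE and INT_FRAME_SIZE values are placeholders for illustration.

/* Sketch of the emergency-stack placement used by kvmppc_bad_host_intr:
 * carve the new frame from the middle of the stack so it cannot overlap a
 * frame already in use near the top.  Sizes here are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE    (16 * 1024)   /* placeholder for the kernel stack size */
#define INT_FRAME_SIZE 112           /* placeholder for the pt_regs frame size */

int main(void)
{
        uint8_t emerg_stack[THREAD_SIZE];
        /* Stacks grow down; take "top" as the highest address of the region
         * (roughly what PACAEMERGSP points at). */
        uintptr_t top = (uintptr_t)emerg_stack + THREAD_SIZE;
        uintptr_t sp  = top - THREAD_SIZE / 2 - INT_FRAME_SIZE;

        printf("top %#lx, new frame at %#lx (%lu bytes below top)\n",
               (unsigned long)top, (unsigned long)sp,
               (unsigned long)(top - sp));
        return 0;
}
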
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 69a09444d46e..d0dc8624198f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
1326 kvmppc_set_pvr_pr(vcpu, sregs->pvr); 1326 kvmppc_set_pvr_pr(vcpu, sregs->pvr);
1327 1327
1328 vcpu3s->sdr1 = sregs->u.s.sdr1; 1328 vcpu3s->sdr1 = sregs->u.s.sdr1;
1329#ifdef CONFIG_PPC_BOOK3S_64
1329 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { 1330 if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
1331 /* Flush all SLB entries */
1332 vcpu->arch.mmu.slbmte(vcpu, 0, 0);
1333 vcpu->arch.mmu.slbia(vcpu);
1334
1330 for (i = 0; i < 64; i++) { 1335 for (i = 0; i < 64; i++) {
1331 vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, 1336 u64 rb = sregs->u.s.ppc64.slb[i].slbe;
1332 sregs->u.s.ppc64.slb[i].slbe); 1337 u64 rs = sregs->u.s.ppc64.slb[i].slbv;
1338
1339 if (rb & SLB_ESID_V)
1340 vcpu->arch.mmu.slbmte(vcpu, rs, rb);
1333 } 1341 }
1334 } else { 1342 } else
1343#endif
1344 {
1335 for (i = 0; i < 16; i++) { 1345 for (i = 0; i < 16; i++) {
1336 vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]); 1346 vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
1337 } 1347 }
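
The set_sregs change above first flushes the SLB (an slbmte of zeros plus slbia) and then installs only the entries whose ESID word has SLB_ESID_V set, instead of blindly replaying all 64 slots. A minimal sketch of that filter follows; install_slb() stands in for vcpu->arch.mmu.slbmte(), and the example ESID/VSID values are made up.

/* Sketch of the "flush, then install only valid entries" pattern used when
 * userspace restores the SLB via set_sregs. */
#include <stdint.h>
#include <stdio.h>

#define SLB_ESID_V (1ULL << 27)   /* valid bit in the ESID word */

struct slb_entry {
        uint64_t slbe;   /* ESID word */
        uint64_t slbv;   /* VSID word */
};

static void install_slb(uint64_t rs, uint64_t rb)
{
        printf("slbmte rs=%#llx rb=%#llx\n",
               (unsigned long long)rs, (unsigned long long)rb);
}

int main(void)
{
        struct slb_entry slb[4] = {
                { .slbe = 0xc000000000000000ULL | SLB_ESID_V,     .slbv = 0x400ULL },
                { 0 },                                            /* invalid, skipped */
                { .slbe = 0xd000000000000000ULL | SLB_ESID_V | 2, .slbv = 0x410ULL },
                { 0 },
        };

        /* (the flush of the existing SLB would happen here) */
        for (int i = 0; i < 4; i++)
                if (slb[i].slbe & SLB_ESID_V)
                        install_slb(slb[i].slbv, slb[i].slbe);
        return 0;
}
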
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 8a4205fa774f..dae3be5ff42b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
419 case H_PROTECT: 419 case H_PROTECT:
420 case H_BULK_REMOVE: 420 case H_BULK_REMOVE:
421 case H_PUT_TCE: 421 case H_PUT_TCE:
422 case H_PUT_TCE_INDIRECT:
423 case H_STUFF_TCE:
422 case H_CEDE: 424 case H_CEDE:
423 case H_LOGICAL_CI_LOAD: 425 case H_LOGICAL_CI_LOAD:
424 case H_LOGICAL_CI_STORE: 426 case H_LOGICAL_CI_STORE:
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index c6c734424c70..423b21393bc9 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
377 377
378 start = vma->vm_pgoff; 378 start = vma->vm_pgoff;
379 end = start + 379 end = start +
380 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); 380 vma_pages(vma);
381 381
382 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); 382 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
383 383
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1abe6eb51335..6b6c53c42ac9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
590 r = !!(hv_enabled && radix_enabled()); 590 r = !!(hv_enabled && radix_enabled());
591 break; 591 break;
592 case KVM_CAP_PPC_MMU_HASH_V3: 592 case KVM_CAP_PPC_MMU_HASH_V3:
593 r = !!(hv_enabled && !radix_enabled() && 593 r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
594 cpu_has_feature(CPU_FTR_ARCH_300));
595 break; 594 break;
596#endif 595#endif
597 case KVM_CAP_SYNC_MMU: 596 case KVM_CAP_SYNC_MMU:
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index fd006a272024..f3a9b5a445b6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -685,11 +685,28 @@ struct kvm_s390_crypto {
685 __u8 dea_kw; 685 __u8 dea_kw;
686}; 686};
687 687
688#define APCB0_MASK_SIZE 1
689struct kvm_s390_apcb0 {
690 __u64 apm[APCB0_MASK_SIZE]; /* 0x0000 */
691 __u64 aqm[APCB0_MASK_SIZE]; /* 0x0008 */
692 __u64 adm[APCB0_MASK_SIZE]; /* 0x0010 */
693 __u64 reserved18; /* 0x0018 */
694};
695
696#define APCB1_MASK_SIZE 4
697struct kvm_s390_apcb1 {
698 __u64 apm[APCB1_MASK_SIZE]; /* 0x0000 */
699 __u64 aqm[APCB1_MASK_SIZE]; /* 0x0020 */
700 __u64 adm[APCB1_MASK_SIZE]; /* 0x0040 */
701 __u64 reserved60[4]; /* 0x0060 */
702};
703
688struct kvm_s390_crypto_cb { 704struct kvm_s390_crypto_cb {
689 __u8 reserved00[72]; /* 0x0000 */ 705 struct kvm_s390_apcb0 apcb0; /* 0x0000 */
690 __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ 706 __u8 reserved20[0x0048 - 0x0020]; /* 0x0020 */
691 __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ 707 __u8 dea_wrapping_key_mask[24]; /* 0x0048 */
692 __u8 reserved80[128]; /* 0x0080 */ 708 __u8 aes_wrapping_key_mask[32]; /* 0x0060 */
709 struct kvm_s390_apcb1 apcb1; /* 0x0080 */
693}; 710};
694 711
695/* 712/*
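
The reworked kvm_s390_crypto_cb names the APCB0 and APCB1 masks as structures at their architected offsets instead of hiding them in reserved byte arrays. The offsets stated in its comments (0x0048, 0x0060, 0x0080) can be checked at compile time; a small sketch of that check, mirroring the structures from the diff with standard fixed-width types:

/* Compile-time check of the crypto-control-block layout introduced above.
 * The structures are copied from the diff; the assertions just restate the
 * offsets given in its comments. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define APCB0_MASK_SIZE 1
struct kvm_s390_apcb0 {
        uint64_t apm[APCB0_MASK_SIZE];
        uint64_t aqm[APCB0_MASK_SIZE];
        uint64_t adm[APCB0_MASK_SIZE];
        uint64_t reserved18;
};

#define APCB1_MASK_SIZE 4
struct kvm_s390_apcb1 {
        uint64_t apm[APCB1_MASK_SIZE];
        uint64_t aqm[APCB1_MASK_SIZE];
        uint64_t adm[APCB1_MASK_SIZE];
        uint64_t reserved60[4];
};

struct kvm_s390_crypto_cb {
        struct kvm_s390_apcb0 apcb0;              /* 0x0000 */
        uint8_t reserved20[0x0048 - 0x0020];      /* 0x0020 */
        uint8_t dea_wrapping_key_mask[24];        /* 0x0048 */
        uint8_t aes_wrapping_key_mask[32];        /* 0x0060 */
        struct kvm_s390_apcb1 apcb1;              /* 0x0080 */
};

static_assert(offsetof(struct kvm_s390_crypto_cb, dea_wrapping_key_mask) == 0x48,
              "dea mask at 0x48");
static_assert(offsetof(struct kvm_s390_crypto_cb, aes_wrapping_key_mask) == 0x60,
              "aes mask at 0x60");
static_assert(offsetof(struct kvm_s390_crypto_cb, apcb1) == 0x80,
              "apcb1 at 0x80");

int main(void) { return 0; }
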
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 329b2843fee2..fa557372d600 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -213,6 +213,16 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
213 vcpu->arch.local_int.pending_irqs; 213 vcpu->arch.local_int.pending_irqs;
214} 214}
215 215
216static inline int isc_to_irq_type(unsigned long isc)
217{
218 return IRQ_PEND_IO_ISC_0 + isc;
219}
220
221static inline int irq_type_to_isc(unsigned long irq_type)
222{
223 return irq_type - IRQ_PEND_IO_ISC_0;
224}
225
216static unsigned long disable_iscs(struct kvm_vcpu *vcpu, 226static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
217 unsigned long active_mask) 227 unsigned long active_mask)
218{ 228{
@@ -220,7 +230,7 @@ static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
220 230
221 for (i = 0; i <= MAX_ISC; i++) 231 for (i = 0; i <= MAX_ISC; i++)
222 if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i))) 232 if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
223 active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i)); 233 active_mask &= ~(1UL << (isc_to_irq_type(i)));
224 234
225 return active_mask; 235 return active_mask;
226} 236}
@@ -901,7 +911,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
901 fi = &vcpu->kvm->arch.float_int; 911 fi = &vcpu->kvm->arch.float_int;
902 912
903 spin_lock(&fi->lock); 913 spin_lock(&fi->lock);
904 isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0]; 914 isc_list = &fi->lists[irq_type_to_isc(irq_type)];
905 inti = list_first_entry_or_null(isc_list, 915 inti = list_first_entry_or_null(isc_list,
906 struct kvm_s390_interrupt_info, 916 struct kvm_s390_interrupt_info,
907 list); 917 list);
@@ -1074,6 +1084,12 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
1074 * in kvm_vcpu_block without having the waitqueue set (polling) 1084 * in kvm_vcpu_block without having the waitqueue set (polling)
1075 */ 1085 */
1076 vcpu->valid_wakeup = true; 1086 vcpu->valid_wakeup = true;
1087 /*
1088 * This is mostly to document, that the read in swait_active could
1089 * be moved before other stores, leading to subtle races.
1090 * All current users do not store or use an atomic like update
1091 */
1092 smp_mb__after_atomic();
1077 if (swait_active(&vcpu->wq)) { 1093 if (swait_active(&vcpu->wq)) {
1078 /* 1094 /*
1079 * The vcpu gave up the cpu voluntarily, mark it as a good 1095 * The vcpu gave up the cpu voluntarily, mark it as a good
@@ -1395,7 +1411,7 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm,
1395 list_del_init(&iter->list); 1411 list_del_init(&iter->list);
1396 fi->counters[FIRQ_CNTR_IO] -= 1; 1412 fi->counters[FIRQ_CNTR_IO] -= 1;
1397 if (list_empty(isc_list)) 1413 if (list_empty(isc_list))
1398 clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); 1414 clear_bit(isc_to_irq_type(isc), &fi->pending_irqs);
1399 spin_unlock(&fi->lock); 1415 spin_unlock(&fi->lock);
1400 return iter; 1416 return iter;
1401 } 1417 }
@@ -1522,7 +1538,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1522 isc = int_word_to_isc(inti->io.io_int_word); 1538 isc = int_word_to_isc(inti->io.io_int_word);
1523 list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; 1539 list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
1524 list_add_tail(&inti->list, list); 1540 list_add_tail(&inti->list, list);
1525 set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); 1541 set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
1526 spin_unlock(&fi->lock); 1542 spin_unlock(&fi->lock);
1527 return 0; 1543 return 0;
1528} 1544}
@@ -2175,6 +2191,8 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr)
2175 return -EINVAL; 2191 return -EINVAL;
2176 if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid))) 2192 if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid)))
2177 return -EFAULT; 2193 return -EFAULT;
2194 if (!schid)
2195 return -EINVAL;
2178 kfree(kvm_s390_get_io_int(kvm, isc_mask, schid)); 2196 kfree(kvm_s390_get_io_int(kvm, isc_mask, schid));
2179 /* 2197 /*
2180 * If userspace is conforming to the architecture, we can have at most 2198 * If userspace is conforming to the architecture, we can have at most
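
isc_to_irq_type() and irq_type_to_isc() simply offset an interruption subclass by IRQ_PEND_IO_ISC_0, replacing the open-coded additions and subtractions at the call sites. A tiny round-trip sketch follows; the base value used for IRQ_PEND_IO_ISC_0 here is illustrative, only the offset arithmetic matters.

/* Sketch of the ISC <-> pending-IRQ-bit mapping. */
#include <stdio.h>

#define IRQ_PEND_IO_ISC_0 16   /* illustrative base bit number */
#define MAX_ISC 7

static int isc_to_irq_type(unsigned long isc)
{
        return IRQ_PEND_IO_ISC_0 + isc;
}

static unsigned long irq_type_to_isc(int irq_type)
{
        return irq_type - IRQ_PEND_IO_ISC_0;
}

int main(void)
{
        for (unsigned long isc = 0; isc <= MAX_ISC; isc++) {
                int irq = isc_to_irq_type(isc);
                printf("isc %lu -> irq bit %d -> isc %lu\n",
                       isc, irq, irq_type_to_isc(irq));
        }
        return 0;
}
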
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4bc70afe0a10..98ad8b9e0360 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -395,6 +395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
395 case KVM_CAP_S390_USER_INSTR0: 395 case KVM_CAP_S390_USER_INSTR0:
396 case KVM_CAP_S390_CMMA_MIGRATION: 396 case KVM_CAP_S390_CMMA_MIGRATION:
397 case KVM_CAP_S390_AIS: 397 case KVM_CAP_S390_AIS:
398 case KVM_CAP_S390_AIS_MIGRATION:
398 r = 1; 399 r = 1;
399 break; 400 break;
400 case KVM_CAP_S390_MEM_OP: 401 case KVM_CAP_S390_MEM_OP:
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b18b5652e5c5..a311938b63b3 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -443,22 +443,14 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
443 * 443 *
444 * Returns: - 0 on success 444 * Returns: - 0 on success
445 * - -EINVAL if the gpa is not valid guest storage 445 * - -EINVAL if the gpa is not valid guest storage
446 * - -ENOMEM if out of memory
447 */ 446 */
448static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) 447static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
449{ 448{
450 struct page *page; 449 struct page *page;
451 hva_t hva;
452 int rc;
453 450
454 hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); 451 page = gfn_to_page(kvm, gpa_to_gfn(gpa));
455 if (kvm_is_error_hva(hva)) 452 if (is_error_page(page))
456 return -EINVAL; 453 return -EINVAL;
457 rc = get_user_pages_fast(hva, 1, 1, &page);
458 if (rc < 0)
459 return rc;
460 else if (rc != 1)
461 return -ENOMEM;
462 *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); 454 *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
463 return 0; 455 return 0;
464} 456}
@@ -466,11 +458,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
466/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ 458/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
467static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) 459static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
468{ 460{
469 struct page *page; 461 kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
470
471 page = virt_to_page(hpa);
472 set_page_dirty_lock(page);
473 put_page(page);
474 /* mark the page always as dirty for migration */ 462 /* mark the page always as dirty for migration */
475 mark_page_dirty(kvm, gpa_to_gfn(gpa)); 463 mark_page_dirty(kvm, gpa_to_gfn(gpa));
476} 464}
@@ -557,7 +545,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
557 rc = set_validity_icpt(scb_s, 0x003bU); 545 rc = set_validity_icpt(scb_s, 0x003bU);
558 if (!rc) { 546 if (!rc) {
559 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 547 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
560 if (rc == -EINVAL) 548 if (rc)
561 rc = set_validity_icpt(scb_s, 0x0034U); 549 rc = set_validity_icpt(scb_s, 0x0034U);
562 } 550 }
563 if (rc) 551 if (rc)
@@ -574,10 +562,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
574 } 562 }
575 /* 256 bytes cannot cross page boundaries */ 563 /* 256 bytes cannot cross page boundaries */
576 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 564 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
577 if (rc == -EINVAL) 565 if (rc) {
578 rc = set_validity_icpt(scb_s, 0x0080U); 566 rc = set_validity_icpt(scb_s, 0x0080U);
579 if (rc)
580 goto unpin; 567 goto unpin;
568 }
581 scb_s->itdba = hpa; 569 scb_s->itdba = hpa;
582 } 570 }
583 571
@@ -592,10 +580,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
592 * if this block gets bigger, we have to shadow it. 580 * if this block gets bigger, we have to shadow it.
593 */ 581 */
594 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 582 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
595 if (rc == -EINVAL) 583 if (rc) {
596 rc = set_validity_icpt(scb_s, 0x1310U); 584 rc = set_validity_icpt(scb_s, 0x1310U);
597 if (rc)
598 goto unpin; 585 goto unpin;
586 }
599 scb_s->gvrd = hpa; 587 scb_s->gvrd = hpa;
600 } 588 }
601 589
@@ -607,11 +595,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
607 } 595 }
608 /* 64 bytes cannot cross page boundaries */ 596 /* 64 bytes cannot cross page boundaries */
609 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 597 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
610 if (rc == -EINVAL) 598 if (rc) {
611 rc = set_validity_icpt(scb_s, 0x0043U); 599 rc = set_validity_icpt(scb_s, 0x0043U);
612 /* Validity 0x0044 will be checked by SIE */
613 if (rc)
614 goto unpin; 600 goto unpin;
601 }
602 /* Validity 0x0044 will be checked by SIE */
615 scb_s->riccbd = hpa; 603 scb_s->riccbd = hpa;
616 } 604 }
617 if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { 605 if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
@@ -635,10 +623,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
635 * cross page boundaries 623 * cross page boundaries
636 */ 624 */
637 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 625 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
638 if (rc == -EINVAL) 626 if (rc) {
639 rc = set_validity_icpt(scb_s, 0x10b0U); 627 rc = set_validity_icpt(scb_s, 0x10b0U);
640 if (rc)
641 goto unpin; 628 goto unpin;
629 }
642 scb_s->sdnxo = hpa | sdnxc; 630 scb_s->sdnxo = hpa | sdnxc;
643 } 631 }
644 return 0; 632 return 0;
@@ -663,7 +651,6 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
663 * 651 *
664 * Returns: - 0 if the scb was pinned. 652 * Returns: - 0 if the scb was pinned.
665 * - > 0 if control has to be given to guest 2 653 * - > 0 if control has to be given to guest 2
666 * - -ENOMEM if out of memory
667 */ 654 */
668static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, 655static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
669 gpa_t gpa) 656 gpa_t gpa)
@@ -672,14 +659,13 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
672 int rc; 659 int rc;
673 660
674 rc = pin_guest_page(vcpu->kvm, gpa, &hpa); 661 rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
675 if (rc == -EINVAL) { 662 if (rc) {
676 rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 663 rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
677 if (!rc) 664 WARN_ON_ONCE(rc);
678 rc = 1; 665 return 1;
679 } 666 }
680 if (!rc) 667 vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
681 vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; 668 return 0;
682 return rc;
683} 669}
684 670
685/* 671/*
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ee23a43386a2..034caa1a084e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -226,6 +226,8 @@ struct x86_emulate_ops {
226 226
227 unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); 227 unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
228 void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); 228 void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
229 int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
230
229}; 231};
230 232
231typedef u32 __attribute__((vector_size(16))) sse128_t; 233typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d7d856b2d89..1bfb99770c34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1061,6 +1061,11 @@ struct kvm_x86_ops {
1061 void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); 1061 void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
1062 1062
1063 void (*setup_mce)(struct kvm_vcpu *vcpu); 1063 void (*setup_mce)(struct kvm_vcpu *vcpu);
1064
1065 int (*smi_allowed)(struct kvm_vcpu *vcpu);
1066 int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
1067 int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
1068 int (*enable_smi_window)(struct kvm_vcpu *vcpu);
1064}; 1069};
1065 1070
1066struct kvm_arch_async_pf { 1071struct kvm_arch_async_pf {
@@ -1426,4 +1431,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
1426#endif 1431#endif
1427} 1432}
1428 1433
1434#define put_smstate(type, buf, offset, val) \
1435 *(type *)((buf) + (offset) - 0x7e00) = val
1436
1429#endif /* _ASM_X86_KVM_HOST_H */ 1437#endif /* _ASM_X86_KVM_HOST_H */
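
put_smstate() writes a typed value into the SMM state-save buffer: the buffer covers the architectural offsets 0x7e00-0x7fff of the save area, so the macro rebases an offset by -0x7e00 before storing. A userspace sketch of that indexing follows; the 0x7f78 offset used for the example (taken to be the saved RSP slot in the 64-bit layout) is an assumption for illustration.

/* Sketch of put_smstate(): the save-state buffer holds offsets
 * 0x7e00..0x7fff, so architectural offsets are rebased by -0x7e00. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define put_smstate(type, buf, offset, val) \
        (*(type *)((buf) + (offset) - 0x7e00) = (val))

int main(void)
{
        _Alignas(8) uint8_t smram[512];
        memset(smram, 0, sizeof(smram));

        put_smstate(uint64_t, smram, 0x7f78, 0xdeadbeefULL);   /* e.g. a saved GPR */
        put_smstate(uint32_t, smram, 0x7e00, 0x00020000u);     /* first dword of the area */

        uint64_t val;
        memcpy(&val, &smram[0x7f78 - 0x7e00], sizeof(val));    /* read back safely */
        printf("stored value at 0x7f78: %#llx\n", (unsigned long long)val);
        return 0;
}
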
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index caec8417539f..8b6780751132 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,11 +70,11 @@
70#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 70#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
71#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 71#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
72#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 72#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
73#define SECONDARY_EXEC_RDRAND 0x00000800 73#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
74#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 74#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
75#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 75#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
76#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 76#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
77#define SECONDARY_EXEC_RDSEED 0x00010000 77#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
78#define SECONDARY_EXEC_ENABLE_PML 0x00020000 78#define SECONDARY_EXEC_ENABLE_PML 0x00020000
79#define SECONDARY_EXEC_XSAVES 0x00100000 79#define SECONDARY_EXEC_XSAVES 0x00100000
80#define SECONDARY_EXEC_TSC_SCALING 0x02000000 80#define SECONDARY_EXEC_TSC_SCALING 0x02000000
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d90cdc77e077..8079d141792a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
2591 ctxt->ops->set_msr(ctxt, MSR_EFER, efer); 2591 ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
2592 2592
2593 smbase = ctxt->ops->get_smbase(ctxt); 2593 smbase = ctxt->ops->get_smbase(ctxt);
2594
2595 /*
2596 * Give pre_leave_smm() a chance to make ISA-specific changes to the
2597 * vCPU state (e.g. enter guest mode) before loading state from the SMM
2598 * state-save area.
2599 */
2600 if (ctxt->ops->pre_leave_smm(ctxt, smbase))
2601 return X86EMUL_UNHANDLEABLE;
2602
2594 if (emulator_has_longmode(ctxt)) 2603 if (emulator_has_longmode(ctxt))
2595 ret = rsm_load_state_64(ctxt, smbase + 0x8000); 2604 ret = rsm_load_state_64(ctxt, smbase + 0x8000);
2596 else 2605 else
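
The emulator now gives the backend a pre_leave_smm() callback so vendor code can fix up vCPU state (for example re-enter guest mode) before RSM reloads the saved state, and a nonzero return aborts the RSM as unhandleable. A stripped-down sketch of that veto-before-load pattern follows; struct emu_ops, do_rsm() and the return codes here are invented names, not the kernel's.

/* Sketch of the "give the backend a veto before loading state" pattern that
 * pre_leave_smm() introduces.  All names are invented for illustration. */
#include <stdio.h>

#define EMUL_CONTINUE      0
#define EMUL_UNHANDLEABLE  1

struct emu_ops {
        /* returns 0 on success, nonzero if the backend cannot cope */
        int (*pre_leave_smm)(void *ctxt, unsigned long long smbase);
};

static int do_rsm(const struct emu_ops *ops, void *ctxt, unsigned long long smbase)
{
        /* Let the backend adjust state (e.g. nested-guest mode) first. */
        if (ops->pre_leave_smm && ops->pre_leave_smm(ctxt, smbase))
                return EMUL_UNHANDLEABLE;

        printf("loading SMM save state from %#llx\n", smbase + 0x8000);
        return EMUL_CONTINUE;
}

static int refuse(void *ctxt, unsigned long long smbase)
{
        (void)ctxt; (void)smbase;
        return 1;   /* pretend the backend cannot restore this state */
}

int main(void)
{
        struct emu_ops ok = { .pre_leave_smm = NULL };
        struct emu_ops no = { .pre_leave_smm = refuse };

        printf("without veto: %d\n", do_rsm(&ok, NULL, 0x30000));
        printf("with veto:    %d\n", do_rsm(&no, NULL, 0x30000));
        return 0;
}
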
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36c90d631096..943acbf00c69 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1301,14 +1301,42 @@ static void update_divide_count(struct kvm_lapic *apic)
1301 apic->divide_count); 1301 apic->divide_count);
1302} 1302}
1303 1303
1304static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
1305{
1306 /*
1307 * Do not allow the guest to program periodic timers with small
1308 * interval, since the hrtimers are not throttled by the host
1309 * scheduler.
1310 */
1311 if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
1312 s64 min_period = min_timer_period_us * 1000LL;
1313
1314 if (apic->lapic_timer.period < min_period) {
1315 pr_info_ratelimited(
1316 "kvm: vcpu %i: requested %lld ns "
1317 "lapic timer period limited to %lld ns\n",
1318 apic->vcpu->vcpu_id,
1319 apic->lapic_timer.period, min_period);
1320 apic->lapic_timer.period = min_period;
1321 }
1322 }
1323}
1324
1304static void apic_update_lvtt(struct kvm_lapic *apic) 1325static void apic_update_lvtt(struct kvm_lapic *apic)
1305{ 1326{
1306 u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & 1327 u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
1307 apic->lapic_timer.timer_mode_mask; 1328 apic->lapic_timer.timer_mode_mask;
1308 1329
1309 if (apic->lapic_timer.timer_mode != timer_mode) { 1330 if (apic->lapic_timer.timer_mode != timer_mode) {
1331 if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
1332 APIC_LVT_TIMER_TSCDEADLINE)) {
1333 hrtimer_cancel(&apic->lapic_timer.timer);
1334 kvm_lapic_set_reg(apic, APIC_TMICT, 0);
1335 apic->lapic_timer.period = 0;
1336 apic->lapic_timer.tscdeadline = 0;
1337 }
1310 apic->lapic_timer.timer_mode = timer_mode; 1338 apic->lapic_timer.timer_mode = timer_mode;
1311 hrtimer_cancel(&apic->lapic_timer.timer); 1339 limit_periodic_timer_frequency(apic);
1312 } 1340 }
1313} 1341}
1314 1342
@@ -1430,6 +1458,30 @@ static void start_sw_period(struct kvm_lapic *apic)
1430 HRTIMER_MODE_ABS_PINNED); 1458 HRTIMER_MODE_ABS_PINNED);
1431} 1459}
1432 1460
1461static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
1462{
1463 ktime_t now, remaining;
1464 u64 ns_remaining_old, ns_remaining_new;
1465
1466 apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
1467 * APIC_BUS_CYCLE_NS * apic->divide_count;
1468 limit_periodic_timer_frequency(apic);
1469
1470 now = ktime_get();
1471 remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
1472 if (ktime_to_ns(remaining) < 0)
1473 remaining = 0;
1474
1475 ns_remaining_old = ktime_to_ns(remaining);
1476 ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
1477 apic->divide_count, old_divisor);
1478
1479 apic->lapic_timer.tscdeadline +=
1480 nsec_to_cycles(apic->vcpu, ns_remaining_new) -
1481 nsec_to_cycles(apic->vcpu, ns_remaining_old);
1482 apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
1483}
1484
1433static bool set_target_expiration(struct kvm_lapic *apic) 1485static bool set_target_expiration(struct kvm_lapic *apic)
1434{ 1486{
1435 ktime_t now; 1487 ktime_t now;
@@ -1439,27 +1491,13 @@ static bool set_target_expiration(struct kvm_lapic *apic)
1439 apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) 1491 apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
1440 * APIC_BUS_CYCLE_NS * apic->divide_count; 1492 * APIC_BUS_CYCLE_NS * apic->divide_count;
1441 1493
1442 if (!apic->lapic_timer.period) 1494 if (!apic->lapic_timer.period) {
1495 apic->lapic_timer.tscdeadline = 0;
1443 return false; 1496 return false;
1444
1445 /*
1446 * Do not allow the guest to program periodic timers with small
1447 * interval, since the hrtimers are not throttled by the host
1448 * scheduler.
1449 */
1450 if (apic_lvtt_period(apic)) {
1451 s64 min_period = min_timer_period_us * 1000LL;
1452
1453 if (apic->lapic_timer.period < min_period) {
1454 pr_info_ratelimited(
1455 "kvm: vcpu %i: requested %lld ns "
1456 "lapic timer period limited to %lld ns\n",
1457 apic->vcpu->vcpu_id,
1458 apic->lapic_timer.period, min_period);
1459 apic->lapic_timer.period = min_period;
1460 }
1461 } 1497 }
1462 1498
1499 limit_periodic_timer_frequency(apic);
1500
1463 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 1501 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
1464 PRIx64 ", " 1502 PRIx64 ", "
1465 "timer initial count 0x%x, period %lldns, " 1503 "timer initial count 0x%x, period %lldns, "
@@ -1515,6 +1553,9 @@ static bool start_hv_timer(struct kvm_lapic *apic)
1515 if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) 1553 if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
1516 return false; 1554 return false;
1517 1555
1556 if (!ktimer->tscdeadline)
1557 return false;
1558
1518 r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); 1559 r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
1519 if (r < 0) 1560 if (r < 0)
1520 return false; 1561 return false;
@@ -1738,13 +1779,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
1738 start_apic_timer(apic); 1779 start_apic_timer(apic);
1739 break; 1780 break;
1740 1781
1741 case APIC_TDCR: 1782 case APIC_TDCR: {
1783 uint32_t old_divisor = apic->divide_count;
1784
1742 if (val & 4) 1785 if (val & 4)
1743 apic_debug("KVM_WRITE:TDCR %x\n", val); 1786 apic_debug("KVM_WRITE:TDCR %x\n", val);
1744 kvm_lapic_set_reg(apic, APIC_TDCR, val); 1787 kvm_lapic_set_reg(apic, APIC_TDCR, val);
1745 update_divide_count(apic); 1788 update_divide_count(apic);
1789 if (apic->divide_count != old_divisor &&
1790 apic->lapic_timer.period) {
1791 hrtimer_cancel(&apic->lapic_timer.timer);
1792 update_target_expiration(apic, old_divisor);
1793 restart_apic_timer(apic);
1794 }
1746 break; 1795 break;
1747 1796 }
1748 case APIC_ESR: 1797 case APIC_ESR:
1749 if (apic_x2apic_mode(apic) && val != 0) { 1798 if (apic_x2apic_mode(apic) && val != 0) {
1750 apic_debug("KVM_WRITE:ESR not zero %x\n", val); 1799 apic_debug("KVM_WRITE:ESR not zero %x\n", val);
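
update_target_expiration() rescales an already-armed timer when the guest changes the divide configuration: the remaining time is multiplied by the new divide count and divided by the old one, and both the hrtimer target and the TSC deadline move by the difference. A worked sketch of that arithmetic with made-up numbers (no hrtimers involved):

/* Worked sketch of the divider-change rescaling done by
 * update_target_expiration():
 *     remaining_new = remaining_old * new_divide_count / old_divisor */
#include <stdint.h>
#include <stdio.h>

static uint64_t rescale_remaining(uint64_t ns_remaining_old,
                                  uint32_t new_divide_count,
                                  uint32_t old_divisor)
{
        return ns_remaining_old * new_divide_count / old_divisor;
}

int main(void)
{
        uint64_t remaining_old = 3000000;   /* 3 ms left on the current timer */
        uint32_t old_div = 2, new_div = 8;  /* guest slows the timer down 4x */

        uint64_t remaining_new = rescale_remaining(remaining_old, new_div, old_div);
        printf("remaining: %llu ns -> %llu ns\n",
               (unsigned long long)remaining_old,
               (unsigned long long)remaining_new);

        /* The deadline moves by the difference, as the tscdeadline adjustment
         * in the patch does (there, converted to TSC cycles first). */
        printf("deadline shifted by %+lld ns\n",
               (long long)(remaining_new - remaining_old));
        return 0;
}
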
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a119b361b8b7..e5e66e5c6640 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -150,6 +150,20 @@ module_param(dbg, bool, 0644);
150/* make pte_list_desc fit well in cache line */ 150/* make pte_list_desc fit well in cache line */
151#define PTE_LIST_EXT 3 151#define PTE_LIST_EXT 3
152 152
153/*
154 * Return values of handle_mmio_page_fault and mmu.page_fault:
155 * RET_PF_RETRY: let CPU fault again on the address.
156 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
157 *
158 * For handle_mmio_page_fault only:
159 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
160 */
161enum {
162 RET_PF_RETRY = 0,
163 RET_PF_EMULATE = 1,
164 RET_PF_INVALID = 2,
165};
166
153struct pte_list_desc { 167struct pte_list_desc {
154 u64 *sptes[PTE_LIST_EXT]; 168 u64 *sptes[PTE_LIST_EXT];
155 struct pte_list_desc *more; 169 struct pte_list_desc *more;
@@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2424 2438
2425static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 2439static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2426{ 2440{
2427 return __shadow_walk_next(iterator, *iterator->sptep); 2441 __shadow_walk_next(iterator, *iterator->sptep);
2428} 2442}
2429 2443
2430static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, 2444static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2794,13 +2808,13 @@ done:
2794 return ret; 2808 return ret;
2795} 2809}
2796 2810
2797static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, 2811static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2798 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, 2812 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
2799 bool speculative, bool host_writable) 2813 bool speculative, bool host_writable)
2800{ 2814{
2801 int was_rmapped = 0; 2815 int was_rmapped = 0;
2802 int rmap_count; 2816 int rmap_count;
2803 bool emulate = false; 2817 int ret = RET_PF_RETRY;
2804 2818
2805 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2819 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2806 *sptep, write_fault, gfn); 2820 *sptep, write_fault, gfn);
@@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2830 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, 2844 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
2831 true, host_writable)) { 2845 true, host_writable)) {
2832 if (write_fault) 2846 if (write_fault)
2833 emulate = true; 2847 ret = RET_PF_EMULATE;
2834 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2848 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2835 } 2849 }
2836 2850
2837 if (unlikely(is_mmio_spte(*sptep))) 2851 if (unlikely(is_mmio_spte(*sptep)))
2838 emulate = true; 2852 ret = RET_PF_EMULATE;
2839 2853
2840 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2854 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2841 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2855 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2855 2869
2856 kvm_release_pfn_clean(pfn); 2870 kvm_release_pfn_clean(pfn);
2857 2871
2858 return emulate; 2872 return ret;
2859} 2873}
2860 2874
2861static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2875static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
2994 * Do not cache the mmio info caused by writing the readonly gfn 3008 * Do not cache the mmio info caused by writing the readonly gfn
2995 * into the spte otherwise read access on readonly gfn also can 3009 * into the spte otherwise read access on readonly gfn also can
2996 * caused mmio page fault and treat it as mmio access. 3010 * caused mmio page fault and treat it as mmio access.
2997 * Return 1 to tell kvm to emulate it.
2998 */ 3011 */
2999 if (pfn == KVM_PFN_ERR_RO_FAULT) 3012 if (pfn == KVM_PFN_ERR_RO_FAULT)
3000 return 1; 3013 return RET_PF_EMULATE;
3001 3014
3002 if (pfn == KVM_PFN_ERR_HWPOISON) { 3015 if (pfn == KVM_PFN_ERR_HWPOISON) {
3003 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); 3016 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3004 return 0; 3017 return RET_PF_RETRY;
3005 } 3018 }
3006 3019
3007 return -EFAULT; 3020 return -EFAULT;
@@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3286 } 3299 }
3287 3300
3288 if (fast_page_fault(vcpu, v, level, error_code)) 3301 if (fast_page_fault(vcpu, v, level, error_code))
3289 return 0; 3302 return RET_PF_RETRY;
3290 3303
3291 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3304 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3292 smp_rmb(); 3305 smp_rmb();
3293 3306
3294 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 3307 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3295 return 0; 3308 return RET_PF_RETRY;
3296 3309
3297 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) 3310 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3298 return r; 3311 return r;
@@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3312out_unlock: 3325out_unlock:
3313 spin_unlock(&vcpu->kvm->mmu_lock); 3326 spin_unlock(&vcpu->kvm->mmu_lock);
3314 kvm_release_pfn_clean(pfn); 3327 kvm_release_pfn_clean(pfn);
3315 return 0; 3328 return RET_PF_RETRY;
3316} 3329}
3317 3330
3318 3331
@@ -3659,54 +3672,38 @@ exit:
3659 return reserved; 3672 return reserved;
3660} 3673}
3661 3674
3662/*
3663 * Return values of handle_mmio_page_fault:
3664 * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
3665 * directly.
3666 * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
3667 * fault path update the mmio spte.
3668 * RET_MMIO_PF_RETRY: let CPU fault again on the address.
3669 * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
3670 */
3671enum {
3672 RET_MMIO_PF_EMULATE = 1,
3673 RET_MMIO_PF_INVALID = 2,
3674 RET_MMIO_PF_RETRY = 0,
3675 RET_MMIO_PF_BUG = -1
3676};
3677
3678static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3675static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3679{ 3676{
3680 u64 spte; 3677 u64 spte;
3681 bool reserved; 3678 bool reserved;
3682 3679
3683 if (mmio_info_in_cache(vcpu, addr, direct)) 3680 if (mmio_info_in_cache(vcpu, addr, direct))
3684 return RET_MMIO_PF_EMULATE; 3681 return RET_PF_EMULATE;
3685 3682
3686 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); 3683 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3687 if (WARN_ON(reserved)) 3684 if (WARN_ON(reserved))
3688 return RET_MMIO_PF_BUG; 3685 return -EINVAL;
3689 3686
3690 if (is_mmio_spte(spte)) { 3687 if (is_mmio_spte(spte)) {
3691 gfn_t gfn = get_mmio_spte_gfn(spte); 3688 gfn_t gfn = get_mmio_spte_gfn(spte);
3692 unsigned access = get_mmio_spte_access(spte); 3689 unsigned access = get_mmio_spte_access(spte);
3693 3690
3694 if (!check_mmio_spte(vcpu, spte)) 3691 if (!check_mmio_spte(vcpu, spte))
3695 return RET_MMIO_PF_INVALID; 3692 return RET_PF_INVALID;
3696 3693
3697 if (direct) 3694 if (direct)
3698 addr = 0; 3695 addr = 0;
3699 3696
3700 trace_handle_mmio_page_fault(addr, gfn, access); 3697 trace_handle_mmio_page_fault(addr, gfn, access);
3701 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3698 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3702 return RET_MMIO_PF_EMULATE; 3699 return RET_PF_EMULATE;
3703 } 3700 }
3704 3701
3705 /* 3702 /*
3706 * If the page table is zapped by other cpus, let CPU fault again on 3703 * If the page table is zapped by other cpus, let CPU fault again on
3707 * the address. 3704 * the address.
3708 */ 3705 */
3709 return RET_MMIO_PF_RETRY; 3706 return RET_PF_RETRY;
3710} 3707}
3711EXPORT_SYMBOL_GPL(handle_mmio_page_fault); 3708EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
3712 3709
@@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3756 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 3753 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3757 3754
3758 if (page_fault_handle_page_track(vcpu, error_code, gfn)) 3755 if (page_fault_handle_page_track(vcpu, error_code, gfn))
3759 return 1; 3756 return RET_PF_EMULATE;
3760 3757
3761 r = mmu_topup_memory_caches(vcpu); 3758 r = mmu_topup_memory_caches(vcpu);
3762 if (r) 3759 if (r)
@@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3820} 3817}
3821 3818
3822int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 3819int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
3823 u64 fault_address, char *insn, int insn_len, 3820 u64 fault_address, char *insn, int insn_len)
3824 bool need_unprotect)
3825{ 3821{
3826 int r = 1; 3822 int r = 1;
3827 3823
@@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
3829 default: 3825 default:
3830 trace_kvm_page_fault(fault_address, error_code); 3826 trace_kvm_page_fault(fault_address, error_code);
3831 3827
3832 if (need_unprotect && kvm_event_needs_reinjection(vcpu)) 3828 if (kvm_event_needs_reinjection(vcpu))
3833 kvm_mmu_unprotect_page_virt(vcpu, fault_address); 3829 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
3834 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 3830 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
3835 insn_len); 3831 insn_len);
@@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3876 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3872 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3877 3873
3878 if (page_fault_handle_page_track(vcpu, error_code, gfn)) 3874 if (page_fault_handle_page_track(vcpu, error_code, gfn))
3879 return 1; 3875 return RET_PF_EMULATE;
3880 3876
3881 r = mmu_topup_memory_caches(vcpu); 3877 r = mmu_topup_memory_caches(vcpu);
3882 if (r) 3878 if (r)
@@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3893 } 3889 }
3894 3890
3895 if (fast_page_fault(vcpu, gpa, level, error_code)) 3891 if (fast_page_fault(vcpu, gpa, level, error_code))
3896 return 0; 3892 return RET_PF_RETRY;
3897 3893
3898 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3894 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3899 smp_rmb(); 3895 smp_rmb();
3900 3896
3901 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3897 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
3902 return 0; 3898 return RET_PF_RETRY;
3903 3899
3904 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) 3900 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
3905 return r; 3901 return r;
@@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3919out_unlock: 3915out_unlock:
3920 spin_unlock(&vcpu->kvm->mmu_lock); 3916 spin_unlock(&vcpu->kvm->mmu_lock);
3921 kvm_release_pfn_clean(pfn); 3917 kvm_release_pfn_clean(pfn);
3922 return 0; 3918 return RET_PF_RETRY;
3923} 3919}
3924 3920
3925static void nonpaging_init_context(struct kvm_vcpu *vcpu, 3921static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4918,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
4918 vcpu->arch.gpa_val = cr2; 4914 vcpu->arch.gpa_val = cr2;
4919 } 4915 }
4920 4916
4917 r = RET_PF_INVALID;
4921 if (unlikely(error_code & PFERR_RSVD_MASK)) { 4918 if (unlikely(error_code & PFERR_RSVD_MASK)) {
4922 r = handle_mmio_page_fault(vcpu, cr2, direct); 4919 r = handle_mmio_page_fault(vcpu, cr2, direct);
4923 if (r == RET_MMIO_PF_EMULATE) { 4920 if (r == RET_PF_EMULATE) {
4924 emulation_type = 0; 4921 emulation_type = 0;
4925 goto emulate; 4922 goto emulate;
4926 } 4923 }
4927 if (r == RET_MMIO_PF_RETRY)
4928 return 1;
4929 if (r < 0)
4930 return r;
4931 /* Must be RET_MMIO_PF_INVALID. */
4932 } 4924 }
4933 4925
4934 r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), 4926 if (r == RET_PF_INVALID) {
4935 false); 4927 r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
4928 false);
4929 WARN_ON(r == RET_PF_INVALID);
4930 }
4931
4932 if (r == RET_PF_RETRY)
4933 return 1;
4936 if (r < 0) 4934 if (r < 0)
4937 return r; 4935 return r;
4938 if (!r)
4939 return 1;
4940 4936
4941 /* 4937 /*
4942 * Before emulating the instruction, check if the error code 4938 * Before emulating the instruction, check if the error code
@@ -4993,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
4993static void free_mmu_pages(struct kvm_vcpu *vcpu) 4989static void free_mmu_pages(struct kvm_vcpu *vcpu)
4994{ 4990{
4995 free_page((unsigned long)vcpu->arch.mmu.pae_root); 4991 free_page((unsigned long)vcpu->arch.mmu.pae_root);
4996 if (vcpu->arch.mmu.lm_root != NULL) 4992 free_page((unsigned long)vcpu->arch.mmu.lm_root);
4997 free_page((unsigned long)vcpu->arch.mmu.lm_root);
4998} 4993}
4999 4994
5000static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 4995static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5464,10 +5459,8 @@ static struct shrinker mmu_shrinker = {
5464 5459
5465static void mmu_destroy_caches(void) 5460static void mmu_destroy_caches(void)
5466{ 5461{
5467 if (pte_list_desc_cache) 5462 kmem_cache_destroy(pte_list_desc_cache);
5468 kmem_cache_destroy(pte_list_desc_cache); 5463 kmem_cache_destroy(mmu_page_header_cache);
5469 if (mmu_page_header_cache)
5470 kmem_cache_destroy(mmu_page_header_cache);
5471} 5464}
5472 5465
5473int kvm_mmu_module_init(void) 5466int kvm_mmu_module_init(void)
@@ -5476,13 +5469,13 @@ int kvm_mmu_module_init(void)
5476 5469
5477 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 5470 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
5478 sizeof(struct pte_list_desc), 5471 sizeof(struct pte_list_desc),
5479 0, 0, NULL); 5472 0, SLAB_ACCOUNT, NULL);
5480 if (!pte_list_desc_cache) 5473 if (!pte_list_desc_cache)
5481 goto nomem; 5474 goto nomem;
5482 5475
5483 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 5476 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
5484 sizeof(struct kvm_mmu_page), 5477 sizeof(struct kvm_mmu_page),
5485 0, 0, NULL); 5478 0, SLAB_ACCOUNT, NULL);
5486 if (!mmu_page_header_cache) 5479 if (!mmu_page_header_cache)
5487 goto nomem; 5480 goto nomem;
5488 5481
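
With the old RET_MMIO_PF_* codes folded into a single RET_PF_* set, kvm_mmu_page_fault() dispatches on one value: RET_PF_RETRY returns to the guest, RET_PF_EMULATE goes to the emulator, RET_PF_INVALID falls through to the regular fault handler, and negative values are errors. A compact sketch of that dispatch follows; handle_mmio_fault() and real_page_fault() are stand-ins for the kernel's handle_mmio_page_fault() and mmu.page_fault().

/* Sketch of the consolidated RET_PF_* dispatch in kvm_mmu_page_fault(). */
#include <stdio.h>

enum {
        RET_PF_RETRY   = 0,   /* let the CPU fault again on the address */
        RET_PF_EMULATE = 1,   /* MMIO (or tracked) access: emulate */
        RET_PF_INVALID = 2,   /* stale MMIO spte: take the real fault path */
};

static int handle_mmio_fault(int stale)
{
        return stale ? RET_PF_INVALID : RET_PF_EMULATE;
}

static int real_page_fault(void)
{
        return RET_PF_RETRY;   /* pretend the mapping was just installed */
}

static int page_fault(int is_mmio, int stale)
{
        int r = RET_PF_INVALID;

        if (is_mmio)
                r = handle_mmio_fault(stale);
        if (r == RET_PF_INVALID)
                r = real_page_fault();

        if (r == RET_PF_RETRY)
                return 1;             /* back to the guest */
        if (r < 0)
                return r;             /* hard error */
        printf("emulating the faulting instruction\n");
        return 1;
}

int main(void)
{
        printf("fresh mmio:  %d\n", page_fault(1, 0));
        printf("stale mmio:  %d\n", page_fault(1, 1));
        printf("normal page: %d\n", page_fault(0, 0));
        return 0;
}

In the kernel the RET_PF_EMULATE path feeds into x86_emulate_instruction(); here it is reduced to a printf.
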
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index efc857615d8e..5b408c0ad612 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
66 bool accessed_dirty); 66 bool accessed_dirty);
67bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); 67bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
68int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 68int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
69 u64 fault_address, char *insn, int insn_len, 69 u64 fault_address, char *insn, int insn_len);
70 bool need_unprotect);
71 70
72static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 71static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
73{ 72{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f18d1f8d332b..5abae72266b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
593 struct kvm_mmu_page *sp = NULL; 593 struct kvm_mmu_page *sp = NULL;
594 struct kvm_shadow_walk_iterator it; 594 struct kvm_shadow_walk_iterator it;
595 unsigned direct_access, access = gw->pt_access; 595 unsigned direct_access, access = gw->pt_access;
596 int top_level, emulate; 596 int top_level, ret;
597 597
598 direct_access = gw->pte_access; 598 direct_access = gw->pte_access;
599 599
@@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
659 } 659 }
660 660
661 clear_sp_write_flooding_count(it.sptep); 661 clear_sp_write_flooding_count(it.sptep);
662 emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, 662 ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
663 it.level, gw->gfn, pfn, prefault, map_writable); 663 it.level, gw->gfn, pfn, prefault, map_writable);
664 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 664 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
665 665
666 return emulate; 666 return ret;
667 667
668out_gpte_changed: 668out_gpte_changed:
669 kvm_release_pfn_clean(pfn); 669 kvm_release_pfn_clean(pfn);
670 return 0; 670 return RET_PF_RETRY;
671} 671}
672 672
673 /* 673 /*
@@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
762 if (!prefault) 762 if (!prefault)
763 inject_page_fault(vcpu, &walker.fault); 763 inject_page_fault(vcpu, &walker.fault);
764 764
765 return 0; 765 return RET_PF_RETRY;
766 } 766 }
767 767
768 if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { 768 if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
769 shadow_page_table_clear_flood(vcpu, addr); 769 shadow_page_table_clear_flood(vcpu, addr);
770 return 1; 770 return RET_PF_EMULATE;
771 } 771 }
772 772
773 vcpu->arch.write_fault_to_shadow_pgtable = false; 773 vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
789 789
790 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, 790 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
791 &map_writable)) 791 &map_writable))
792 return 0; 792 return RET_PF_RETRY;
793 793
794 if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) 794 if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
795 return r; 795 return r;
@@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
834out_unlock: 834out_unlock:
835 spin_unlock(&vcpu->kvm->mmu_lock); 835 spin_unlock(&vcpu->kvm->mmu_lock);
836 kvm_release_pfn_clean(pfn); 836 kvm_release_pfn_clean(pfn);
837 return 0; 837 return RET_PF_RETRY;
838} 838}
839 839
840static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) 840static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..b71daed3cca2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag)
1034 } 1034 }
1035 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1035 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1036 1036
1037 if (!vcpu)
1038 return 0;
1039
1040 /* Note: 1037 /* Note:
1041 * At this point, the IOMMU should have already set the pending 1038 * At this point, the IOMMU should have already set the pending
1042 * bit in the vAPIC backing page. So, we just need to schedule 1039 * bit in the vAPIC backing page. So, we just need to schedule
1043 * in the vcpu. 1040 * in the vcpu.
1044 */ 1041 */
1045 if (vcpu->mode == OUTSIDE_GUEST_MODE) 1042 if (vcpu)
1046 kvm_vcpu_wake_up(vcpu); 1043 kvm_vcpu_wake_up(vcpu);
1047 1044
1048 return 0; 1045 return 0;
@@ -2144,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm)
2144 2141
2145 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, 2142 return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
2146 svm->vmcb->control.insn_bytes, 2143 svm->vmcb->control.insn_bytes,
2147 svm->vmcb->control.insn_len, !npt_enabled); 2144 svm->vmcb->control.insn_len);
2145}
2146
2147static int npf_interception(struct vcpu_svm *svm)
2148{
2149 u64 fault_address = svm->vmcb->control.exit_info_2;
2150 u64 error_code = svm->vmcb->control.exit_info_1;
2151
2152 trace_kvm_page_fault(fault_address, error_code);
2153 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
2154 svm->vmcb->control.insn_bytes,
2155 svm->vmcb->control.insn_len);
2148} 2156}
2149 2157
2150static int db_interception(struct vcpu_svm *svm) 2158static int db_interception(struct vcpu_svm *svm)
@@ -2916,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
2916 return true; 2924 return true;
2917} 2925}
2918 2926
2919static bool nested_svm_vmrun(struct vcpu_svm *svm) 2927static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
2928 struct vmcb *nested_vmcb, struct page *page)
2920{ 2929{
2921 struct vmcb *nested_vmcb;
2922 struct vmcb *hsave = svm->nested.hsave;
2923 struct vmcb *vmcb = svm->vmcb;
2924 struct page *page;
2925 u64 vmcb_gpa;
2926
2927 vmcb_gpa = svm->vmcb->save.rax;
2928
2929 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2930 if (!nested_vmcb)
2931 return false;
2932
2933 if (!nested_vmcb_checks(nested_vmcb)) {
2934 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2935 nested_vmcb->control.exit_code_hi = 0;
2936 nested_vmcb->control.exit_info_1 = 0;
2937 nested_vmcb->control.exit_info_2 = 0;
2938
2939 nested_svm_unmap(page);
2940
2941 return false;
2942 }
2943
2944 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2945 nested_vmcb->save.rip,
2946 nested_vmcb->control.int_ctl,
2947 nested_vmcb->control.event_inj,
2948 nested_vmcb->control.nested_ctl);
2949
2950 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2951 nested_vmcb->control.intercept_cr >> 16,
2952 nested_vmcb->control.intercept_exceptions,
2953 nested_vmcb->control.intercept);
2954
2955 /* Clear internal status */
2956 kvm_clear_exception_queue(&svm->vcpu);
2957 kvm_clear_interrupt_queue(&svm->vcpu);
2958
2959 /*
2960 * Save the old vmcb, so we don't need to pick what we save, but can
2961 * restore everything when a VMEXIT occurs
2962 */
2963 hsave->save.es = vmcb->save.es;
2964 hsave->save.cs = vmcb->save.cs;
2965 hsave->save.ss = vmcb->save.ss;
2966 hsave->save.ds = vmcb->save.ds;
2967 hsave->save.gdtr = vmcb->save.gdtr;
2968 hsave->save.idtr = vmcb->save.idtr;
2969 hsave->save.efer = svm->vcpu.arch.efer;
2970 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2971 hsave->save.cr4 = svm->vcpu.arch.cr4;
2972 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2973 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2974 hsave->save.rsp = vmcb->save.rsp;
2975 hsave->save.rax = vmcb->save.rax;
2976 if (npt_enabled)
2977 hsave->save.cr3 = vmcb->save.cr3;
2978 else
2979 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2980
2981 copy_vmcb_control_area(hsave, vmcb);
2982
2983 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) 2930 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2984 svm->vcpu.arch.hflags |= HF_HIF_MASK; 2931 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2985 else 2932 else
@@ -3072,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
3072 enable_gif(svm); 3019 enable_gif(svm);
3073 3020
3074 mark_all_dirty(svm->vmcb); 3021 mark_all_dirty(svm->vmcb);
3022}
3023
3024static bool nested_svm_vmrun(struct vcpu_svm *svm)
3025{
3026 struct vmcb *nested_vmcb;
3027 struct vmcb *hsave = svm->nested.hsave;
3028 struct vmcb *vmcb = svm->vmcb;
3029 struct page *page;
3030 u64 vmcb_gpa;
3031
3032 vmcb_gpa = svm->vmcb->save.rax;
3033
3034 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3035 if (!nested_vmcb)
3036 return false;
3037
3038 if (!nested_vmcb_checks(nested_vmcb)) {
3039 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
3040 nested_vmcb->control.exit_code_hi = 0;
3041 nested_vmcb->control.exit_info_1 = 0;
3042 nested_vmcb->control.exit_info_2 = 0;
3043
3044 nested_svm_unmap(page);
3045
3046 return false;
3047 }
3048
3049 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
3050 nested_vmcb->save.rip,
3051 nested_vmcb->control.int_ctl,
3052 nested_vmcb->control.event_inj,
3053 nested_vmcb->control.nested_ctl);
3054
3055 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
3056 nested_vmcb->control.intercept_cr >> 16,
3057 nested_vmcb->control.intercept_exceptions,
3058 nested_vmcb->control.intercept);
3059
3060 /* Clear internal status */
3061 kvm_clear_exception_queue(&svm->vcpu);
3062 kvm_clear_interrupt_queue(&svm->vcpu);
3063
3064 /*
3065 * Save the old vmcb, so we don't need to pick what we save, but can
3066 * restore everything when a VMEXIT occurs
3067 */
3068 hsave->save.es = vmcb->save.es;
3069 hsave->save.cs = vmcb->save.cs;
3070 hsave->save.ss = vmcb->save.ss;
3071 hsave->save.ds = vmcb->save.ds;
3072 hsave->save.gdtr = vmcb->save.gdtr;
3073 hsave->save.idtr = vmcb->save.idtr;
3074 hsave->save.efer = svm->vcpu.arch.efer;
3075 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
3076 hsave->save.cr4 = svm->vcpu.arch.cr4;
3077 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
3078 hsave->save.rip = kvm_rip_read(&svm->vcpu);
3079 hsave->save.rsp = vmcb->save.rsp;
3080 hsave->save.rax = vmcb->save.rax;
3081 if (npt_enabled)
3082 hsave->save.cr3 = vmcb->save.cr3;
3083 else
3084 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
3085
3086 copy_vmcb_control_area(hsave, vmcb);
3087
3088 enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
3075 3089
3076 return true; 3090 return true;
3077} 3091}
@@ -3173,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm)
3173 3187
3174 /* 3188 /*
3175 * If VGIF is enabled, the STGI intercept is only added to 3189 * If VGIF is enabled, the STGI intercept is only added to
3176 * detect the opening of the NMI window; remove it now. 3190 * detect the opening of the SMI/NMI window; remove it now.
3177 */ 3191 */
3178 if (vgif_enabled(svm)) 3192 if (vgif_enabled(svm))
3179 clr_intercept(svm, INTERCEPT_STGI); 3193 clr_intercept(svm, INTERCEPT_STGI);
@@ -4131,7 +4145,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4131 [SVM_EXIT_MONITOR] = monitor_interception, 4145 [SVM_EXIT_MONITOR] = monitor_interception,
4132 [SVM_EXIT_MWAIT] = mwait_interception, 4146 [SVM_EXIT_MWAIT] = mwait_interception,
4133 [SVM_EXIT_XSETBV] = xsetbv_interception, 4147 [SVM_EXIT_XSETBV] = xsetbv_interception,
4134 [SVM_EXIT_NPF] = pf_interception, 4148 [SVM_EXIT_NPF] = npf_interception,
4135 [SVM_EXIT_RSM] = emulate_on_interception, 4149 [SVM_EXIT_RSM] = emulate_on_interception,
4136 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 4150 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
4137 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 4151 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -5393,6 +5407,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
5393 vcpu->arch.mcg_cap &= 0x1ff; 5407 vcpu->arch.mcg_cap &= 0x1ff;
5394} 5408}
5395 5409
5410static int svm_smi_allowed(struct kvm_vcpu *vcpu)
5411{
5412 struct vcpu_svm *svm = to_svm(vcpu);
5413
5414 /* Per APM Vol.2 15.22.2 "Response to SMI" */
5415 if (!gif_set(svm))
5416 return 0;
5417
5418 if (is_guest_mode(&svm->vcpu) &&
5419 svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
5420 /* TODO: Might need to set exit_info_1 and exit_info_2 here */
5421 svm->vmcb->control.exit_code = SVM_EXIT_SMI;
5422 svm->nested.exit_required = true;
5423 return 0;
5424 }
5425
5426 return 1;
5427}
5428
5429static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
5430{
5431 struct vcpu_svm *svm = to_svm(vcpu);
5432 int ret;
5433
5434 if (is_guest_mode(vcpu)) {
5435 /* FED8h - SVM Guest */
5436 put_smstate(u64, smstate, 0x7ed8, 1);
5437 /* FEE0h - SVM Guest VMCB Physical Address */
5438 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
5439
5440 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
5441 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
5442 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
5443
5444 ret = nested_svm_vmexit(svm);
5445 if (ret)
5446 return ret;
5447 }
5448 return 0;
5449}
5450
5451static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
5452{
5453 struct vcpu_svm *svm = to_svm(vcpu);
5454 struct vmcb *nested_vmcb;
5455 struct page *page;
5456 struct {
5457 u64 guest;
5458 u64 vmcb;
5459 } svm_state_save;
5460 int ret;
5461
5462 ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
5463 sizeof(svm_state_save));
5464 if (ret)
5465 return ret;
5466
5467 if (svm_state_save.guest) {
5468 vcpu->arch.hflags &= ~HF_SMM_MASK;
5469 nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
5470 if (nested_vmcb)
5471 enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
5472 else
5473 ret = 1;
5474 vcpu->arch.hflags |= HF_SMM_MASK;
5475 }
5476 return ret;
5477}
5478
5479static int enable_smi_window(struct kvm_vcpu *vcpu)
5480{
5481 struct vcpu_svm *svm = to_svm(vcpu);
5482
5483 if (!gif_set(svm)) {
5484 if (vgif_enabled(svm))
5485 set_intercept(svm, INTERCEPT_STGI);
5486 /* STGI will cause a vm exit */
5487 return 1;
5488 }
5489 return 0;
5490}
5491
5396static struct kvm_x86_ops svm_x86_ops __ro_after_init = { 5492static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5397 .cpu_has_kvm_support = has_svm, 5493 .cpu_has_kvm_support = has_svm,
5398 .disabled_by_bios = is_disabled, 5494 .disabled_by_bios = is_disabled,
@@ -5503,6 +5599,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5503 .deliver_posted_interrupt = svm_deliver_avic_intr, 5599 .deliver_posted_interrupt = svm_deliver_avic_intr,
5504 .update_pi_irte = svm_update_pi_irte, 5600 .update_pi_irte = svm_update_pi_irte,
5505 .setup_mce = svm_setup_mce, 5601 .setup_mce = svm_setup_mce,
5602
5603 .smi_allowed = svm_smi_allowed,
5604 .pre_enter_smm = svm_pre_enter_smm,
5605 .pre_leave_smm = svm_pre_leave_smm,
5606 .enable_smi_window = enable_smi_window,
5506}; 5607};
5507 5608
5508static int __init svm_init(void) 5609static int __init svm_init(void)
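
The new svm_pre_enter_smm()/svm_pre_leave_smm() hooks above record whether the vCPU was running a nested guest by writing a flag and the nested VMCB address at state-save offsets FED8h/FEE0h, and read them back on RSM to decide whether to call enter_svm_guest_mode() again. A small stand-alone model of that round trip through a 512-byte buffer; the put/get macros below mirror the put_smstate() definition removed from x86.c later in this diff, the VMCB address is hypothetical.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 512-byte model of the SMM state-save area that enter_smm() writes to
 * guest memory at smbase + 0xfe00; backed by a union to keep it aligned. */
static union {
	uint8_t  bytes[512];
	uint64_t align;
} smram;

/* Analogue of the put_smstate() macro removed from x86.c further down:
 * architectural FExxh offsets are rebased onto the 512-byte buffer.     */
#define put_smstate(type, buf, offset, val) \
	(*(type *)((buf) + (offset) - 0x7e00) = (val))
#define get_smstate(type, buf, offset) \
	(*(type *)((buf) + (offset) - 0x7e00))

int main(void)
{
	uint64_t vmcb_gpa = 0x123000;	/* hypothetical nested VMCB address */

	memset(smram.bytes, 0, sizeof(smram.bytes));

	/* What svm_pre_enter_smm() records when an SMI interrupts L2: */
	put_smstate(uint64_t, smram.bytes, 0x7ed8, 1);		/* FED8h: SVM guest flag */
	put_smstate(uint64_t, smram.bytes, 0x7ee0, vmcb_gpa);	/* FEE0h: VMCB address   */

	/* What svm_pre_leave_smm() reads back on RSM before deciding
	 * whether to call enter_svm_guest_mode() again:              */
	if (get_smstate(uint64_t, smram.bytes, 0x7ed8))
		printf("RSM: re-enter the nested guest, VMCB at 0x%llx\n",
		       (unsigned long long)get_smstate(uint64_t, smram.bytes, 0x7ee0));
	else
		printf("RSM: resume without re-entering guest mode\n");

	return 0;
}
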
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..7c3522a989d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -486,6 +486,14 @@ struct nested_vmx {
486 u64 nested_vmx_cr4_fixed1; 486 u64 nested_vmx_cr4_fixed1;
487 u64 nested_vmx_vmcs_enum; 487 u64 nested_vmx_vmcs_enum;
488 u64 nested_vmx_vmfunc_controls; 488 u64 nested_vmx_vmfunc_controls;
489
490 /* SMM related state */
491 struct {
492 /* in VMX operation on SMM entry? */
493 bool vmxon;
494 /* in guest mode on SMM entry? */
495 bool guest_mode;
496 } smm;
489}; 497};
490 498
491#define POSTED_INTR_ON 0 499#define POSTED_INTR_ON 0
@@ -900,16 +908,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
900static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); 908static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
901static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); 909static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
902static bool vmx_xsaves_supported(void); 910static bool vmx_xsaves_supported(void);
903static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
904static void vmx_set_segment(struct kvm_vcpu *vcpu, 911static void vmx_set_segment(struct kvm_vcpu *vcpu,
905 struct kvm_segment *var, int seg); 912 struct kvm_segment *var, int seg);
906static void vmx_get_segment(struct kvm_vcpu *vcpu, 913static void vmx_get_segment(struct kvm_vcpu *vcpu,
907 struct kvm_segment *var, int seg); 914 struct kvm_segment *var, int seg);
908static bool guest_state_valid(struct kvm_vcpu *vcpu); 915static bool guest_state_valid(struct kvm_vcpu *vcpu);
909static u32 vmx_segment_access_rights(struct kvm_segment *var); 916static u32 vmx_segment_access_rights(struct kvm_segment *var);
910static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
911static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); 917static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
912static int alloc_identity_pagetable(struct kvm *kvm);
913static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); 918static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
914static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); 919static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
915static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 920static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -1598,18 +1603,15 @@ static inline void vpid_sync_context(int vpid)
1598 1603
1599static inline void ept_sync_global(void) 1604static inline void ept_sync_global(void)
1600{ 1605{
1601 if (cpu_has_vmx_invept_global()) 1606 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1602 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1603} 1607}
1604 1608
1605static inline void ept_sync_context(u64 eptp) 1609static inline void ept_sync_context(u64 eptp)
1606{ 1610{
1607 if (enable_ept) { 1611 if (cpu_has_vmx_invept_context())
1608 if (cpu_has_vmx_invept_context()) 1612 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1609 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 1613 else
1610 else 1614 ept_sync_global();
1611 ept_sync_global();
1612 }
1613} 1615}
1614 1616
1615static __always_inline void vmcs_check16(unsigned long field) 1617static __always_inline void vmcs_check16(unsigned long field)
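
ept_sync_global() above no longer checks for global INVEPT support (hardware_setup() now clears enable_ept when it is missing, see further down), and ept_sync_context() falls back to a global flush when single-context INVEPT is unavailable. A stand-alone sketch of that fallback, with plain booleans standing in for the capability checks and print statements standing in for the INVEPT instruction.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical capability bits; the real ones come from the
 * IA32_VMX_EPT_VPID_CAP MSR. */
static bool has_invept_context = false;
static bool has_invept_global  = true;	/* required before enable_ept stays set */

static void invept_global(void)
{
	puts("INVEPT: global flush");
}

static void invept_context(unsigned long long eptp)
{
	printf("INVEPT: single-context flush of eptp 0x%llx\n", eptp);
}

/* Mirrors the reworked ept_sync_context(): prefer the precise flush,
 * otherwise fall back to flushing every EPT context. */
static void ept_sync_context(unsigned long long eptp)
{
	if (has_invept_context)
		invept_context(eptp);
	else
		invept_global();
}

int main(void)
{
	(void)has_invept_global;
	ept_sync_context(0x1000);
	has_invept_context = true;
	ept_sync_context(0x1000);
	return 0;
}
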
@@ -2831,8 +2833,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2831 SECONDARY_EXEC_ENABLE_PML; 2833 SECONDARY_EXEC_ENABLE_PML;
2832 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; 2834 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2833 } 2835 }
2834 } else 2836 }
2835 vmx->nested.nested_vmx_ept_caps = 0;
2836 2837
2837 if (cpu_has_vmx_vmfunc()) { 2838 if (cpu_has_vmx_vmfunc()) {
2838 vmx->nested.nested_vmx_secondary_ctls_high |= 2839 vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -2841,8 +2842,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2841 * Advertise EPTP switching unconditionally 2842 * Advertise EPTP switching unconditionally
2842 * since we emulate it 2843 * since we emulate it
2843 */ 2844 */
2844 vmx->nested.nested_vmx_vmfunc_controls = 2845 if (enable_ept)
2845 VMX_VMFUNC_EPTP_SWITCHING; 2846 vmx->nested.nested_vmx_vmfunc_controls =
2847 VMX_VMFUNC_EPTP_SWITCHING;
2846 } 2848 }
2847 2849
2848 /* 2850 /*
@@ -2856,8 +2858,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2856 SECONDARY_EXEC_ENABLE_VPID; 2858 SECONDARY_EXEC_ENABLE_VPID;
2857 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | 2859 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
2858 VMX_VPID_EXTENT_SUPPORTED_MASK; 2860 VMX_VPID_EXTENT_SUPPORTED_MASK;
2859 } else 2861 }
2860 vmx->nested.nested_vmx_vpid_caps = 0;
2861 2862
2862 if (enable_unrestricted_guest) 2863 if (enable_unrestricted_guest)
2863 vmx->nested.nested_vmx_secondary_ctls_high |= 2864 vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -3544,7 +3545,8 @@ static int hardware_enable(void)
3544 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 3545 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3545 } 3546 }
3546 kvm_cpu_vmxon(phys_addr); 3547 kvm_cpu_vmxon(phys_addr);
3547 ept_sync_global(); 3548 if (enable_ept)
3549 ept_sync_global();
3548 3550
3549 return 0; 3551 return 0;
3550} 3552}
@@ -3657,8 +3659,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3657 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3659 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3658 SECONDARY_EXEC_SHADOW_VMCS | 3660 SECONDARY_EXEC_SHADOW_VMCS |
3659 SECONDARY_EXEC_XSAVES | 3661 SECONDARY_EXEC_XSAVES |
3660 SECONDARY_EXEC_RDSEED | 3662 SECONDARY_EXEC_RDSEED_EXITING |
3661 SECONDARY_EXEC_RDRAND | 3663 SECONDARY_EXEC_RDRAND_EXITING |
3662 SECONDARY_EXEC_ENABLE_PML | 3664 SECONDARY_EXEC_ENABLE_PML |
3663 SECONDARY_EXEC_TSC_SCALING | 3665 SECONDARY_EXEC_TSC_SCALING |
3664 SECONDARY_EXEC_ENABLE_VMFUNC; 3666 SECONDARY_EXEC_ENABLE_VMFUNC;
@@ -3679,14 +3681,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3679 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3681 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3680 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3682 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3681 3683
3684 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
3685 &vmx_capability.ept, &vmx_capability.vpid);
3686
3682 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 3687 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3683 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 3688 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
3684 enabled */ 3689 enabled */
3685 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 3690 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3686 CPU_BASED_CR3_STORE_EXITING | 3691 CPU_BASED_CR3_STORE_EXITING |
3687 CPU_BASED_INVLPG_EXITING); 3692 CPU_BASED_INVLPG_EXITING);
3688 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 3693 } else if (vmx_capability.ept) {
3689 vmx_capability.ept, vmx_capability.vpid); 3694 vmx_capability.ept = 0;
3695 pr_warn_once("EPT CAP should not exist if not support "
3696 "1-setting enable EPT VM-execution control\n");
3697 }
3698 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3699 vmx_capability.vpid) {
3700 vmx_capability.vpid = 0;
3701 pr_warn_once("VPID CAP should not exist if not support "
3702 "1-setting enable VPID VM-execution control\n");
3690 } 3703 }
3691 3704
3692 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; 3705 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
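
setup_vmcs_config() above now reads IA32_VMX_EPT_VPID_CAP unconditionally and then clears vmx_capability.ept/.vpid, with a one-time warning, if the matching secondary execution control turned out to be unsupported. A user-space sketch of that consistency check; the control bit values and sample MSR contents below are illustrative stand-ins.

#include <stdio.h>

#define SECONDARY_EXEC_ENABLE_EPT	(1u << 1)	/* illustrative bit values */
#define SECONDARY_EXEC_ENABLE_VPID	(1u << 5)

struct vmx_capability { unsigned int ept, vpid; };

/* Mirrors the new setup_vmcs_config() logic: a feature's capability
 * bits are only trusted if the execution control enabling the feature
 * is itself supported; otherwise they are cleared with a warning.     */
static void sanitize_caps(struct vmx_capability *cap, unsigned int ctl2)
{
	if (!(ctl2 & SECONDARY_EXEC_ENABLE_EPT) && cap->ept) {
		cap->ept = 0;
		fprintf(stderr, "warning: EPT caps reported without EPT control\n");
	}
	if (!(ctl2 & SECONDARY_EXEC_ENABLE_VPID) && cap->vpid) {
		cap->vpid = 0;
		fprintf(stderr, "warning: VPID caps reported without VPID control\n");
	}
}

int main(void)
{
	struct vmx_capability cap = { .ept = 0x14141, .vpid = 0xf01 };

	sanitize_caps(&cap, SECONDARY_EXEC_ENABLE_EPT);	/* VPID control missing */
	printf("ept=0x%x vpid=0x%x\n", cap.ept, cap.vpid);
	return 0;
}
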
@@ -4781,18 +4794,18 @@ static int init_rmode_identity_map(struct kvm *kvm)
4781 kvm_pfn_t identity_map_pfn; 4794 kvm_pfn_t identity_map_pfn;
4782 u32 tmp; 4795 u32 tmp;
4783 4796
4784 if (!enable_ept)
4785 return 0;
4786
4787 /* Protect kvm->arch.ept_identity_pagetable_done. */ 4797 /* Protect kvm->arch.ept_identity_pagetable_done. */
4788 mutex_lock(&kvm->slots_lock); 4798 mutex_lock(&kvm->slots_lock);
4789 4799
4790 if (likely(kvm->arch.ept_identity_pagetable_done)) 4800 if (likely(kvm->arch.ept_identity_pagetable_done))
4791 goto out2; 4801 goto out2;
4792 4802
4803 if (!kvm->arch.ept_identity_map_addr)
4804 kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4793 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 4805 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4794 4806
4795 r = alloc_identity_pagetable(kvm); 4807 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4808 kvm->arch.ept_identity_map_addr, PAGE_SIZE);
4796 if (r < 0) 4809 if (r < 0)
4797 goto out2; 4810 goto out2;
4798 4811
@@ -4864,20 +4877,6 @@ out:
4864 return r; 4877 return r;
4865} 4878}
4866 4879
4867static int alloc_identity_pagetable(struct kvm *kvm)
4868{
4869 /* Called with kvm->slots_lock held. */
4870
4871 int r = 0;
4872
4873 BUG_ON(kvm->arch.ept_identity_pagetable_done);
4874
4875 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4876 kvm->arch.ept_identity_map_addr, PAGE_SIZE);
4877
4878 return r;
4879}
4880
4881static int allocate_vpid(void) 4880static int allocate_vpid(void)
4882{ 4881{
4883 int vpid; 4882 int vpid;
@@ -5282,13 +5281,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
5282static bool vmx_rdrand_supported(void) 5281static bool vmx_rdrand_supported(void)
5283{ 5282{
5284 return vmcs_config.cpu_based_2nd_exec_ctrl & 5283 return vmcs_config.cpu_based_2nd_exec_ctrl &
5285 SECONDARY_EXEC_RDRAND; 5284 SECONDARY_EXEC_RDRAND_EXITING;
5286} 5285}
5287 5286
5288static bool vmx_rdseed_supported(void) 5287static bool vmx_rdseed_supported(void)
5289{ 5288{
5290 return vmcs_config.cpu_based_2nd_exec_ctrl & 5289 return vmcs_config.cpu_based_2nd_exec_ctrl &
5291 SECONDARY_EXEC_RDSEED; 5290 SECONDARY_EXEC_RDSEED_EXITING;
5292} 5291}
5293 5292
5294static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) 5293static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
@@ -5382,30 +5381,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5382 if (vmx_rdrand_supported()) { 5381 if (vmx_rdrand_supported()) {
5383 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); 5382 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
5384 if (rdrand_enabled) 5383 if (rdrand_enabled)
5385 exec_control &= ~SECONDARY_EXEC_RDRAND; 5384 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
5386 5385
5387 if (nested) { 5386 if (nested) {
5388 if (rdrand_enabled) 5387 if (rdrand_enabled)
5389 vmx->nested.nested_vmx_secondary_ctls_high |= 5388 vmx->nested.nested_vmx_secondary_ctls_high |=
5390 SECONDARY_EXEC_RDRAND; 5389 SECONDARY_EXEC_RDRAND_EXITING;
5391 else 5390 else
5392 vmx->nested.nested_vmx_secondary_ctls_high &= 5391 vmx->nested.nested_vmx_secondary_ctls_high &=
5393 ~SECONDARY_EXEC_RDRAND; 5392 ~SECONDARY_EXEC_RDRAND_EXITING;
5394 } 5393 }
5395 } 5394 }
5396 5395
5397 if (vmx_rdseed_supported()) { 5396 if (vmx_rdseed_supported()) {
5398 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); 5397 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
5399 if (rdseed_enabled) 5398 if (rdseed_enabled)
5400 exec_control &= ~SECONDARY_EXEC_RDSEED; 5399 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
5401 5400
5402 if (nested) { 5401 if (nested) {
5403 if (rdseed_enabled) 5402 if (rdseed_enabled)
5404 vmx->nested.nested_vmx_secondary_ctls_high |= 5403 vmx->nested.nested_vmx_secondary_ctls_high |=
5405 SECONDARY_EXEC_RDSEED; 5404 SECONDARY_EXEC_RDSEED_EXITING;
5406 else 5405 else
5407 vmx->nested.nested_vmx_secondary_ctls_high &= 5406 vmx->nested.nested_vmx_secondary_ctls_high &=
5408 ~SECONDARY_EXEC_RDSEED; 5407 ~SECONDARY_EXEC_RDSEED_EXITING;
5409 } 5408 }
5410 } 5409 }
5411 5410
@@ -5426,7 +5425,7 @@ static void ept_set_mmio_spte_mask(void)
5426/* 5425/*
5427 * Sets up the vmcs for emulated real mode. 5426 * Sets up the vmcs for emulated real mode.
5428 */ 5427 */
5429static int vmx_vcpu_setup(struct vcpu_vmx *vmx) 5428static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
5430{ 5429{
5431#ifdef CONFIG_X86_64 5430#ifdef CONFIG_X86_64
5432 unsigned long a; 5431 unsigned long a;
@@ -5539,8 +5538,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
5539 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 5538 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5540 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 5539 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5541 } 5540 }
5542
5543 return 0;
5544} 5541}
5545 5542
5546static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 5543static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5604,6 +5601,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5604 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 5601 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5605 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 5602 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5606 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 5603 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5604 if (kvm_mpx_supported())
5605 vmcs_write64(GUEST_BNDCFGS, 0);
5607 5606
5608 setup_msrs(vmx); 5607 setup_msrs(vmx);
5609 5608
@@ -5912,8 +5911,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5912 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5911 cr2 = vmcs_readl(EXIT_QUALIFICATION);
5913 /* EPT won't cause page fault directly */ 5912 /* EPT won't cause page fault directly */
5914 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); 5913 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
5915 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, 5914 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5916 true);
5917 } 5915 }
5918 5916
5919 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5917 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -6747,16 +6745,14 @@ static __init int hardware_setup(void)
6747 6745
6748 if (!cpu_has_vmx_ept() || 6746 if (!cpu_has_vmx_ept() ||
6749 !cpu_has_vmx_ept_4levels() || 6747 !cpu_has_vmx_ept_4levels() ||
6750 !cpu_has_vmx_ept_mt_wb()) { 6748 !cpu_has_vmx_ept_mt_wb() ||
6749 !cpu_has_vmx_invept_global())
6751 enable_ept = 0; 6750 enable_ept = 0;
6752 enable_unrestricted_guest = 0;
6753 enable_ept_ad_bits = 0;
6754 }
6755 6751
6756 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 6752 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
6757 enable_ept_ad_bits = 0; 6753 enable_ept_ad_bits = 0;
6758 6754
6759 if (!cpu_has_vmx_unrestricted_guest()) 6755 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
6760 enable_unrestricted_guest = 0; 6756 enable_unrestricted_guest = 0;
6761 6757
6762 if (!cpu_has_vmx_flexpriority()) 6758 if (!cpu_has_vmx_flexpriority())
@@ -6776,8 +6772,13 @@ static __init int hardware_setup(void)
6776 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 6772 if (enable_ept && !cpu_has_vmx_ept_2m_page())
6777 kvm_disable_largepages(); 6773 kvm_disable_largepages();
6778 6774
6779 if (!cpu_has_vmx_ple()) 6775 if (!cpu_has_vmx_ple()) {
6780 ple_gap = 0; 6776 ple_gap = 0;
6777 ple_window = 0;
6778 ple_window_grow = 0;
6779 ple_window_max = 0;
6780 ple_window_shrink = 0;
6781 }
6781 6782
6782 if (!cpu_has_vmx_apicv()) { 6783 if (!cpu_has_vmx_apicv()) {
6783 enable_apicv = 0; 6784 enable_apicv = 0;
@@ -8415,9 +8416,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
8415 case EXIT_REASON_RDPMC: 8416 case EXIT_REASON_RDPMC:
8416 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 8417 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
8417 case EXIT_REASON_RDRAND: 8418 case EXIT_REASON_RDRAND:
8418 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); 8419 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
8419 case EXIT_REASON_RDSEED: 8420 case EXIT_REASON_RDSEED:
8420 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); 8421 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
8421 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 8422 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
8422 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 8423 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
8423 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 8424 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -9475,7 +9476,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
9475 vmx->loaded_vmcs = vmcs; 9476 vmx->loaded_vmcs = vmcs;
9476 vmx_vcpu_put(vcpu); 9477 vmx_vcpu_put(vcpu);
9477 vmx_vcpu_load(vcpu, cpu); 9478 vmx_vcpu_load(vcpu, cpu);
9478 vcpu->cpu = cpu;
9479 put_cpu(); 9479 put_cpu();
9480} 9480}
9481 9481
@@ -9556,11 +9556,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
9556 cpu = get_cpu(); 9556 cpu = get_cpu();
9557 vmx_vcpu_load(&vmx->vcpu, cpu); 9557 vmx_vcpu_load(&vmx->vcpu, cpu);
9558 vmx->vcpu.cpu = cpu; 9558 vmx->vcpu.cpu = cpu;
9559 err = vmx_vcpu_setup(vmx); 9559 vmx_vcpu_setup(vmx);
9560 vmx_vcpu_put(&vmx->vcpu); 9560 vmx_vcpu_put(&vmx->vcpu);
9561 put_cpu(); 9561 put_cpu();
9562 if (err)
9563 goto free_vmcs;
9564 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 9562 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9565 err = alloc_apic_access_page(kvm); 9563 err = alloc_apic_access_page(kvm);
9566 if (err) 9564 if (err)
@@ -9568,9 +9566,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
9568 } 9566 }
9569 9567
9570 if (enable_ept) { 9568 if (enable_ept) {
9571 if (!kvm->arch.ept_identity_map_addr)
9572 kvm->arch.ept_identity_map_addr =
9573 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
9574 err = init_rmode_identity_map(kvm); 9569 err = init_rmode_identity_map(kvm);
9575 if (err) 9570 if (err)
9576 goto free_vmcs; 9571 goto free_vmcs;
@@ -11325,6 +11320,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
11325 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 11320 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
11326 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 11321 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
11327 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 11322 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
11323 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
11324 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
11328 11325
11329 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 11326 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
11330 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 11327 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -11421,8 +11418,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
11421 leave_guest_mode(vcpu); 11418 leave_guest_mode(vcpu);
11422 11419
11423 if (likely(!vmx->fail)) { 11420 if (likely(!vmx->fail)) {
11424 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 11421 if (exit_reason == -1)
11425 exit_qualification); 11422 sync_vmcs12(vcpu, vmcs12);
11423 else
11424 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
11425 exit_qualification);
11426 11426
11427 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 11427 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
11428 vmcs12->vm_exit_msr_store_count)) 11428 vmcs12->vm_exit_msr_store_count))
@@ -11486,7 +11486,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
11486 */ 11486 */
11487 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 11487 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
11488 11488
11489 if (enable_shadow_vmcs) 11489 if (enable_shadow_vmcs && exit_reason != -1)
11490 vmx->nested.sync_shadow_vmcs = true; 11490 vmx->nested.sync_shadow_vmcs = true;
11491 11491
11492 /* in case we halted in L2 */ 11492 /* in case we halted in L2 */
@@ -11510,12 +11510,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
11510 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 11510 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
11511 } 11511 }
11512 11512
11513 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 11513 if (exit_reason != -1)
11514 vmcs12->exit_qualification, 11514 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
11515 vmcs12->idt_vectoring_info_field, 11515 vmcs12->exit_qualification,
11516 vmcs12->vm_exit_intr_info, 11516 vmcs12->idt_vectoring_info_field,
11517 vmcs12->vm_exit_intr_error_code, 11517 vmcs12->vm_exit_intr_info,
11518 KVM_ISA_VMX); 11518 vmcs12->vm_exit_intr_error_code,
11519 KVM_ISA_VMX);
11519 11520
11520 load_vmcs12_host_state(vcpu, vmcs12); 11521 load_vmcs12_host_state(vcpu, vmcs12);
11521 11522
@@ -11938,6 +11939,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
11938 ~FEATURE_CONTROL_LMCE; 11939 ~FEATURE_CONTROL_LMCE;
11939} 11940}
11940 11941
11942static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
11943{
11944 /* we need a nested vmexit to enter SMM, postpone if run is pending */
11945 if (to_vmx(vcpu)->nested.nested_run_pending)
11946 return 0;
11947 return 1;
11948}
11949
11950static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
11951{
11952 struct vcpu_vmx *vmx = to_vmx(vcpu);
11953
11954 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
11955 if (vmx->nested.smm.guest_mode)
11956 nested_vmx_vmexit(vcpu, -1, 0, 0);
11957
11958 vmx->nested.smm.vmxon = vmx->nested.vmxon;
11959 vmx->nested.vmxon = false;
11960 return 0;
11961}
11962
11963static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
11964{
11965 struct vcpu_vmx *vmx = to_vmx(vcpu);
11966 int ret;
11967
11968 if (vmx->nested.smm.vmxon) {
11969 vmx->nested.vmxon = true;
11970 vmx->nested.smm.vmxon = false;
11971 }
11972
11973 if (vmx->nested.smm.guest_mode) {
11974 vcpu->arch.hflags &= ~HF_SMM_MASK;
11975 ret = enter_vmx_non_root_mode(vcpu, false);
11976 vcpu->arch.hflags |= HF_SMM_MASK;
11977 if (ret)
11978 return ret;
11979
11980 vmx->nested.smm.guest_mode = false;
11981 }
11982 return 0;
11983}
11984
11985static int enable_smi_window(struct kvm_vcpu *vcpu)
11986{
11987 return 0;
11988}
11989
11941static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { 11990static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
11942 .cpu_has_kvm_support = cpu_has_kvm_support, 11991 .cpu_has_kvm_support = cpu_has_kvm_support,
11943 .disabled_by_bios = vmx_disabled_by_bios, 11992 .disabled_by_bios = vmx_disabled_by_bios,
@@ -12063,6 +12112,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12063#endif 12112#endif
12064 12113
12065 .setup_mce = vmx_setup_mce, 12114 .setup_mce = vmx_setup_mce,
12115
12116 .smi_allowed = vmx_smi_allowed,
12117 .pre_enter_smm = vmx_pre_enter_smm,
12118 .pre_leave_smm = vmx_pre_leave_smm,
12119 .enable_smi_window = enable_smi_window,
12066}; 12120};
12067 12121
12068static int __init vmx_init(void) 12122static int __init vmx_init(void)
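
Taken together, vmx_pre_enter_smm()/vmx_pre_leave_smm() stash whether the vCPU was in VMX operation and in guest mode when the SMI arrived, and undo both on RSM. A compact sketch of that bookkeeping as a plain state machine; the flag names echo the new nested.smm fields, everything else below is a stub.

#include <stdbool.h>
#include <stdio.h>

struct vmx_smm_state {
	bool vmxon;		/* was in VMX operation on SMM entry */
	bool guest_mode;	/* was running L2 on SMM entry       */
};

struct vcpu {
	bool vmxon, guest_mode;
	struct vmx_smm_state smm;
};

/* Mirrors vmx_pre_enter_smm(): remember the nested state, then leave
 * guest mode (modelled here by just clearing the flags).              */
static void pre_enter_smm(struct vcpu *v)
{
	v->smm.guest_mode = v->guest_mode;
	if (v->guest_mode)
		v->guest_mode = false;	/* nested_vmx_vmexit(vcpu, -1, 0, 0) */

	v->smm.vmxon = v->vmxon;
	v->vmxon = false;
}

/* Mirrors vmx_pre_leave_smm(): restore VMX operation first, then
 * re-enter L2 if that is where the SMI interrupted us.               */
static void pre_leave_smm(struct vcpu *v)
{
	if (v->smm.vmxon) {
		v->vmxon = true;
		v->smm.vmxon = false;
	}
	if (v->smm.guest_mode) {
		v->guest_mode = true;	/* enter_vmx_non_root_mode() */
		v->smm.guest_mode = false;
	}
}

int main(void)
{
	struct vcpu v = { .vmxon = true, .guest_mode = true };

	pre_enter_smm(&v);
	printf("in SMM:    vmxon=%d guest_mode=%d\n", v.vmxon, v.guest_mode);
	pre_leave_smm(&v);
	printf("after RSM: vmxon=%d guest_mode=%d\n", v.vmxon, v.guest_mode);
	return 0;
}
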
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..34c85aa2e2d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2006,10 +2006,12 @@ static void kvmclock_sync_fn(struct work_struct *work)
2006 KVMCLOCK_SYNC_PERIOD); 2006 KVMCLOCK_SYNC_PERIOD);
2007} 2007}
2008 2008
2009static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) 2009static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2010{ 2010{
2011 u64 mcg_cap = vcpu->arch.mcg_cap; 2011 u64 mcg_cap = vcpu->arch.mcg_cap;
2012 unsigned bank_num = mcg_cap & 0xff; 2012 unsigned bank_num = mcg_cap & 0xff;
2013 u32 msr = msr_info->index;
2014 u64 data = msr_info->data;
2013 2015
2014 switch (msr) { 2016 switch (msr) {
2015 case MSR_IA32_MCG_STATUS: 2017 case MSR_IA32_MCG_STATUS:
@@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
2034 if ((offset & 0x3) == 0 && 2036 if ((offset & 0x3) == 0 &&
2035 data != 0 && (data | (1 << 10)) != ~(u64)0) 2037 data != 0 && (data | (1 << 10)) != ~(u64)0)
2036 return -1; 2038 return -1;
2039 if (!msr_info->host_initiated &&
2040 (offset & 0x3) == 1 && data != 0)
2041 return -1;
2037 vcpu->arch.mce_banks[offset] = data; 2042 vcpu->arch.mce_banks[offset] = data;
2038 break; 2043 break;
2039 } 2044 }
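
set_msr_mce() now sees the whole msr_data, so it can distinguish host-initiated writes (state restore from userspace) from guest writes and refuse guest attempts to set a non-zero MCi_STATUS ((offset & 0x3) == 1). A small model of that filter; the bank-layout comment and the sample value below are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model: each MCE bank has 4 MSRs (CTL, STATUS, ADDR, MISC),
 * so "offset & 3 == 1" selects the MCi_STATUS register, as in the hunk
 * above. */
static int set_mce_bank_msr(unsigned offset, unsigned long long data,
			    bool host_initiated)
{
	/* CTL: only all-ones-style patterns allowed (unchanged behaviour) */
	if ((offset & 0x3) == 0 &&
	    data != 0 && (data | (1u << 10)) != ~0ULL)
		return -1;

	/* STATUS: the guest may only clear it; the host (migration,
	 * error injection) may still write arbitrary values.           */
	if (!host_initiated && (offset & 0x3) == 1 && data != 0)
		return -1;

	printf("bank msr offset %u <- 0x%llx\n", offset, data);
	return 0;
}

int main(void)
{
	/* guest trying to plant a machine-check status: refused */
	printf("guest write: %d\n",
	       set_mce_bank_msr(1, 0xb200000000000000ULL, false));
	/* same write performed by the VMM during restore: allowed */
	printf("host write:  %d\n",
	       set_mce_bank_msr(1, 0xb200000000000000ULL, true));
	return 0;
}
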
@@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2283 case MSR_IA32_MCG_CTL: 2288 case MSR_IA32_MCG_CTL:
2284 case MSR_IA32_MCG_STATUS: 2289 case MSR_IA32_MCG_STATUS:
2285 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: 2290 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2286 return set_msr_mce(vcpu, msr, data); 2291 return set_msr_mce(vcpu, msr_info);
2287 2292
2288 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 2293 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2289 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 2294 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4034,10 +4039,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
4034 case KVM_SET_IDENTITY_MAP_ADDR: { 4039 case KVM_SET_IDENTITY_MAP_ADDR: {
4035 u64 ident_addr; 4040 u64 ident_addr;
4036 4041
4042 mutex_lock(&kvm->lock);
4043 r = -EINVAL;
4044 if (kvm->created_vcpus)
4045 goto set_identity_unlock;
4037 r = -EFAULT; 4046 r = -EFAULT;
4038 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 4047 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
4039 goto out; 4048 goto set_identity_unlock;
4040 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 4049 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
4050set_identity_unlock:
4051 mutex_unlock(&kvm->lock);
4041 break; 4052 break;
4042 } 4053 }
4043 case KVM_SET_NR_MMU_PAGES: 4054 case KVM_SET_NR_MMU_PAGES:
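
KVM_SET_IDENTITY_MAP_ADDR is now serialized by kvm->lock and rejected once any vCPU exists, because vCPU creation (via init_rmode_identity_map() in the vmx.c hunks above) consumes the address. A user-space sketch of that "configuration frozen after the first vCPU" pattern, with a pthread mutex standing in for kvm->lock and hypothetical field names.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct vm {
	pthread_mutex_t lock;
	int created_vcpus;
	unsigned long long ident_map_addr;
};

/* Mirrors the reworked KVM_SET_IDENTITY_MAP_ADDR handler: the address
 * may only be changed while the VM still has no vCPUs.               */
static int set_identity_map_addr(struct vm *vm, unsigned long long addr)
{
	int r = 0;

	pthread_mutex_lock(&vm->lock);
	if (vm->created_vcpus)
		r = -EINVAL;
	else
		vm->ident_map_addr = addr;
	pthread_mutex_unlock(&vm->lock);
	return r;
}

int main(void)
{
	struct vm vm = { .created_vcpus = 0 };

	pthread_mutex_init(&vm.lock, NULL);
	printf("before vcpus: %d\n", set_identity_map_addr(&vm, 0xfeffc000));
	vm.created_vcpus = 1;
	printf("after vcpus:  %d\n", set_identity_map_addr(&vm, 0xfeffd000));
	return 0;
}
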
@@ -5275,6 +5286,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
5275 kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); 5286 kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
5276} 5287}
5277 5288
5289static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
5290{
5291 return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
5292}
5293
5278static const struct x86_emulate_ops emulate_ops = { 5294static const struct x86_emulate_ops emulate_ops = {
5279 .read_gpr = emulator_read_gpr, 5295 .read_gpr = emulator_read_gpr,
5280 .write_gpr = emulator_write_gpr, 5296 .write_gpr = emulator_write_gpr,
@@ -5316,6 +5332,7 @@ static const struct x86_emulate_ops emulate_ops = {
5316 .set_nmi_mask = emulator_set_nmi_mask, 5332 .set_nmi_mask = emulator_set_nmi_mask,
5317 .get_hflags = emulator_get_hflags, 5333 .get_hflags = emulator_get_hflags,
5318 .set_hflags = emulator_set_hflags, 5334 .set_hflags = emulator_set_hflags,
5335 .pre_leave_smm = emulator_pre_leave_smm,
5319}; 5336};
5320 5337
5321static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 5338static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6426,7 +6443,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6426 } 6443 }
6427 6444
6428 kvm_x86_ops->queue_exception(vcpu); 6445 kvm_x86_ops->queue_exception(vcpu);
6429 } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 6446 } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
6430 vcpu->arch.smi_pending = false; 6447 vcpu->arch.smi_pending = false;
6431 enter_smm(vcpu); 6448 enter_smm(vcpu);
6432 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { 6449 } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6473,9 +6490,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
6473 kvm_make_request(KVM_REQ_EVENT, vcpu); 6490 kvm_make_request(KVM_REQ_EVENT, vcpu);
6474} 6491}
6475 6492
6476#define put_smstate(type, buf, offset, val) \
6477 *(type *)((buf) + (offset) - 0x7e00) = val
6478
6479static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) 6493static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
6480{ 6494{
6481 u32 flags = 0; 6495 u32 flags = 0;
@@ -6641,13 +6655,20 @@ static void enter_smm(struct kvm_vcpu *vcpu)
6641 u32 cr0; 6655 u32 cr0;
6642 6656
6643 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); 6657 trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
6644 vcpu->arch.hflags |= HF_SMM_MASK;
6645 memset(buf, 0, 512); 6658 memset(buf, 0, 512);
6646 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 6659 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
6647 enter_smm_save_state_64(vcpu, buf); 6660 enter_smm_save_state_64(vcpu, buf);
6648 else 6661 else
6649 enter_smm_save_state_32(vcpu, buf); 6662 enter_smm_save_state_32(vcpu, buf);
6650 6663
6664 /*
6665 * Give pre_enter_smm() a chance to make ISA-specific changes to the
6666 * vCPU state (e.g. leave guest mode) after we've saved the state into
6667 * the SMM state-save area.
6668 */
6669 kvm_x86_ops->pre_enter_smm(vcpu, buf);
6670
6671 vcpu->arch.hflags |= HF_SMM_MASK;
6651 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); 6672 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
6652 6673
6653 if (kvm_x86_ops->get_nmi_mask(vcpu)) 6674 if (kvm_x86_ops->get_nmi_mask(vcpu))
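
enter_smm() above now builds the state-save image and calls the vendor pre_enter_smm() hook before setting HF_SMM_MASK, so the image describes the pre-SMM context and the hook (which may perform a nested vmexit) still runs with the vCPU formally outside SMM. A sketch of that ordering with stub operations and an illustrative flag value.

#include <stdio.h>

#define HF_SMM_MASK	(1u << 6)	/* illustrative flag value */

struct vcpu { unsigned hflags; };

static void save_smm_state(struct vcpu *v, char *buf)
{
	(void)buf;
	/* snapshot of the *pre*-SMM register state goes into buf */
	printf("state saved, hflags=%#x (SMM not yet set)\n", v->hflags);
}

static void pre_enter_smm(struct vcpu *v, char *buf)
{
	(void)buf;
	/* vendor hook: may leave guest mode, amend buf, etc.;
	 * still sees the vCPU outside SMM at this point.        */
	printf("pre_enter_smm, hflags=%#x\n", v->hflags);
}

static void enter_smm(struct vcpu *v)
{
	char buf[512] = { 0 };

	save_smm_state(v, buf);		/* 1. build the state image     */
	pre_enter_smm(v, buf);		/* 2. ISA-specific adjustments  */
	v->hflags |= HF_SMM_MASK;	/* 3. only now enter SMM proper */
	printf("in SMM, hflags=%#x\n", v->hflags);
}

int main(void)
{
	struct vcpu v = { 0 };

	enter_smm(&v);
	return 0;
}
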
@@ -6876,17 +6897,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6876 if (inject_pending_event(vcpu, req_int_win) != 0) 6897 if (inject_pending_event(vcpu, req_int_win) != 0)
6877 req_immediate_exit = true; 6898 req_immediate_exit = true;
6878 else { 6899 else {
6879 /* Enable NMI/IRQ window open exits if needed. 6900 /* Enable SMI/NMI/IRQ window open exits if needed.
6880 * 6901 *
6881 * SMIs have two cases: 1) they can be nested, and 6902 * SMIs have three cases:
6882 * then there is nothing to do here because RSM will 6903 * 1) They can be nested, and then there is nothing to
6883 * cause a vmexit anyway; 2) or the SMI can be pending 6904 * do here because RSM will cause a vmexit anyway.
6884 * because inject_pending_event has completed the 6905 * 2) There is an ISA-specific reason why SMI cannot be
6885 * injection of an IRQ or NMI from the previous vmexit, 6906 * injected, and the moment when this changes can be
6886 * and then we request an immediate exit to inject the SMI. 6907 * intercepted.
6908 * 3) Or the SMI can be pending because
6909 * inject_pending_event has completed the injection
6910 * of an IRQ or NMI from the previous vmexit, and
6911 * then we request an immediate exit to inject the
6912 * SMI.
6887 */ 6913 */
6888 if (vcpu->arch.smi_pending && !is_smm(vcpu)) 6914 if (vcpu->arch.smi_pending && !is_smm(vcpu))
6889 req_immediate_exit = true; 6915 if (!kvm_x86_ops->enable_smi_window(vcpu))
6916 req_immediate_exit = true;
6890 if (vcpu->arch.nmi_pending) 6917 if (vcpu->arch.nmi_pending)
6891 kvm_x86_ops->enable_nmi_window(vcpu); 6918 kvm_x86_ops->enable_nmi_window(vcpu);
6892 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) 6919 if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -7798,18 +7825,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
7798 kvm_async_pf_hash_reset(vcpu); 7825 kvm_async_pf_hash_reset(vcpu);
7799 vcpu->arch.apf.halted = false; 7826 vcpu->arch.apf.halted = false;
7800 7827
7828 if (kvm_mpx_supported()) {
7829 void *mpx_state_buffer;
7830
7831 /*
 7832 * Avoid having the INIT path from kvm_apic_has_events() be called
 7833 * with a loaded FPU, which would not let userspace fix the state.
 7832 * Avoid having the INIT path from kvm_apic_has_events() be called
 7833 * with a loaded FPU, which would not let userspace fix the state.
7834 */
7835 kvm_put_guest_fpu(vcpu);
7836 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
7837 XFEATURE_MASK_BNDREGS);
7838 if (mpx_state_buffer)
7839 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
7840 mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
7841 XFEATURE_MASK_BNDCSR);
7842 if (mpx_state_buffer)
7843 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
7844 }
7845
7801 if (!init_event) { 7846 if (!init_event) {
7802 kvm_pmu_reset(vcpu); 7847 kvm_pmu_reset(vcpu);
7803 vcpu->arch.smbase = 0x30000; 7848 vcpu->arch.smbase = 0x30000;
7804 7849
7805 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; 7850 vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
7806 vcpu->arch.msr_misc_features_enables = 0; 7851 vcpu->arch.msr_misc_features_enables = 0;
7852
7853 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
7807 } 7854 }
7808 7855
7809 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 7856 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
7810 vcpu->arch.regs_avail = ~0; 7857 vcpu->arch.regs_avail = ~0;
7811 vcpu->arch.regs_dirty = ~0; 7858 vcpu->arch.regs_dirty = ~0;
7812 7859
7860 vcpu->arch.ia32_xss = 0;
7861
7813 kvm_x86_ops->vcpu_reset(vcpu, init_event); 7862 kvm_x86_ops->vcpu_reset(vcpu, init_event);
7814} 7863}
7815 7864
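
kvm_vcpu_reset() now drops the guest FPU and wipes the BNDREGS/BNDCSR components of the guest XSAVE image, so an INIT leaves no stale MPX bounds behind, and a full reset also reinitializes xcr0 and ia32_xss. A rough model of "clear one component of an xsave-like blob"; the layout and lookup below are entirely hypothetical, the kernel uses get_xsave_addr() on the real XSAVE area.

#include <stdio.h>
#include <string.h>

/* Hypothetical, flattened stand-in for an XSAVE area; the real layout
 * is discovered via CPUID and get_xsave_addr() in the kernel.         */
struct xsave_blob {
	unsigned char fpu[512];
	unsigned long long bndregs[8];	/* ~XFEATURE_MASK_BNDREGS component */
	unsigned long long bndcsr[2];	/* ~XFEATURE_MASK_BNDCSR component  */
};

static void *get_component(struct xsave_blob *x, int which)
{
	/* 0 = BNDREGS, 1 = BNDCSR; NULL if the component is absent */
	switch (which) {
	case 0: return x->bndregs;
	case 1: return x->bndcsr;
	default: return NULL;
	}
}

/* Mirrors the reset hunk: look each MPX component up and wipe it if present. */
static void reset_mpx(struct xsave_blob *x)
{
	void *p;

	p = get_component(x, 0);
	if (p)
		memset(p, 0, sizeof(x->bndregs));
	p = get_component(x, 1);
	if (p)
		memset(p, 0, sizeof(x->bndcsr));
}

int main(void)
{
	struct xsave_blob x;

	memset(&x, 0xaa, sizeof(x));
	reset_mpx(&x);
	printf("bndregs[0]=%llx bndcsr[0]=%llx fpu[0]=%x\n",
	       x.bndregs[0], x.bndcsr[0], x.fpu[0]);
	return 0;
}
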
@@ -7974,16 +8023,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
7974int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 8023int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7975{ 8024{
7976 struct page *page; 8025 struct page *page;
7977 struct kvm *kvm;
7978 int r; 8026 int r;
7979 8027
7980 BUG_ON(vcpu->kvm == NULL);
7981 kvm = vcpu->kvm;
7982
7983 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); 8028 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
7984 vcpu->arch.pv.pv_unhalted = false;
7985 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 8029 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
7986 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 8030 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
7987 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 8031 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7988 else 8032 else
7989 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 8033 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -8001,7 +8045,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8001 if (r < 0) 8045 if (r < 0)
8002 goto fail_free_pio_data; 8046 goto fail_free_pio_data;
8003 8047
8004 if (irqchip_in_kernel(kvm)) { 8048 if (irqchip_in_kernel(vcpu->kvm)) {
8005 r = kvm_create_lapic(vcpu); 8049 r = kvm_create_lapic(vcpu);
8006 if (r < 0) 8050 if (r < 0)
8007 goto fail_mmu_destroy; 8051 goto fail_mmu_destroy;
@@ -8023,10 +8067,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8023 8067
8024 fx_init(vcpu); 8068 fx_init(vcpu);
8025 8069
8026 vcpu->arch.ia32_tsc_adjust_msr = 0x0;
8027 vcpu->arch.pv_time_enabled = false;
8028
8029 vcpu->arch.guest_supported_xcr0 = 0;
8030 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; 8070 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
8031 8071
8032 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 8072 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 538bfa8ba9b4..57cb2f00fc07 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -159,6 +159,7 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
159 * if we don't have the cp15 accessors we won't have a problem. 159 * if we don't have the cp15 accessors we won't have a problem.
160 */ 160 */
161u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct; 161u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
162EXPORT_SYMBOL_GPL(arch_timer_read_counter);
162 163
163static u64 arch_counter_read(struct clocksource *cs) 164static u64 arch_counter_read(struct clocksource *cs)
164{ 165{
@@ -218,6 +219,11 @@ static u32 notrace fsl_a008585_read_cntv_tval_el0(void)
218 return __fsl_a008585_read_reg(cntv_tval_el0); 219 return __fsl_a008585_read_reg(cntv_tval_el0);
219} 220}
220 221
222static u64 notrace fsl_a008585_read_cntpct_el0(void)
223{
224 return __fsl_a008585_read_reg(cntpct_el0);
225}
226
221static u64 notrace fsl_a008585_read_cntvct_el0(void) 227static u64 notrace fsl_a008585_read_cntvct_el0(void)
222{ 228{
223 return __fsl_a008585_read_reg(cntvct_el0); 229 return __fsl_a008585_read_reg(cntvct_el0);
@@ -259,6 +265,11 @@ static u32 notrace hisi_161010101_read_cntv_tval_el0(void)
259 return __hisi_161010101_read_reg(cntv_tval_el0); 265 return __hisi_161010101_read_reg(cntv_tval_el0);
260} 266}
261 267
268static u64 notrace hisi_161010101_read_cntpct_el0(void)
269{
270 return __hisi_161010101_read_reg(cntpct_el0);
271}
272
262static u64 notrace hisi_161010101_read_cntvct_el0(void) 273static u64 notrace hisi_161010101_read_cntvct_el0(void)
263{ 274{
264 return __hisi_161010101_read_reg(cntvct_el0); 275 return __hisi_161010101_read_reg(cntvct_el0);
@@ -289,6 +300,15 @@ static struct ate_acpi_oem_info hisi_161010101_oem_info[] = {
289#endif 300#endif
290 301
291#ifdef CONFIG_ARM64_ERRATUM_858921 302#ifdef CONFIG_ARM64_ERRATUM_858921
303static u64 notrace arm64_858921_read_cntpct_el0(void)
304{
305 u64 old, new;
306
307 old = read_sysreg(cntpct_el0);
308 new = read_sysreg(cntpct_el0);
309 return (((old ^ new) >> 32) & 1) ? old : new;
310}
311
292static u64 notrace arm64_858921_read_cntvct_el0(void) 312static u64 notrace arm64_858921_read_cntvct_el0(void)
293{ 313{
294 u64 old, new; 314 u64 old, new;
@@ -310,16 +330,19 @@ static void erratum_set_next_event_tval_generic(const int access, unsigned long
310 struct clock_event_device *clk) 330 struct clock_event_device *clk)
311{ 331{
312 unsigned long ctrl; 332 unsigned long ctrl;
313 u64 cval = evt + arch_counter_get_cntvct(); 333 u64 cval;
314 334
315 ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk); 335 ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
316 ctrl |= ARCH_TIMER_CTRL_ENABLE; 336 ctrl |= ARCH_TIMER_CTRL_ENABLE;
317 ctrl &= ~ARCH_TIMER_CTRL_IT_MASK; 337 ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;
318 338
319 if (access == ARCH_TIMER_PHYS_ACCESS) 339 if (access == ARCH_TIMER_PHYS_ACCESS) {
340 cval = evt + arch_counter_get_cntpct();
320 write_sysreg(cval, cntp_cval_el0); 341 write_sysreg(cval, cntp_cval_el0);
321 else 342 } else {
343 cval = evt + arch_counter_get_cntvct();
322 write_sysreg(cval, cntv_cval_el0); 344 write_sysreg(cval, cntv_cval_el0);
345 }
323 346
324 arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk); 347 arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
325} 348}
@@ -346,6 +369,7 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
346 .desc = "Freescale erratum a005858", 369 .desc = "Freescale erratum a005858",
347 .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0, 370 .read_cntp_tval_el0 = fsl_a008585_read_cntp_tval_el0,
348 .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0, 371 .read_cntv_tval_el0 = fsl_a008585_read_cntv_tval_el0,
372 .read_cntpct_el0 = fsl_a008585_read_cntpct_el0,
349 .read_cntvct_el0 = fsl_a008585_read_cntvct_el0, 373 .read_cntvct_el0 = fsl_a008585_read_cntvct_el0,
350 .set_next_event_phys = erratum_set_next_event_tval_phys, 374 .set_next_event_phys = erratum_set_next_event_tval_phys,
351 .set_next_event_virt = erratum_set_next_event_tval_virt, 375 .set_next_event_virt = erratum_set_next_event_tval_virt,
@@ -358,6 +382,7 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
358 .desc = "HiSilicon erratum 161010101", 382 .desc = "HiSilicon erratum 161010101",
359 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, 383 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
360 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, 384 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
385 .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
361 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, 386 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
362 .set_next_event_phys = erratum_set_next_event_tval_phys, 387 .set_next_event_phys = erratum_set_next_event_tval_phys,
363 .set_next_event_virt = erratum_set_next_event_tval_virt, 388 .set_next_event_virt = erratum_set_next_event_tval_virt,
@@ -368,6 +393,7 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
368 .desc = "HiSilicon erratum 161010101", 393 .desc = "HiSilicon erratum 161010101",
369 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0, 394 .read_cntp_tval_el0 = hisi_161010101_read_cntp_tval_el0,
370 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0, 395 .read_cntv_tval_el0 = hisi_161010101_read_cntv_tval_el0,
396 .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
371 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0, 397 .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
372 .set_next_event_phys = erratum_set_next_event_tval_phys, 398 .set_next_event_phys = erratum_set_next_event_tval_phys,
373 .set_next_event_virt = erratum_set_next_event_tval_virt, 399 .set_next_event_virt = erratum_set_next_event_tval_virt,
@@ -378,6 +404,7 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
378 .match_type = ate_match_local_cap_id, 404 .match_type = ate_match_local_cap_id,
379 .id = (void *)ARM64_WORKAROUND_858921, 405 .id = (void *)ARM64_WORKAROUND_858921,
380 .desc = "ARM erratum 858921", 406 .desc = "ARM erratum 858921",
407 .read_cntpct_el0 = arm64_858921_read_cntpct_el0,
381 .read_cntvct_el0 = arm64_858921_read_cntvct_el0, 408 .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
382 }, 409 },
383#endif 410#endif
@@ -901,7 +928,7 @@ static void __init arch_counter_register(unsigned type)
901 928
902 /* Register the CP15 based counter if we have one */ 929 /* Register the CP15 based counter if we have one */
903 if (type & ARCH_TIMER_TYPE_CP15) { 930 if (type & ARCH_TIMER_TYPE_CP15) {
904 if (IS_ENABLED(CONFIG_ARM64) || 931 if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
905 arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) 932 arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI)
906 arch_timer_read_counter = arch_counter_get_cntvct; 933 arch_timer_read_counter = arch_counter_get_cntvct;
907 else 934 else
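
The arm64_858921_read_cntpct_el0() workaround added above reads the counter twice and returns the first value when the two reads disagree in bit 32 (i.e. the low word rolled over in between), otherwise the second, exactly like the existing CNTVCT variant. A user-space sketch of that selection against a fake counter, so the bit test can be exercised in isolation; the fake read below is obviously not the real system-register access.

#include <stdint.h>
#include <stdio.h>

/* Fake counter that returns two values straddling a 32-bit carry,
 * standing in for back-to-back reads of CNTPCT_EL0. */
static uint64_t fake_counter_read(void)
{
	static uint64_t seq[] = { 0xffffffffull, 0x100000000ull };
	static int i;

	return seq[i++ & 1];
}

/* Same selection as arm64_858921_read_cntpct_el0(): if the two reads
 * differ in bit 32, keep the first; otherwise keep the second.        */
static uint64_t erratum_858921_read(void)
{
	uint64_t old = fake_counter_read();
	uint64_t new = fake_counter_read();

	return (((old ^ new) >> 32) & 1) ? old : new;
}

int main(void)
{
	printf("returned 0x%llx\n", (unsigned long long)erratum_858921_read());
	return 0;
}
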
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index b54b55597ffb..17221143f505 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1260,7 +1260,9 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
1260 goto out_unmap_rdist; 1260 goto out_unmap_rdist;
1261 1261
1262 gic_populate_ppi_partitions(node); 1262 gic_populate_ppi_partitions(node);
1263 gic_of_setup_kvm_info(node); 1263
1264 if (static_key_true(&supports_deactivate))
1265 gic_of_setup_kvm_info(node);
1264 return 0; 1266 return 0;
1265 1267
1266out_unmap_rdist: 1268out_unmap_rdist:
@@ -1549,7 +1551,9 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end)
1549 goto out_fwhandle_free; 1551 goto out_fwhandle_free;
1550 1552
1551 acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); 1553 acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle);
1552 gic_acpi_setup_kvm_info(); 1554
1555 if (static_key_true(&supports_deactivate))
1556 gic_acpi_setup_kvm_info();
1553 1557
1554 return 0; 1558 return 0;
1555 1559
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index f641e8e2c78d..121af5cf688f 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1420,7 +1420,8 @@ static void __init gic_of_setup_kvm_info(struct device_node *node)
1420 if (ret) 1420 if (ret)
1421 return; 1421 return;
1422 1422
1423 gic_set_kvm_info(&gic_v2_kvm_info); 1423 if (static_key_true(&supports_deactivate))
1424 gic_set_kvm_info(&gic_v2_kvm_info);
1424} 1425}
1425 1426
1426int __init 1427int __init
@@ -1652,7 +1653,8 @@ static int __init gic_v2_acpi_init(struct acpi_subtable_header *header,
1652 if (IS_ENABLED(CONFIG_ARM_GIC_V2M)) 1653 if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
1653 gicv2m_init(NULL, gic_data[0].domain); 1654 gicv2m_init(NULL, gic_data[0].domain);
1654 1655
1655 gic_acpi_setup_kvm_info(); 1656 if (static_key_true(&supports_deactivate))
1657 gic_acpi_setup_kvm_info();
1656 1658
1657 return 0; 1659 return 0;
1658} 1660}
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index f0053f884b4a..01ee473517e2 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -31,8 +31,15 @@ struct arch_timer_context {
31 /* Timer IRQ */ 31 /* Timer IRQ */
32 struct kvm_irq_level irq; 32 struct kvm_irq_level irq;
33 33
34 /* Active IRQ state caching */ 34 /*
35 bool active_cleared_last; 35 * We have multiple paths which can save/restore the timer state
36 * onto the hardware, so we need some way of keeping track of
37 * where the latest state is.
38 *
39 * loaded == true: State is loaded on the hardware registers.
40 * loaded == false: State is stored in memory.
41 */
42 bool loaded;
36 43
37 /* Virtual offset */ 44 /* Virtual offset */
38 u64 cntvoff; 45 u64 cntvoff;
@@ -43,13 +50,13 @@ struct arch_timer_cpu {
43 struct arch_timer_context ptimer; 50 struct arch_timer_context ptimer;
44 51
45 /* Background timer used when the guest is not running */ 52 /* Background timer used when the guest is not running */
46 struct hrtimer timer; 53 struct hrtimer bg_timer;
47 54
48 /* Work queued with the above timer expires */ 55 /* Work queued with the above timer expires */
49 struct work_struct expired; 56 struct work_struct expired;
50 57
51 /* Background timer active */ 58 /* Physical timer emulation */
52 bool armed; 59 struct hrtimer phys_timer;
53 60
54 /* Is the timer enabled */ 61 /* Is the timer enabled */
55 bool enabled; 62 bool enabled;
@@ -59,7 +66,6 @@ int kvm_timer_hyp_init(void);
59int kvm_timer_enable(struct kvm_vcpu *vcpu); 66int kvm_timer_enable(struct kvm_vcpu *vcpu);
60int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); 67int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu);
61void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); 68void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
62void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
63void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); 69void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
64bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); 70bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu);
65void kvm_timer_update_run(struct kvm_vcpu *vcpu); 71void kvm_timer_update_run(struct kvm_vcpu *vcpu);
@@ -72,16 +78,22 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
72int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); 78int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
73int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); 79int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
74 80
75bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); 81bool kvm_timer_is_pending(struct kvm_vcpu *vcpu);
82
76void kvm_timer_schedule(struct kvm_vcpu *vcpu); 83void kvm_timer_schedule(struct kvm_vcpu *vcpu);
77void kvm_timer_unschedule(struct kvm_vcpu *vcpu); 84void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
78 85
79u64 kvm_phys_timer_read(void); 86u64 kvm_phys_timer_read(void);
80 87
88void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu);
81void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); 89void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu);
82 90
83void kvm_timer_init_vhe(void); 91void kvm_timer_init_vhe(void);
84 92
85#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) 93#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer)
86#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) 94#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer)
95
96void enable_el1_phys_timer_access(void);
97void disable_el1_phys_timer_access(void);
98
87#endif 99#endif
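A quick illustration of what the new "loaded" flag buys: callers can tell whether the authoritative timer state currently sits in the CPU registers or in the in-memory context. This is a minimal sketch with an assumed helper name, not code from the patch; the real consumers are vtimer_save_state() and vtimer_restore_state() in the virt/kvm/arm/arch_timer.c hunks further down.

	/* Sketch only (assumed helper): honour "loaded" when reading timer state. */
	static u64 vtimer_read_cval(struct arch_timer_context *vtimer)
	{
		if (vtimer->loaded)			/* latest state lives in the HW registers */
			return read_sysreg_el0(cntv_cval);
		return vtimer->cnt_cval;		/* latest state lives in memory */
	}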
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6882538eda32..2e754b7c282c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -667,6 +667,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
667 bool *writable); 667 bool *writable);
668 668
669void kvm_release_pfn_clean(kvm_pfn_t pfn); 669void kvm_release_pfn_clean(kvm_pfn_t pfn);
670void kvm_release_pfn_dirty(kvm_pfn_t pfn);
670void kvm_set_pfn_dirty(kvm_pfn_t pfn); 671void kvm_set_pfn_dirty(kvm_pfn_t pfn);
671void kvm_set_pfn_accessed(kvm_pfn_t pfn); 672void kvm_set_pfn_accessed(kvm_pfn_t pfn);
672void kvm_get_pfn(kvm_pfn_t pfn); 673void kvm_get_pfn(kvm_pfn_t pfn);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7e99999d6236..282d7613fce8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -931,6 +931,7 @@ struct kvm_ppc_resize_hpt {
931#define KVM_CAP_PPC_SMT_POSSIBLE 147 931#define KVM_CAP_PPC_SMT_POSSIBLE 147
932#define KVM_CAP_HYPERV_SYNIC2 148 932#define KVM_CAP_HYPERV_SYNIC2 148
933#define KVM_CAP_HYPERV_VP_INDEX 149 933#define KVM_CAP_HYPERV_VP_INDEX 149
934#define KVM_CAP_S390_AIS_MIGRATION 150
934 935
935#ifdef KVM_CAP_IRQ_ROUTING 936#ifdef KVM_CAP_IRQ_ROUTING
936 937
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 32283d88701a..217cf6f95c36 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -19,9 +19,11 @@ Three different ways of output formatting are available:
19 19
20The data is sampled from the KVM's debugfs entries and its perf events. 20The data is sampled from the KVM's debugfs entries and its perf events.
21""" 21"""
22from __future__ import print_function
22 23
23import curses 24import curses
24import sys 25import sys
26import locale
25import os 27import os
26import time 28import time
27import optparse 29import optparse
@@ -225,6 +227,8 @@ IOCTL_NUMBERS = {
225 'RESET': 0x00002403, 227 'RESET': 0x00002403,
226} 228}
227 229
230ENCODING = locale.getpreferredencoding(False)
231
228 232
229class Arch(object): 233class Arch(object):
230 """Encapsulates global architecture specific data. 234 """Encapsulates global architecture specific data.
@@ -666,7 +670,7 @@ class TracepointProvider(Provider):
666 """Returns 'event name: current value' for all enabled events.""" 670 """Returns 'event name: current value' for all enabled events."""
667 ret = defaultdict(int) 671 ret = defaultdict(int)
668 for group in self.group_leaders: 672 for group in self.group_leaders:
669 for name, val in group.read().iteritems(): 673 for name, val in group.read().items():
670 if name in self._fields: 674 if name in self._fields:
671 ret[name] += val 675 ret[name] += val
672 return ret 676 return ret
@@ -955,7 +959,7 @@ class Tui(object):
955 except: 959 except:
956 raise Exception 960 raise Exception
957 for line in child.stdout: 961 for line in child.stdout:
958 line = line.lstrip().split(' ', 1) 962 line = line.decode(ENCODING).lstrip().split(' ', 1)
959 # perform a sanity check before calling the more expensive 963 # perform a sanity check before calling the more expensive
960 # function to possibly extract the guest name 964 # function to possibly extract the guest name
961 if ' -name ' in line[1]: 965 if ' -name ' in line[1]:
@@ -1005,7 +1009,7 @@ class Tui(object):
1005 name = '' 1009 name = ''
1006 try: 1010 try:
1007 line = open('/proc/{}/cmdline' 1011 line = open('/proc/{}/cmdline'
1008 .format(pid), 'rb').read().split('\0') 1012 .format(pid), 'r').read().split('\0')
1009 parms = line[line.index('-name') + 1].split(',') 1013 parms = line[line.index('-name') + 1].split(',')
1010 while '' in parms: 1014 while '' in parms:
1011 # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results 1015 # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results
@@ -1170,7 +1174,7 @@ class Tui(object):
1170 .format(self.stats.fields_filter)) 1174 .format(self.stats.fields_filter))
1171 self.screen.addstr(3, 0, "New regex: ") 1175 self.screen.addstr(3, 0, "New regex: ")
1172 curses.echo() 1176 curses.echo()
1173 regex = self.screen.getstr() 1177 regex = self.screen.getstr().decode(ENCODING)
1174 curses.noecho() 1178 curses.noecho()
1175 if len(regex) == 0: 1179 if len(regex) == 0:
1176 self.stats.fields_filter = DEFAULT_REGEX 1180 self.stats.fields_filter = DEFAULT_REGEX
@@ -1204,7 +1208,7 @@ class Tui(object):
1204 1208
1205 curses.echo() 1209 curses.echo()
1206 self.screen.addstr(3, 0, "Pid [0 or pid]: ") 1210 self.screen.addstr(3, 0, "Pid [0 or pid]: ")
1207 pid = self.screen.getstr() 1211 pid = self.screen.getstr().decode(ENCODING)
1208 curses.noecho() 1212 curses.noecho()
1209 1213
1210 try: 1214 try:
@@ -1233,7 +1237,7 @@ class Tui(object):
1233 self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % 1237 self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
1234 self._delay_regular) 1238 self._delay_regular)
1235 curses.echo() 1239 curses.echo()
1236 val = self.screen.getstr() 1240 val = self.screen.getstr().decode(ENCODING)
1237 curses.noecho() 1241 curses.noecho()
1238 1242
1239 try: 1243 try:
@@ -1273,7 +1277,7 @@ class Tui(object):
1273 self.print_all_gnames(7) 1277 self.print_all_gnames(7)
1274 curses.echo() 1278 curses.echo()
1275 self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") 1279 self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
1276 gname = self.screen.getstr() 1280 gname = self.screen.getstr().decode(ENCODING)
1277 curses.noecho() 1281 curses.noecho()
1278 1282
1279 if not gname: 1283 if not gname:
@@ -1369,25 +1373,25 @@ def batch(stats):
1369 s = stats.get() 1373 s = stats.get()
1370 for key in sorted(s.keys()): 1374 for key in sorted(s.keys()):
1371 values = s[key] 1375 values = s[key]
1372 print '%-42s%10d%10d' % (key, values[0], values[1]) 1376 print('%-42s%10d%10d' % (key, values[0], values[1]))
1373 except KeyboardInterrupt: 1377 except KeyboardInterrupt:
1374 pass 1378 pass
1375 1379
1376 1380
1377def log(stats): 1381def log(stats):
1378 """Prints statistics as reiterating key block, multiple value blocks.""" 1382 """Prints statistics as reiterating key block, multiple value blocks."""
1379 keys = sorted(stats.get().iterkeys()) 1383 keys = sorted(stats.get().keys())
1380 1384
1381 def banner(): 1385 def banner():
1382 for k in keys: 1386 for k in keys:
1383 print '%s' % k, 1387 print(k, end=' ')
1384 print 1388 print()
1385 1389
1386 def statline(): 1390 def statline():
1387 s = stats.get() 1391 s = stats.get()
1388 for k in keys: 1392 for k in keys:
1389 print ' %9d' % s[k][1], 1393 print(' %9d' % s[k][1], end=' ')
1390 print 1394 print()
1391 line = 0 1395 line = 0
1392 banner_repeat = 20 1396 banner_repeat = 20
1393 while True: 1397 while True:
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index 79c7c357804b..8bc479fa37e6 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -25,11 +25,6 @@
25#include <asm/kvm_emulate.h> 25#include <asm/kvm_emulate.h>
26#include <asm/kvm_hyp.h> 26#include <asm/kvm_hyp.h>
27 27
28#ifndef CONFIG_ARM64
29#define COMPAT_PSR_T_BIT PSR_T_BIT
30#define COMPAT_PSR_IT_MASK PSR_IT_MASK
31#endif
32
33/* 28/*
34 * stolen from arch/arm/kernel/opcodes.c 29 * stolen from arch/arm/kernel/opcodes.c
35 * 30 *
@@ -150,3 +145,95 @@ void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
150 *vcpu_pc(vcpu) += 4; 145 *vcpu_pc(vcpu) += 4;
151 kvm_adjust_itstate(vcpu); 146 kvm_adjust_itstate(vcpu);
152} 147}
148
149/*
150 * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
151 */
152static const u8 return_offsets[8][2] = {
153 [0] = { 0, 0 }, /* Reset, unused */
154 [1] = { 4, 2 }, /* Undefined */
155 [2] = { 0, 0 }, /* SVC, unused */
156 [3] = { 4, 4 }, /* Prefetch abort */
157 [4] = { 8, 8 }, /* Data abort */
158 [5] = { 0, 0 }, /* HVC, unused */
159 [6] = { 4, 4 }, /* IRQ, unused */
160 [7] = { 4, 4 }, /* FIQ, unused */
161};
162
163static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
164{
165 unsigned long cpsr;
166 unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
167 bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT);
168 u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
169 u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
170
171 cpsr = mode | COMPAT_PSR_I_BIT;
172
173 if (sctlr & (1 << 30))
174 cpsr |= COMPAT_PSR_T_BIT;
175 if (sctlr & (1 << 25))
176 cpsr |= COMPAT_PSR_E_BIT;
177
178 *vcpu_cpsr(vcpu) = cpsr;
179
180 /* Note: These now point to the banked copies */
181 *vcpu_spsr(vcpu) = new_spsr_value;
182 *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
183
184 /* Branch to exception vector */
185 if (sctlr & (1 << 13))
186 vect_offset += 0xffff0000;
187 else /* always have security exceptions */
188 vect_offset += vcpu_cp15(vcpu, c12_VBAR);
189
190 *vcpu_pc(vcpu) = vect_offset;
191}
192
193void kvm_inject_undef32(struct kvm_vcpu *vcpu)
194{
195 prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4);
196}
197
198/*
199 * Modelled after TakeDataAbortException() and TakePrefetchAbortException
200 * pseudocode.
201 */
202static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
203 unsigned long addr)
204{
205 u32 vect_offset;
206 u32 *far, *fsr;
207 bool is_lpae;
208
209 if (is_pabt) {
210 vect_offset = 12;
211 far = &vcpu_cp15(vcpu, c6_IFAR);
212 fsr = &vcpu_cp15(vcpu, c5_IFSR);
213 } else { /* !iabt */
214 vect_offset = 16;
215 far = &vcpu_cp15(vcpu, c6_DFAR);
216 fsr = &vcpu_cp15(vcpu, c5_DFSR);
217 }
218
219 prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset);
220
221 *far = addr;
222
223 /* Give the guest an IMPLEMENTATION DEFINED exception */
224 is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
225 if (is_lpae)
226 *fsr = 1 << 9 | 0x34;
227 else
228 *fsr = 0x14;
229}
230
231void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
232{
233 inject_abt32(vcpu, false, addr);
234}
235
236void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
237{
238 inject_abt32(vcpu, true, addr);
239}
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 8e89d63005c7..4db54ff08d9e 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -46,49 +46,68 @@ static const struct kvm_irq_level default_vtimer_irq = {
46 .level = 1, 46 .level = 1,
47}; 47};
48 48
49void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 49static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
50{ 50static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
51 vcpu_vtimer(vcpu)->active_cleared_last = false; 51 struct arch_timer_context *timer_ctx);
52} 52static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
53 53
54u64 kvm_phys_timer_read(void) 54u64 kvm_phys_timer_read(void)
55{ 55{
56 return timecounter->cc->read(timecounter->cc); 56 return timecounter->cc->read(timecounter->cc);
57} 57}
58 58
59static bool timer_is_armed(struct arch_timer_cpu *timer) 59static void soft_timer_start(struct hrtimer *hrt, u64 ns)
60{ 60{
61 return timer->armed; 61 hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
62 HRTIMER_MODE_ABS);
62} 63}
63 64
64/* timer_arm: as in "arm the timer", not as in ARM the company */ 65static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
65static void timer_arm(struct arch_timer_cpu *timer, u64 ns)
66{ 66{
67 timer->armed = true; 67 hrtimer_cancel(hrt);
68 hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), 68 if (work)
69 HRTIMER_MODE_ABS); 69 cancel_work_sync(work);
70} 70}
71 71
72static void timer_disarm(struct arch_timer_cpu *timer) 72static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
73{ 73{
74 if (timer_is_armed(timer)) { 74 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
75 hrtimer_cancel(&timer->timer); 75
76 cancel_work_sync(&timer->expired); 76 /*
77 timer->armed = false; 77 * When using a userspace irqchip with the architected timers, we must
78 } 78 * prevent continuously exiting from the guest, and therefore mask the
79 * physical interrupt by disabling it on the host interrupt controller
80 * when the virtual level is high, such that the guest can make
81 * forward progress. Once we detect the output level being
82 * de-asserted, we unmask the interrupt again so that we exit from the
83 * guest when the timer fires.
84 */
85 if (vtimer->irq.level)
86 disable_percpu_irq(host_vtimer_irq);
87 else
88 enable_percpu_irq(host_vtimer_irq, 0);
79} 89}
80 90
81static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 91static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
82{ 92{
83 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; 93 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
94 struct arch_timer_context *vtimer;
95
96 if (!vcpu) {
97 pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n");
98 return IRQ_NONE;
99 }
100 vtimer = vcpu_vtimer(vcpu);
101
102 if (!vtimer->irq.level) {
103 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
104 if (kvm_timer_irq_can_fire(vtimer))
105 kvm_timer_update_irq(vcpu, true, vtimer);
106 }
107
108 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
109 kvm_vtimer_update_mask_user(vcpu);
84 110
85 /*
86 * We disable the timer in the world switch and let it be
87 * handled by kvm_timer_sync_hwstate(). Getting a timer
88 * interrupt at this point is a sure sign of some major
89 * breakage.
90 */
91 pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu);
92 return IRQ_HANDLED; 111 return IRQ_HANDLED;
93} 112}
94 113
@@ -158,13 +177,13 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
158 return min(min_virt, min_phys); 177 return min(min_virt, min_phys);
159} 178}
160 179
161static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) 180static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
162{ 181{
163 struct arch_timer_cpu *timer; 182 struct arch_timer_cpu *timer;
164 struct kvm_vcpu *vcpu; 183 struct kvm_vcpu *vcpu;
165 u64 ns; 184 u64 ns;
166 185
167 timer = container_of(hrt, struct arch_timer_cpu, timer); 186 timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
168 vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); 187 vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
169 188
170 /* 189 /*
@@ -182,7 +201,33 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
182 return HRTIMER_NORESTART; 201 return HRTIMER_NORESTART;
183} 202}
184 203
185bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) 204static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
205{
206 struct arch_timer_context *ptimer;
207 struct arch_timer_cpu *timer;
208 struct kvm_vcpu *vcpu;
209 u64 ns;
210
211 timer = container_of(hrt, struct arch_timer_cpu, phys_timer);
212 vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
213 ptimer = vcpu_ptimer(vcpu);
214
215 /*
216 * Check that the timer has really expired from the guest's
217 * PoV (NTP on the host may have forced it to expire
218 * early). If not ready, schedule for a later time.
219 */
220 ns = kvm_timer_compute_delta(ptimer);
221 if (unlikely(ns)) {
222 hrtimer_forward_now(hrt, ns_to_ktime(ns));
223 return HRTIMER_RESTART;
224 }
225
226 kvm_timer_update_irq(vcpu, true, ptimer);
227 return HRTIMER_NORESTART;
228}
229
230static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
186{ 231{
187 u64 cval, now; 232 u64 cval, now;
188 233
@@ -195,6 +240,25 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
195 return cval <= now; 240 return cval <= now;
196} 241}
197 242
243bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
244{
245 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
246 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
247
248 if (vtimer->irq.level || ptimer->irq.level)
249 return true;
250
251 /*
252 * When this is called from within the wait loop of kvm_vcpu_block(),
253 * the software view of the timer state is up to date (timer->loaded
254 * is false), and so we can simply check if the timer should fire now.
255 */
256 if (!vtimer->loaded && kvm_timer_should_fire(vtimer))
257 return true;
258
259 return kvm_timer_should_fire(ptimer);
260}
261
198/* 262/*
199 * Reflect the timer output level into the kvm_run structure 263 * Reflect the timer output level into the kvm_run structure
200 */ 264 */
@@ -218,7 +282,6 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
218{ 282{
219 int ret; 283 int ret;
220 284
221 timer_ctx->active_cleared_last = false;
222 timer_ctx->irq.level = new_level; 285 timer_ctx->irq.level = new_level;
223 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, 286 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
224 timer_ctx->irq.level); 287 timer_ctx->irq.level);
@@ -232,9 +295,29 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
232 } 295 }
233} 296}
234 297
298/* Schedule the background timer for the emulated timer. */
299static void phys_timer_emulate(struct kvm_vcpu *vcpu)
300{
301 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
302 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
303
304 /*
305 * If the timer can fire now we have just raised the IRQ line and we
306 * don't need to have a soft timer scheduled for the future. If the
307 * timer cannot fire at all, then we also don't need a soft timer.
308 */
309 if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
310 soft_timer_cancel(&timer->phys_timer, NULL);
311 return;
312 }
313
314 soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer));
315}
316
235/* 317/*
236 * Check if there was a change in the timer state (should we raise or lower 318 * Check if there was a change in the timer state, so that we should either
237 * the line level to the GIC). 319 * raise or lower the line level to the GIC or schedule a background timer to
320 * emulate the physical timer.
238 */ 321 */
239static void kvm_timer_update_state(struct kvm_vcpu *vcpu) 322static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
240{ 323{
@@ -242,12 +325,6 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
242 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 325 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
243 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 326 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
244 327
245 /*
246 * If userspace modified the timer registers via SET_ONE_REG before
247 * the vgic was initialized, we mustn't set the vtimer->irq.level value
248 * because the guest would never see the interrupt. Instead wait
249 * until we call this function from kvm_timer_flush_hwstate.
250 */
251 if (unlikely(!timer->enabled)) 328 if (unlikely(!timer->enabled))
252 return; 329 return;
253 330
@@ -256,22 +333,32 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
256 333
257 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) 334 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
258 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); 335 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
336
337 phys_timer_emulate(vcpu);
259} 338}
260 339
261/* Schedule the background timer for the emulated timer. */ 340static void vtimer_save_state(struct kvm_vcpu *vcpu)
262static void kvm_timer_emulate(struct kvm_vcpu *vcpu,
263 struct arch_timer_context *timer_ctx)
264{ 341{
265 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 342 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
343 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
344 unsigned long flags;
266 345
267 if (kvm_timer_should_fire(timer_ctx)) 346 local_irq_save(flags);
268 return;
269 347
270 if (!kvm_timer_irq_can_fire(timer_ctx)) 348 if (!vtimer->loaded)
271 return; 349 goto out;
272 350
273 /* The timer has not yet expired, schedule a background timer */ 351 if (timer->enabled) {
274 timer_arm(timer, kvm_timer_compute_delta(timer_ctx)); 352 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
353 vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
354 }
355
356 /* Disable the virtual timer */
357 write_sysreg_el0(0, cntv_ctl);
358
359 vtimer->loaded = false;
360out:
361 local_irq_restore(flags);
275} 362}
276 363
277/* 364/*
@@ -285,7 +372,7 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu)
285 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 372 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
286 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 373 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
287 374
288 BUG_ON(timer_is_armed(timer)); 375 vtimer_save_state(vcpu);
289 376
290 /* 377 /*
291 * No need to schedule a background timer if any guest timer has 378 * No need to schedule a background timer if any guest timer has
@@ -306,70 +393,97 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu)
306 * The guest timers have not yet expired, schedule a background timer. 393 * The guest timers have not yet expired, schedule a background timer.
307 * Set the earliest expiration time among the guest timers. 394 * Set the earliest expiration time among the guest timers.
308 */ 395 */
309 timer_arm(timer, kvm_timer_earliest_exp(vcpu)); 396 soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
397}
398
399static void vtimer_restore_state(struct kvm_vcpu *vcpu)
400{
401 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
402 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
403 unsigned long flags;
404
405 local_irq_save(flags);
406
407 if (vtimer->loaded)
408 goto out;
409
410 if (timer->enabled) {
411 write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
412 isb();
413 write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
414 }
415
416 vtimer->loaded = true;
417out:
418 local_irq_restore(flags);
310} 419}
311 420
312void kvm_timer_unschedule(struct kvm_vcpu *vcpu) 421void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
313{ 422{
314 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 423 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
315 timer_disarm(timer); 424
425 vtimer_restore_state(vcpu);
426
427 soft_timer_cancel(&timer->bg_timer, &timer->expired);
428}
429
430static void set_cntvoff(u64 cntvoff)
431{
432 u32 low = lower_32_bits(cntvoff);
433 u32 high = upper_32_bits(cntvoff);
434
435 /*
436 * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
437 * 32-bit systems, but rather passes register by register shifted one
438 * place (we put the function address in r0/x0), we cannot simply pass
439 * a 64-bit value as an argument, but have to split the value in two
440 * 32-bit halves.
441 */
442 kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
316} 443}
317 444
318static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) 445static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
319{ 446{
320 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 447 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
321 bool phys_active; 448 bool phys_active;
322 int ret; 449 int ret;
323 450
324 /*
325 * If we enter the guest with the virtual input level to the VGIC
326 * asserted, then we have already told the VGIC what we need to, and
327 * we don't need to exit from the guest until the guest deactivates
328 * the already injected interrupt, so therefore we should set the
329 * hardware active state to prevent unnecessary exits from the guest.
330 *
331 * Also, if we enter the guest with the virtual timer interrupt active,
332 * then it must be active on the physical distributor, because we set
333 * the HW bit and the guest must be able to deactivate the virtual and
334 * physical interrupt at the same time.
335 *
336 * Conversely, if the virtual input level is deasserted and the virtual
337 * interrupt is not active, then always clear the hardware active state
338 * to ensure that hardware interrupts from the timer triggers a guest
339 * exit.
340 */
341 phys_active = vtimer->irq.level || 451 phys_active = vtimer->irq.level ||
342 kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); 452 kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
343
344 /*
345 * We want to avoid hitting the (re)distributor as much as
346 * possible, as this is a potentially expensive MMIO access
347 * (not to mention locks in the irq layer), and a solution for
348 * this is to cache the "active" state in memory.
349 *
350 * Things to consider: we cannot cache an "active set" state,
351 * because the HW can change this behind our back (it becomes
352 * "clear" in the HW). We must then restrict the caching to
353 * the "clear" state.
354 *
355 * The cache is invalidated on:
356 * - vcpu put, indicating that the HW cannot be trusted to be
357 * in a sane state on the next vcpu load,
358 * - any change in the interrupt state
359 *
360 * Usage conditions:
361 * - cached value is "active clear"
362 * - value to be programmed is "active clear"
363 */
364 if (vtimer->active_cleared_last && !phys_active)
365 return;
366 453
367 ret = irq_set_irqchip_state(host_vtimer_irq, 454 ret = irq_set_irqchip_state(host_vtimer_irq,
368 IRQCHIP_STATE_ACTIVE, 455 IRQCHIP_STATE_ACTIVE,
369 phys_active); 456 phys_active);
370 WARN_ON(ret); 457 WARN_ON(ret);
458}
371 459
372 vtimer->active_cleared_last = !phys_active; 460static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
461{
462 kvm_vtimer_update_mask_user(vcpu);
463}
464
465void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
466{
467 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
468 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
469
470 if (unlikely(!timer->enabled))
471 return;
472
473 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
474 kvm_timer_vcpu_load_user(vcpu);
475 else
476 kvm_timer_vcpu_load_vgic(vcpu);
477
478 set_cntvoff(vtimer->cntvoff);
479
480 vtimer_restore_state(vcpu);
481
482 if (has_vhe())
483 disable_el1_phys_timer_access();
484
485 /* Set the background timer for the physical timer emulation. */
486 phys_timer_emulate(vcpu);
373} 487}
374 488
375bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) 489bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -389,48 +503,60 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
389 ptimer->irq.level != plevel; 503 ptimer->irq.level != plevel;
390} 504}
391 505
392static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) 506void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
393{ 507{
394 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 508 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
509
510 if (unlikely(!timer->enabled))
511 return;
512
513 if (has_vhe())
514 enable_el1_phys_timer_access();
515
516 vtimer_save_state(vcpu);
395 517
396 /* 518 /*
397 * To prevent continuously exiting from the guest, we mask the 519 * Cancel the physical timer emulation, because the only case where we
398 * physical interrupt such that the guest can make forward progress. 520 * need it after a vcpu_put is in the context of a sleeping VCPU, and
399 * Once we detect the output level being deasserted, we unmask the 521 * in that case we already factor in the deadline for the physical
400 * interrupt again so that we exit from the guest when the timer 522 * timer when scheduling the bg_timer.
401 * fires. 523 *
402 */ 524 * In any case, we re-schedule the hrtimer for the physical timer when
403 if (vtimer->irq.level) 525 * coming back to the VCPU thread in kvm_timer_vcpu_load().
404 disable_percpu_irq(host_vtimer_irq); 526 */
405 else 527 soft_timer_cancel(&timer->phys_timer, NULL);
406 enable_percpu_irq(host_vtimer_irq, 0); 528
529 /*
530 * The kernel may decide to run userspace after calling vcpu_put, so
531 * we reset cntvoff to 0 to ensure a consistent read between user
532 * accesses to the virtual counter and kernel access to the physical
533 * counter.
534 */
535 set_cntvoff(0);
407} 536}
408 537
409/** 538static void unmask_vtimer_irq(struct kvm_vcpu *vcpu)
410 * kvm_timer_flush_hwstate - prepare timers before running the vcpu
411 * @vcpu: The vcpu pointer
412 *
413 * Check if the virtual timer has expired while we were running in the host,
414 * and inject an interrupt if that was the case, making sure the timer is
415 * masked or disabled on the host so that we keep executing. Also schedule a
416 * software timer for the physical timer if it is enabled.
417 */
418void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
419{ 539{
420 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 540 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
421 541
422 if (unlikely(!timer->enabled)) 542 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
543 kvm_vtimer_update_mask_user(vcpu);
423 return; 544 return;
545 }
424 546
425 kvm_timer_update_state(vcpu); 547 /*
426 548 * If the guest disabled the timer without acking the interrupt, then
427 /* Set the background timer for the physical timer emulation. */ 549 * we must make sure the physical and virtual active states are in
428 kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); 550 * sync by deactivating the physical interrupt, because otherwise we
429 551 * wouldn't see the next timer interrupt in the host.
430 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) 552 */
431 kvm_timer_flush_hwstate_user(vcpu); 553 if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) {
432 else 554 int ret;
433 kvm_timer_flush_hwstate_vgic(vcpu); 555 ret = irq_set_irqchip_state(host_vtimer_irq,
556 IRQCHIP_STATE_ACTIVE,
557 false);
558 WARN_ON(ret);
559 }
434} 560}
435 561
436/** 562/**
@@ -442,19 +568,21 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
442 */ 568 */
443void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) 569void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
444{ 570{
445 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 571 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
446
447 /*
448 * This is to cancel the background timer for the physical timer
449 * emulation if it is set.
450 */
451 timer_disarm(timer);
452 572
453 /* 573 /*
454 * The guest could have modified the timer registers or the timer 574 * If we entered the guest with the vtimer output asserted we have to
455 * could have expired, update the timer state. 575 * check if the guest has modified the timer so that we should lower
576 * the line at this point.
456 */ 577 */
457 kvm_timer_update_state(vcpu); 578 if (vtimer->irq.level) {
579 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
580 vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
581 if (!kvm_timer_should_fire(vtimer)) {
582 kvm_timer_update_irq(vcpu, false, vtimer);
583 unmask_vtimer_irq(vcpu);
584 }
585 }
458} 586}
459 587
460int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) 588int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -505,8 +633,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
505 vcpu_ptimer(vcpu)->cntvoff = 0; 633 vcpu_ptimer(vcpu)->cntvoff = 0;
506 634
507 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); 635 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
508 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 636 hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
509 timer->timer.function = kvm_timer_expire; 637 timer->bg_timer.function = kvm_bg_timer_expire;
638
639 hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
640 timer->phys_timer.function = kvm_phys_timer_expire;
510 641
511 vtimer->irq.irq = default_vtimer_irq.irq; 642 vtimer->irq.irq = default_vtimer_irq.irq;
512 ptimer->irq.irq = default_ptimer_irq.irq; 643 ptimer->irq.irq = default_ptimer_irq.irq;
@@ -520,10 +651,11 @@ static void kvm_timer_init_interrupt(void *info)
520int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) 651int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
521{ 652{
522 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 653 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
654 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
523 655
524 switch (regid) { 656 switch (regid) {
525 case KVM_REG_ARM_TIMER_CTL: 657 case KVM_REG_ARM_TIMER_CTL:
526 vtimer->cnt_ctl = value; 658 vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
527 break; 659 break;
528 case KVM_REG_ARM_TIMER_CNT: 660 case KVM_REG_ARM_TIMER_CNT:
529 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); 661 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
@@ -531,6 +663,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
531 case KVM_REG_ARM_TIMER_CVAL: 663 case KVM_REG_ARM_TIMER_CVAL:
532 vtimer->cnt_cval = value; 664 vtimer->cnt_cval = value;
533 break; 665 break;
666 case KVM_REG_ARM_PTIMER_CTL:
667 ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
668 break;
669 case KVM_REG_ARM_PTIMER_CVAL:
670 ptimer->cnt_cval = value;
671 break;
672
534 default: 673 default:
535 return -1; 674 return -1;
536 } 675 }
@@ -539,17 +678,38 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
539 return 0; 678 return 0;
540} 679}
541 680
681static u64 read_timer_ctl(struct arch_timer_context *timer)
682{
683 /*
684 * Set ISTATUS bit if it's expired.
685 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
686 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
687 * regardless of ENABLE bit for our implementation convenience.
688 */
689 if (!kvm_timer_compute_delta(timer))
690 return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
691 else
692 return timer->cnt_ctl;
693}
694
542u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) 695u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
543{ 696{
697 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
544 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 698 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
545 699
546 switch (regid) { 700 switch (regid) {
547 case KVM_REG_ARM_TIMER_CTL: 701 case KVM_REG_ARM_TIMER_CTL:
548 return vtimer->cnt_ctl; 702 return read_timer_ctl(vtimer);
549 case KVM_REG_ARM_TIMER_CNT: 703 case KVM_REG_ARM_TIMER_CNT:
550 return kvm_phys_timer_read() - vtimer->cntvoff; 704 return kvm_phys_timer_read() - vtimer->cntvoff;
551 case KVM_REG_ARM_TIMER_CVAL: 705 case KVM_REG_ARM_TIMER_CVAL:
552 return vtimer->cnt_cval; 706 return vtimer->cnt_cval;
707 case KVM_REG_ARM_PTIMER_CTL:
708 return read_timer_ctl(ptimer);
709 case KVM_REG_ARM_PTIMER_CVAL:
710 return ptimer->cnt_cval;
711 case KVM_REG_ARM_PTIMER_CNT:
712 return kvm_phys_timer_read();
553 } 713 }
554 return (u64)-1; 714 return (u64)-1;
555} 715}
@@ -602,11 +762,20 @@ int kvm_timer_hyp_init(void)
602 return err; 762 return err;
603 } 763 }
604 764
765 err = irq_set_vcpu_affinity(host_vtimer_irq, kvm_get_running_vcpus());
766 if (err) {
767 kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
768 goto out_free_irq;
769 }
770
605 kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); 771 kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
606 772
607 cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, 773 cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
608 "kvm/arm/timer:starting", kvm_timer_starting_cpu, 774 "kvm/arm/timer:starting", kvm_timer_starting_cpu,
609 kvm_timer_dying_cpu); 775 kvm_timer_dying_cpu);
776 return 0;
777out_free_irq:
778 free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
610 return err; 779 return err;
611} 780}
612 781
@@ -615,7 +784,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
615 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 784 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
616 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 785 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
617 786
618 timer_disarm(timer); 787 soft_timer_cancel(&timer->bg_timer, &timer->expired);
788 soft_timer_cancel(&timer->phys_timer, NULL);
619 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); 789 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
620} 790}
621 791
@@ -691,7 +861,11 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
691 return ret; 861 return ret;
692 862
693no_vgic: 863no_vgic:
864 preempt_disable();
694 timer->enabled = 1; 865 timer->enabled = 1;
866 kvm_timer_vcpu_load_vgic(vcpu);
867 preempt_enable();
868
695 return 0; 869 return 0;
696} 870}
697 871
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 4cf9b91e6c9b..772bf74ac2e9 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -307,8 +307,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
307 307
308int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 308int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
309{ 309{
310 return kvm_timer_should_fire(vcpu_vtimer(vcpu)) || 310 return kvm_timer_is_pending(vcpu);
311 kvm_timer_should_fire(vcpu_ptimer(vcpu));
312} 311}
313 312
314void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 313void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -354,18 +353,18 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
354 vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); 353 vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
355 354
356 kvm_arm_set_running_vcpu(vcpu); 355 kvm_arm_set_running_vcpu(vcpu);
357
358 kvm_vgic_load(vcpu); 356 kvm_vgic_load(vcpu);
357 kvm_timer_vcpu_load(vcpu);
359} 358}
360 359
361void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 360void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
362{ 361{
362 kvm_timer_vcpu_put(vcpu);
363 kvm_vgic_put(vcpu); 363 kvm_vgic_put(vcpu);
364 364
365 vcpu->cpu = -1; 365 vcpu->cpu = -1;
366 366
367 kvm_arm_set_running_vcpu(NULL); 367 kvm_arm_set_running_vcpu(NULL);
368 kvm_timer_vcpu_put(vcpu);
369} 368}
370 369
371static void vcpu_power_off(struct kvm_vcpu *vcpu) 370static void vcpu_power_off(struct kvm_vcpu *vcpu)
@@ -657,11 +656,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
657 656
658 kvm_pmu_flush_hwstate(vcpu); 657 kvm_pmu_flush_hwstate(vcpu);
659 658
660 kvm_timer_flush_hwstate(vcpu);
661 kvm_vgic_flush_hwstate(vcpu);
662
663 local_irq_disable(); 659 local_irq_disable();
664 660
661 kvm_vgic_flush_hwstate(vcpu);
662
665 /* 663 /*
666 * If we have a signal pending, or need to notify a userspace 664 * If we have a signal pending, or need to notify a userspace
667 * irqchip about timer or PMU level changes, then we exit (and 665 * irqchip about timer or PMU level changes, then we exit (and
@@ -686,10 +684,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
686 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || 684 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
687 kvm_request_pending(vcpu)) { 685 kvm_request_pending(vcpu)) {
688 vcpu->mode = OUTSIDE_GUEST_MODE; 686 vcpu->mode = OUTSIDE_GUEST_MODE;
689 local_irq_enable();
690 kvm_pmu_sync_hwstate(vcpu); 687 kvm_pmu_sync_hwstate(vcpu);
691 kvm_timer_sync_hwstate(vcpu); 688 kvm_timer_sync_hwstate(vcpu);
692 kvm_vgic_sync_hwstate(vcpu); 689 kvm_vgic_sync_hwstate(vcpu);
690 local_irq_enable();
693 preempt_enable(); 691 preempt_enable();
694 continue; 692 continue;
695 } 693 }
@@ -713,6 +711,27 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
713 kvm_arm_clear_debug(vcpu); 711 kvm_arm_clear_debug(vcpu);
714 712
715 /* 713 /*
714 * We must sync the PMU state before the vgic state so
715 * that the vgic can properly sample the updated state of the
716 * interrupt line.
717 */
718 kvm_pmu_sync_hwstate(vcpu);
719
720 /*
721 * Sync the vgic state before syncing the timer state because
722 * the timer code needs to know if the virtual timer
723 * interrupts are active.
724 */
725 kvm_vgic_sync_hwstate(vcpu);
726
727 /*
728 * Sync the timer hardware state before enabling interrupts as
729 * we don't want vtimer interrupts to race with syncing the
730 * timer virtual interrupt state.
731 */
732 kvm_timer_sync_hwstate(vcpu);
733
734 /*
716 * We may have taken a host interrupt in HYP mode (ie 735 * We may have taken a host interrupt in HYP mode (ie
717 * while executing the guest). This interrupt is still 736 * while executing the guest). This interrupt is still
718 * pending, as we haven't serviced it yet! 737 * pending, as we haven't serviced it yet!
@@ -735,16 +754,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
735 guest_exit(); 754 guest_exit();
736 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); 755 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
737 756
738 /*
739 * We must sync the PMU and timer state before the vgic state so
740 * that the vgic can properly sample the updated state of the
741 * interrupt line.
742 */
743 kvm_pmu_sync_hwstate(vcpu);
744 kvm_timer_sync_hwstate(vcpu);
745
746 kvm_vgic_sync_hwstate(vcpu);
747
748 preempt_enable(); 757 preempt_enable();
749 758
750 ret = handle_exit(vcpu, run, ret); 759 ret = handle_exit(vcpu, run, ret);
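Condensed view of the run-loop ordering that results from the arm.c hunks above (paraphrased, not literal kernel code): all timer-related state is now synced while interrupts are still disabled, so a late vtimer interrupt cannot race with the software view of the timer.

	kvm_pmu_flush_hwstate(vcpu);
	local_irq_disable();
	kvm_vgic_flush_hwstate(vcpu);	/* timer flush now happens in kvm_timer_vcpu_load() */
	/* ... enter and run the guest ... */
	kvm_pmu_sync_hwstate(vcpu);	/* PMU first, so the vgic can sample its interrupt line */
	kvm_vgic_sync_hwstate(vcpu);	/* vgic next, the timer needs its active state */
	kvm_timer_sync_hwstate(vcpu);	/* timer last, still with interrupts disabled */
	local_irq_enable();		/* only now can pending host interrupts be serviced */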
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
index 4734915ab71f..f39861639f08 100644
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ b/virt/kvm/arm/hyp/timer-sr.c
@@ -21,58 +21,48 @@
21 21
22#include <asm/kvm_hyp.h> 22#include <asm/kvm_hyp.h>
23 23
24/* vcpu is already in the HYP VA space */ 24void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
25void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) 25{
26 u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
27 write_sysreg(cntvoff, cntvoff_el2);
28}
29
30void __hyp_text enable_el1_phys_timer_access(void)
26{ 31{
27 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
28 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
29 u64 val; 32 u64 val;
30 33
31 if (timer->enabled) { 34 /* Allow physical timer/counter access for the host */
32 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 35 val = read_sysreg(cnthctl_el2);
33 vtimer->cnt_cval = read_sysreg_el0(cntv_cval); 36 val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
34 } 37 write_sysreg(val, cnthctl_el2);
38}
35 39
36 /* Disable the virtual timer */ 40void __hyp_text disable_el1_phys_timer_access(void)
37 write_sysreg_el0(0, cntv_ctl); 41{
42 u64 val;
38 43
39 /* 44 /*
45 * Disallow physical timer access for the guest
46 * Physical counter access is allowed
47 */
48 val = read_sysreg(cnthctl_el2);
49 val &= ~CNTHCTL_EL1PCEN;
50 val |= CNTHCTL_EL1PCTEN;
51 write_sysreg(val, cnthctl_el2);
52}
53
54void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
55{
56 /*
40 * We don't need to do this for VHE since the host kernel runs in EL2 57 * We don't need to do this for VHE since the host kernel runs in EL2
41 * with HCR_EL2.TGE == 1, which makes those bits have no impact. 58 * with HCR_EL2.TGE == 1, which makes those bits have no impact.
42 */ 59 */
43 if (!has_vhe()) { 60 if (!has_vhe())
44 /* Allow physical timer/counter access for the host */ 61 enable_el1_phys_timer_access();
45 val = read_sysreg(cnthctl_el2);
46 val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
47 write_sysreg(val, cnthctl_el2);
48 }
49
50 /* Clear cntvoff for the host */
51 write_sysreg(0, cntvoff_el2);
52} 62}
53 63
54void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) 64void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
55{ 65{
56 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 66 if (!has_vhe())
57 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 67 disable_el1_phys_timer_access();
58 u64 val;
59
60 /* Those bits are already configured at boot on VHE-system */
61 if (!has_vhe()) {
62 /*
63 * Disallow physical timer access for the guest
64 * Physical counter access is allowed
65 */
66 val = read_sysreg(cnthctl_el2);
67 val &= ~CNTHCTL_EL1PCEN;
68 val |= CNTHCTL_EL1PCTEN;
69 write_sysreg(val, cnthctl_el2);
70 }
71
72 if (timer->enabled) {
73 write_sysreg(vtimer->cntvoff, cntvoff_el2);
74 write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
75 isb();
76 write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
77 }
78} 68}
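With the register save/restore moved onto the vcpu_load/put path, the hyp timer code is reduced to trap management. A minimal sketch of how the remaining helpers are meant to bracket a non-VHE guest run (the switch.c hunks are not part of this excerpt, so the surrounding calls are assumptions):

	__timer_enable_traps(vcpu);	/* !VHE: deny the guest EL1 physical timer access */
	/* ... world switch: guest enters, executes, exits ... */
	__timer_disable_traps(vcpu);	/* !VHE: give physical timer access back to the host */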
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 547f12dc4d54..d2a99ab0ade7 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -278,6 +278,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
278 u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); 278 u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
279 u8 prop; 279 u8 prop;
280 int ret; 280 int ret;
281 unsigned long flags;
281 282
282 ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, 283 ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
283 &prop, 1); 284 &prop, 1);
@@ -285,15 +286,15 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
285 if (ret) 286 if (ret)
286 return ret; 287 return ret;
287 288
288 spin_lock(&irq->irq_lock); 289 spin_lock_irqsave(&irq->irq_lock, flags);
289 290
290 if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { 291 if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
291 irq->priority = LPI_PROP_PRIORITY(prop); 292 irq->priority = LPI_PROP_PRIORITY(prop);
292 irq->enabled = LPI_PROP_ENABLE_BIT(prop); 293 irq->enabled = LPI_PROP_ENABLE_BIT(prop);
293 294
294 vgic_queue_irq_unlock(kvm, irq); 295 vgic_queue_irq_unlock(kvm, irq, flags);
295 } else { 296 } else {
296 spin_unlock(&irq->irq_lock); 297 spin_unlock_irqrestore(&irq->irq_lock, flags);
297 } 298 }
298 299
299 return 0; 300 return 0;
@@ -393,6 +394,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
393 int ret = 0; 394 int ret = 0;
394 u32 *intids; 395 u32 *intids;
395 int nr_irqs, i; 396 int nr_irqs, i;
397 unsigned long flags;
396 398
397 nr_irqs = vgic_copy_lpi_list(vcpu, &intids); 399 nr_irqs = vgic_copy_lpi_list(vcpu, &intids);
398 if (nr_irqs < 0) 400 if (nr_irqs < 0)
@@ -420,9 +422,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
420 } 422 }
421 423
422 irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); 424 irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
423 spin_lock(&irq->irq_lock); 425 spin_lock_irqsave(&irq->irq_lock, flags);
424 irq->pending_latch = pendmask & (1U << bit_nr); 426 irq->pending_latch = pendmask & (1U << bit_nr);
425 vgic_queue_irq_unlock(vcpu->kvm, irq); 427 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
426 vgic_put_irq(vcpu->kvm, irq); 428 vgic_put_irq(vcpu->kvm, irq);
427 } 429 }
428 430
@@ -515,6 +517,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
515{ 517{
516 struct kvm_vcpu *vcpu; 518 struct kvm_vcpu *vcpu;
517 struct its_ite *ite; 519 struct its_ite *ite;
520 unsigned long flags;
518 521
519 if (!its->enabled) 522 if (!its->enabled)
520 return -EBUSY; 523 return -EBUSY;
@@ -530,9 +533,9 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
530 if (!vcpu->arch.vgic_cpu.lpis_enabled) 533 if (!vcpu->arch.vgic_cpu.lpis_enabled)
531 return -EBUSY; 534 return -EBUSY;
532 535
533 spin_lock(&ite->irq->irq_lock); 536 spin_lock_irqsave(&ite->irq->irq_lock, flags);
534 ite->irq->pending_latch = true; 537 ite->irq->pending_latch = true;
535 vgic_queue_irq_unlock(kvm, ite->irq); 538 vgic_queue_irq_unlock(kvm, ite->irq, flags);
536 539
537 return 0; 540 return 0;
538} 541}
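The recurring change in this file is the locking idiom: irq_lock is now taken with spin_lock_irqsave(), and the saved flags are handed to vgic_queue_irq_unlock(), which drops the lock and restores the interrupt state itself. In condensed form, taken from the MSI injection path above:

	unsigned long flags;

	spin_lock_irqsave(&irq->irq_lock, flags);
	irq->pending_latch = true;
	vgic_queue_irq_unlock(kvm, irq, flags);	/* releases irq_lock and restores flags */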
@@ -894,7 +897,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
894} 897}
895 898
896/* Requires the its_lock to be held. */ 899/* Requires the its_lock to be held. */
897static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) 900static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
898{ 901{
899 struct its_ite *ite, *temp; 902 struct its_ite *ite, *temp;
900 903
@@ -910,6 +913,24 @@ static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
910 kfree(device); 913 kfree(device);
911} 914}
912 915
916/* its lock must be held */
917static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
918{
919 struct its_device *cur, *temp;
920
921 list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
922 vgic_its_free_device(kvm, cur);
923}
924
925/* its lock must be held */
926static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
927{
928 struct its_collection *cur, *temp;
929
930 list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
931 vgic_its_free_collection(its, cur->collection_id);
932}
933
913/* Must be called with its_lock mutex held */ 934/* Must be called with its_lock mutex held */
914static struct its_device *vgic_its_alloc_device(struct vgic_its *its, 935static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
915 u32 device_id, gpa_t itt_addr, 936 u32 device_id, gpa_t itt_addr,
@@ -957,7 +978,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
957 * by removing the mapping and re-establishing it. 978 * by removing the mapping and re-establishing it.
958 */ 979 */
959 if (device) 980 if (device)
960 vgic_its_unmap_device(kvm, device); 981 vgic_its_free_device(kvm, device);
961 982
962 /* 983 /*
963 * The spec does not say whether unmapping a not-mapped device 984 * The spec does not say whether unmapping a not-mapped device
@@ -1410,7 +1431,7 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm,
1410 unsigned long val) 1431 unsigned long val)
1411{ 1432{
1412 const struct vgic_its_abi *abi = vgic_its_get_abi(its); 1433 const struct vgic_its_abi *abi = vgic_its_get_abi(its);
1413 u64 entry_size, device_type; 1434 u64 entry_size, table_type;
1414 u64 reg, *regptr, clearbits = 0; 1435 u64 reg, *regptr, clearbits = 0;
1415 1436
1416 /* When GITS_CTLR.Enable is 1, we ignore write accesses. */ 1437 /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
@@ -1421,12 +1442,12 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm,
1421 case 0: 1442 case 0:
1422 regptr = &its->baser_device_table; 1443 regptr = &its->baser_device_table;
1423 entry_size = abi->dte_esz; 1444 entry_size = abi->dte_esz;
1424 device_type = GITS_BASER_TYPE_DEVICE; 1445 table_type = GITS_BASER_TYPE_DEVICE;
1425 break; 1446 break;
1426 case 1: 1447 case 1:
1427 regptr = &its->baser_coll_table; 1448 regptr = &its->baser_coll_table;
1428 entry_size = abi->cte_esz; 1449 entry_size = abi->cte_esz;
1429 device_type = GITS_BASER_TYPE_COLLECTION; 1450 table_type = GITS_BASER_TYPE_COLLECTION;
1430 clearbits = GITS_BASER_INDIRECT; 1451 clearbits = GITS_BASER_INDIRECT;
1431 break; 1452 break;
1432 default: 1453 default:
@@ -1438,10 +1459,24 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm,
1438 reg &= ~clearbits; 1459 reg &= ~clearbits;
1439 1460
1440 reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; 1461 reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
1441 reg |= device_type << GITS_BASER_TYPE_SHIFT; 1462 reg |= table_type << GITS_BASER_TYPE_SHIFT;
1442 reg = vgic_sanitise_its_baser(reg); 1463 reg = vgic_sanitise_its_baser(reg);
1443 1464
1444 *regptr = reg; 1465 *regptr = reg;
1466
1467 if (!(reg & GITS_BASER_VALID)) {
1468 /* Take the its_lock to prevent a race with a save/restore */
1469 mutex_lock(&its->its_lock);
1470 switch (table_type) {
1471 case GITS_BASER_TYPE_DEVICE:
1472 vgic_its_free_device_list(kvm, its);
1473 break;
1474 case GITS_BASER_TYPE_COLLECTION:
1475 vgic_its_free_collection_list(kvm, its);
1476 break;
1477 }
1478 mutex_unlock(&its->its_lock);
1479 }
1445} 1480}
1446 1481
1447static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, 1482static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
@@ -1623,46 +1658,17 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
1623 return vgic_its_set_abi(its, NR_ITS_ABIS - 1); 1658 return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
1624} 1659}
1625 1660
1626static void vgic_its_free_device(struct kvm *kvm, struct its_device *dev)
1627{
1628 struct its_ite *ite, *tmp;
1629
1630 list_for_each_entry_safe(ite, tmp, &dev->itt_head, ite_list)
1631 its_free_ite(kvm, ite);
1632 list_del(&dev->dev_list);
1633 kfree(dev);
1634}
1635
1636static void vgic_its_destroy(struct kvm_device *kvm_dev) 1661static void vgic_its_destroy(struct kvm_device *kvm_dev)
1637{ 1662{
1638 struct kvm *kvm = kvm_dev->kvm; 1663 struct kvm *kvm = kvm_dev->kvm;
1639 struct vgic_its *its = kvm_dev->private; 1664 struct vgic_its *its = kvm_dev->private;
1640 struct list_head *cur, *temp;
1641
1642 /*
1643 * We may end up here without the lists ever having been initialized.
1644 * Check this and bail out early to avoid dereferencing a NULL pointer.
1645 */
1646 if (!its->device_list.next)
1647 return;
1648 1665
1649 mutex_lock(&its->its_lock); 1666 mutex_lock(&its->its_lock);
1650 list_for_each_safe(cur, temp, &its->device_list) {
1651 struct its_device *dev;
1652 1667
1653 dev = list_entry(cur, struct its_device, dev_list); 1668 vgic_its_free_device_list(kvm, its);
1654 vgic_its_free_device(kvm, dev); 1669 vgic_its_free_collection_list(kvm, its);
1655 }
1656 1670
1657 list_for_each_safe(cur, temp, &its->collection_list) {
1658 struct its_collection *coll;
1659
1660 coll = list_entry(cur, struct its_collection, coll_list);
1661 list_del(cur);
1662 kfree(coll);
1663 }
1664 mutex_unlock(&its->its_lock); 1671 mutex_unlock(&its->its_lock);
1665
1666 kfree(its); 1672 kfree(its);
1667} 1673}
1668 1674
@@ -2290,29 +2296,13 @@ static int vgic_its_restore_collection_table(struct vgic_its *its)
2290 */ 2296 */
2291static int vgic_its_save_tables_v0(struct vgic_its *its) 2297static int vgic_its_save_tables_v0(struct vgic_its *its)
2292{ 2298{
2293 struct kvm *kvm = its->dev->kvm;
2294 int ret; 2299 int ret;
2295 2300
2296 mutex_lock(&kvm->lock);
2297 mutex_lock(&its->its_lock);
2298
2299 if (!lock_all_vcpus(kvm)) {
2300 mutex_unlock(&its->its_lock);
2301 mutex_unlock(&kvm->lock);
2302 return -EBUSY;
2303 }
2304
2305 ret = vgic_its_save_device_tables(its); 2301 ret = vgic_its_save_device_tables(its);
2306 if (ret) 2302 if (ret)
2307 goto out; 2303 return ret;
2308
2309 ret = vgic_its_save_collection_table(its);
2310 2304
2311out: 2305 return vgic_its_save_collection_table(its);
2312 unlock_all_vcpus(kvm);
2313 mutex_unlock(&its->its_lock);
2314 mutex_unlock(&kvm->lock);
2315 return ret;
2316} 2306}
2317 2307
2318/** 2308/**
@@ -2322,29 +2312,13 @@ out:
2322 */ 2312 */
2323static int vgic_its_restore_tables_v0(struct vgic_its *its) 2313static int vgic_its_restore_tables_v0(struct vgic_its *its)
2324{ 2314{
2325 struct kvm *kvm = its->dev->kvm;
2326 int ret; 2315 int ret;
2327 2316
2328 mutex_lock(&kvm->lock);
2329 mutex_lock(&its->its_lock);
2330
2331 if (!lock_all_vcpus(kvm)) {
2332 mutex_unlock(&its->its_lock);
2333 mutex_unlock(&kvm->lock);
2334 return -EBUSY;
2335 }
2336
2337 ret = vgic_its_restore_collection_table(its); 2317 ret = vgic_its_restore_collection_table(its);
2338 if (ret) 2318 if (ret)
2339 goto out; 2319 return ret;
2340
2341 ret = vgic_its_restore_device_tables(its);
2342out:
2343 unlock_all_vcpus(kvm);
2344 mutex_unlock(&its->its_lock);
2345 mutex_unlock(&kvm->lock);
2346 2320
2347 return ret; 2321 return vgic_its_restore_device_tables(its);
2348} 2322}
2349 2323
2350static int vgic_its_commit_v0(struct vgic_its *its) 2324static int vgic_its_commit_v0(struct vgic_its *its)
@@ -2363,6 +2337,19 @@ static int vgic_its_commit_v0(struct vgic_its *its)
2363 return 0; 2337 return 0;
2364} 2338}
2365 2339
2340static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
2341{
2342 /* We need to keep the ABI specific field values */
2343 its->baser_coll_table &= ~GITS_BASER_VALID;
2344 its->baser_device_table &= ~GITS_BASER_VALID;
2345 its->cbaser = 0;
2346 its->creadr = 0;
2347 its->cwriter = 0;
2348 its->enabled = 0;
2349 vgic_its_free_device_list(kvm, its);
2350 vgic_its_free_collection_list(kvm, its);
2351}
2352
2366static int vgic_its_has_attr(struct kvm_device *dev, 2353static int vgic_its_has_attr(struct kvm_device *dev,
2367 struct kvm_device_attr *attr) 2354 struct kvm_device_attr *attr)
2368{ 2355{
@@ -2377,6 +2364,8 @@ static int vgic_its_has_attr(struct kvm_device *dev,
2377 switch (attr->attr) { 2364 switch (attr->attr) {
2378 case KVM_DEV_ARM_VGIC_CTRL_INIT: 2365 case KVM_DEV_ARM_VGIC_CTRL_INIT:
2379 return 0; 2366 return 0;
2367 case KVM_DEV_ARM_ITS_CTRL_RESET:
2368 return 0;
2380 case KVM_DEV_ARM_ITS_SAVE_TABLES: 2369 case KVM_DEV_ARM_ITS_SAVE_TABLES:
2381 return 0; 2370 return 0;
2382 case KVM_DEV_ARM_ITS_RESTORE_TABLES: 2371 case KVM_DEV_ARM_ITS_RESTORE_TABLES:
@@ -2389,6 +2378,41 @@ static int vgic_its_has_attr(struct kvm_device *dev,
2389 return -ENXIO; 2378 return -ENXIO;
2390} 2379}
2391 2380
2381static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
2382{
2383 const struct vgic_its_abi *abi = vgic_its_get_abi(its);
2384 int ret = 0;
2385
2386 if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
2387 return 0;
2388
2389 mutex_lock(&kvm->lock);
2390 mutex_lock(&its->its_lock);
2391
2392 if (!lock_all_vcpus(kvm)) {
2393 mutex_unlock(&its->its_lock);
2394 mutex_unlock(&kvm->lock);
2395 return -EBUSY;
2396 }
2397
2398 switch (attr) {
2399 case KVM_DEV_ARM_ITS_CTRL_RESET:
2400 vgic_its_reset(kvm, its);
2401 break;
2402 case KVM_DEV_ARM_ITS_SAVE_TABLES:
2403 ret = abi->save_tables(its);
2404 break;
2405 case KVM_DEV_ARM_ITS_RESTORE_TABLES:
2406 ret = abi->restore_tables(its);
2407 break;
2408 }
2409
2410 unlock_all_vcpus(kvm);
2411 mutex_unlock(&its->its_lock);
2412 mutex_unlock(&kvm->lock);
2413 return ret;
2414}
2415
2392static int vgic_its_set_attr(struct kvm_device *dev, 2416static int vgic_its_set_attr(struct kvm_device *dev,
2393 struct kvm_device_attr *attr) 2417 struct kvm_device_attr *attr)
2394{ 2418{
@@ -2414,19 +2438,8 @@ static int vgic_its_set_attr(struct kvm_device *dev,
2414 2438
2415 return vgic_register_its_iodev(dev->kvm, its, addr); 2439 return vgic_register_its_iodev(dev->kvm, its, addr);
2416 } 2440 }
2417 case KVM_DEV_ARM_VGIC_GRP_CTRL: { 2441 case KVM_DEV_ARM_VGIC_GRP_CTRL:
2418 const struct vgic_its_abi *abi = vgic_its_get_abi(its); 2442 return vgic_its_ctrl(dev->kvm, its, attr->attr);
2419
2420 switch (attr->attr) {
2421 case KVM_DEV_ARM_VGIC_CTRL_INIT:
2422 /* Nothing to do */
2423 return 0;
2424 case KVM_DEV_ARM_ITS_SAVE_TABLES:
2425 return abi->save_tables(its);
2426 case KVM_DEV_ARM_ITS_RESTORE_TABLES:
2427 return abi->restore_tables(its);
2428 }
2429 }
2430 case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { 2443 case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
2431 u64 __user *uaddr = (u64 __user *)(long)attr->addr; 2444 u64 __user *uaddr = (u64 __user *)(long)attr->addr;
2432 u64 reg; 2445 u64 reg;
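
The hunks above add a KVM_DEV_ARM_ITS_CTRL_RESET control and funnel every KVM_DEV_ARM_VGIC_GRP_CTRL attribute through the new vgic_its_ctrl() helper, which takes kvm->lock, its_lock and all vCPU locks before resetting or saving/restoring the ITS. As a hedged illustration only (not part of the patch), userspace could exercise the new attribute roughly as follows; "its_fd" and the helper name are invented, while the group/attr constants and the -EBUSY case come from the diff above.

/*
 * Hypothetical userspace sketch: request an ITS reset on a vITS device fd
 * previously obtained via KVM_CREATE_DEVICE.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int its_request_reset(int its_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CTRL,
		.attr  = KVM_DEV_ARM_ITS_CTRL_RESET,
	};

	/* 0 on success; -1 with errno set (e.g. EBUSY if the vCPUs cannot be locked). */
	return ioctl(its_fd, KVM_SET_DEVICE_ATTR, &attr);
}

On the kernel side this lands in vgic_its_reset(), which clears the BASER valid bits, zeroes CBASER/CREADR/CWRITER, disables the ITS and frees the cached device and collection lists, all while the vCPUs are held.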
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index b3d4a10f09a1..e21e2f49b005 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -74,6 +74,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
74 int mode = (val >> 24) & 0x03; 74 int mode = (val >> 24) & 0x03;
75 int c; 75 int c;
76 struct kvm_vcpu *vcpu; 76 struct kvm_vcpu *vcpu;
77 unsigned long flags;
77 78
78 switch (mode) { 79 switch (mode) {
79 case 0x0: /* as specified by targets */ 80 case 0x0: /* as specified by targets */
@@ -97,11 +98,11 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
97 98
98 irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); 99 irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
99 100
100 spin_lock(&irq->irq_lock); 101 spin_lock_irqsave(&irq->irq_lock, flags);
101 irq->pending_latch = true; 102 irq->pending_latch = true;
102 irq->source |= 1U << source_vcpu->vcpu_id; 103 irq->source |= 1U << source_vcpu->vcpu_id;
103 104
104 vgic_queue_irq_unlock(source_vcpu->kvm, irq); 105 vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
105 vgic_put_irq(source_vcpu->kvm, irq); 106 vgic_put_irq(source_vcpu->kvm, irq);
106 } 107 }
107} 108}
@@ -131,6 +132,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
131 u32 intid = VGIC_ADDR_TO_INTID(addr, 8); 132 u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
132 u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); 133 u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
133 int i; 134 int i;
135 unsigned long flags;
134 136
135 /* GICD_ITARGETSR[0-7] are read-only */ 137 /* GICD_ITARGETSR[0-7] are read-only */
136 if (intid < VGIC_NR_PRIVATE_IRQS) 138 if (intid < VGIC_NR_PRIVATE_IRQS)
@@ -140,13 +142,13 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
140 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); 142 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
141 int target; 143 int target;
142 144
143 spin_lock(&irq->irq_lock); 145 spin_lock_irqsave(&irq->irq_lock, flags);
144 146
145 irq->targets = (val >> (i * 8)) & cpu_mask; 147 irq->targets = (val >> (i * 8)) & cpu_mask;
146 target = irq->targets ? __ffs(irq->targets) : 0; 148 target = irq->targets ? __ffs(irq->targets) : 0;
147 irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); 149 irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
148 150
149 spin_unlock(&irq->irq_lock); 151 spin_unlock_irqrestore(&irq->irq_lock, flags);
150 vgic_put_irq(vcpu->kvm, irq); 152 vgic_put_irq(vcpu->kvm, irq);
151 } 153 }
152} 154}
@@ -174,17 +176,18 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
174{ 176{
175 u32 intid = addr & 0x0f; 177 u32 intid = addr & 0x0f;
176 int i; 178 int i;
179 unsigned long flags;
177 180
178 for (i = 0; i < len; i++) { 181 for (i = 0; i < len; i++) {
179 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 182 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
180 183
181 spin_lock(&irq->irq_lock); 184 spin_lock_irqsave(&irq->irq_lock, flags);
182 185
183 irq->source &= ~((val >> (i * 8)) & 0xff); 186 irq->source &= ~((val >> (i * 8)) & 0xff);
184 if (!irq->source) 187 if (!irq->source)
185 irq->pending_latch = false; 188 irq->pending_latch = false;
186 189
187 spin_unlock(&irq->irq_lock); 190 spin_unlock_irqrestore(&irq->irq_lock, flags);
188 vgic_put_irq(vcpu->kvm, irq); 191 vgic_put_irq(vcpu->kvm, irq);
189 } 192 }
190} 193}
@@ -195,19 +198,20 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
195{ 198{
196 u32 intid = addr & 0x0f; 199 u32 intid = addr & 0x0f;
197 int i; 200 int i;
201 unsigned long flags;
198 202
199 for (i = 0; i < len; i++) { 203 for (i = 0; i < len; i++) {
200 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 204 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
201 205
202 spin_lock(&irq->irq_lock); 206 spin_lock_irqsave(&irq->irq_lock, flags);
203 207
204 irq->source |= (val >> (i * 8)) & 0xff; 208 irq->source |= (val >> (i * 8)) & 0xff;
205 209
206 if (irq->source) { 210 if (irq->source) {
207 irq->pending_latch = true; 211 irq->pending_latch = true;
208 vgic_queue_irq_unlock(vcpu->kvm, irq); 212 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
209 } else { 213 } else {
210 spin_unlock(&irq->irq_lock); 214 spin_unlock_irqrestore(&irq->irq_lock, flags);
211 } 215 }
212 vgic_put_irq(vcpu->kvm, irq); 216 vgic_put_irq(vcpu->kvm, irq);
213 } 217 }
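
The conversions in this file, and the analogous ones in the v3 and generic MMIO handlers that follow, all apply one pattern: declare a local flags word, take irq->irq_lock with spin_lock_irqsave(), and then either release it with spin_unlock_irqrestore() or hand the saved flags to vgic_queue_irq_unlock(), which now drops the lock and restores the interrupt state itself. A minimal sketch of that shape (the handler name is made up; the locking calls and the vgic_queue_irq_unlock() signature match the diff):

static void example_write_pending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
	unsigned long flags;

	spin_lock_irqsave(&irq->irq_lock, flags);
	irq->pending_latch = true;

	/* Drops irq_lock and restores the saved interrupt state on every exit path. */
	vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
}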
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 408ef06638fc..83786108829e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -129,6 +129,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
129{ 129{
130 int intid = VGIC_ADDR_TO_INTID(addr, 64); 130 int intid = VGIC_ADDR_TO_INTID(addr, 64);
131 struct vgic_irq *irq; 131 struct vgic_irq *irq;
132 unsigned long flags;
132 133
133 /* The upper word is WI for us since we don't implement Aff3. */ 134 /* The upper word is WI for us since we don't implement Aff3. */
134 if (addr & 4) 135 if (addr & 4)
@@ -139,13 +140,13 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
139 if (!irq) 140 if (!irq)
140 return; 141 return;
141 142
142 spin_lock(&irq->irq_lock); 143 spin_lock_irqsave(&irq->irq_lock, flags);
143 144
144 /* We only care about and preserve Aff0, Aff1 and Aff2. */ 145 /* We only care about and preserve Aff0, Aff1 and Aff2. */
145 irq->mpidr = val & GENMASK(23, 0); 146 irq->mpidr = val & GENMASK(23, 0);
146 irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); 147 irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
147 148
148 spin_unlock(&irq->irq_lock); 149 spin_unlock_irqrestore(&irq->irq_lock, flags);
149 vgic_put_irq(vcpu->kvm, irq); 150 vgic_put_irq(vcpu->kvm, irq);
150} 151}
151 152
@@ -241,11 +242,12 @@ static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
241{ 242{
242 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 243 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
243 int i; 244 int i;
245 unsigned long flags;
244 246
245 for (i = 0; i < len * 8; i++) { 247 for (i = 0; i < len * 8; i++) {
246 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 248 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
247 249
248 spin_lock(&irq->irq_lock); 250 spin_lock_irqsave(&irq->irq_lock, flags);
249 if (test_bit(i, &val)) { 251 if (test_bit(i, &val)) {
250 /* 252 /*
251 * pending_latch is set irrespective of irq type 253 * pending_latch is set irrespective of irq type
@@ -253,10 +255,10 @@ static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
253 * restore irq config before pending info. 255 * restore irq config before pending info.
254 */ 256 */
255 irq->pending_latch = true; 257 irq->pending_latch = true;
256 vgic_queue_irq_unlock(vcpu->kvm, irq); 258 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
257 } else { 259 } else {
258 irq->pending_latch = false; 260 irq->pending_latch = false;
259 spin_unlock(&irq->irq_lock); 261 spin_unlock_irqrestore(&irq->irq_lock, flags);
260 } 262 }
261 263
262 vgic_put_irq(vcpu->kvm, irq); 264 vgic_put_irq(vcpu->kvm, irq);
@@ -799,6 +801,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
799 int sgi, c; 801 int sgi, c;
800 int vcpu_id = vcpu->vcpu_id; 802 int vcpu_id = vcpu->vcpu_id;
801 bool broadcast; 803 bool broadcast;
804 unsigned long flags;
802 805
803 sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; 806 sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
804 broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); 807 broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
@@ -837,10 +840,10 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
837 840
838 irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); 841 irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
839 842
840 spin_lock(&irq->irq_lock); 843 spin_lock_irqsave(&irq->irq_lock, flags);
841 irq->pending_latch = true; 844 irq->pending_latch = true;
842 845
843 vgic_queue_irq_unlock(vcpu->kvm, irq); 846 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
844 vgic_put_irq(vcpu->kvm, irq); 847 vgic_put_irq(vcpu->kvm, irq);
845 } 848 }
846} 849}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index c1e4bdd66131..deb51ee16a3d 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -69,13 +69,14 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
69{ 69{
70 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 70 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
71 int i; 71 int i;
72 unsigned long flags;
72 73
73 for_each_set_bit(i, &val, len * 8) { 74 for_each_set_bit(i, &val, len * 8) {
74 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 75 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
75 76
76 spin_lock(&irq->irq_lock); 77 spin_lock_irqsave(&irq->irq_lock, flags);
77 irq->enabled = true; 78 irq->enabled = true;
78 vgic_queue_irq_unlock(vcpu->kvm, irq); 79 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
79 80
80 vgic_put_irq(vcpu->kvm, irq); 81 vgic_put_irq(vcpu->kvm, irq);
81 } 82 }
@@ -87,15 +88,16 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
87{ 88{
88 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 89 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
89 int i; 90 int i;
91 unsigned long flags;
90 92
91 for_each_set_bit(i, &val, len * 8) { 93 for_each_set_bit(i, &val, len * 8) {
92 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 94 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
93 95
94 spin_lock(&irq->irq_lock); 96 spin_lock_irqsave(&irq->irq_lock, flags);
95 97
96 irq->enabled = false; 98 irq->enabled = false;
97 99
98 spin_unlock(&irq->irq_lock); 100 spin_unlock_irqrestore(&irq->irq_lock, flags);
99 vgic_put_irq(vcpu->kvm, irq); 101 vgic_put_irq(vcpu->kvm, irq);
100 } 102 }
101} 103}
@@ -126,14 +128,15 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
126{ 128{
127 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 129 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
128 int i; 130 int i;
131 unsigned long flags;
129 132
130 for_each_set_bit(i, &val, len * 8) { 133 for_each_set_bit(i, &val, len * 8) {
131 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 134 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
132 135
133 spin_lock(&irq->irq_lock); 136 spin_lock_irqsave(&irq->irq_lock, flags);
134 irq->pending_latch = true; 137 irq->pending_latch = true;
135 138
136 vgic_queue_irq_unlock(vcpu->kvm, irq); 139 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
137 vgic_put_irq(vcpu->kvm, irq); 140 vgic_put_irq(vcpu->kvm, irq);
138 } 141 }
139} 142}
@@ -144,15 +147,16 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
144{ 147{
145 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 148 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
146 int i; 149 int i;
150 unsigned long flags;
147 151
148 for_each_set_bit(i, &val, len * 8) { 152 for_each_set_bit(i, &val, len * 8) {
149 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 153 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
150 154
151 spin_lock(&irq->irq_lock); 155 spin_lock_irqsave(&irq->irq_lock, flags);
152 156
153 irq->pending_latch = false; 157 irq->pending_latch = false;
154 158
155 spin_unlock(&irq->irq_lock); 159 spin_unlock_irqrestore(&irq->irq_lock, flags);
156 vgic_put_irq(vcpu->kvm, irq); 160 vgic_put_irq(vcpu->kvm, irq);
157 } 161 }
158} 162}
@@ -181,7 +185,8 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
181 bool new_active_state) 185 bool new_active_state)
182{ 186{
183 struct kvm_vcpu *requester_vcpu; 187 struct kvm_vcpu *requester_vcpu;
184 spin_lock(&irq->irq_lock); 188 unsigned long flags;
189 spin_lock_irqsave(&irq->irq_lock, flags);
185 190
186 /* 191 /*
187 * The vcpu parameter here can mean multiple things depending on how 192 * The vcpu parameter here can mean multiple things depending on how
@@ -216,9 +221,9 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
216 221
217 irq->active = new_active_state; 222 irq->active = new_active_state;
218 if (new_active_state) 223 if (new_active_state)
219 vgic_queue_irq_unlock(vcpu->kvm, irq); 224 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
220 else 225 else
221 spin_unlock(&irq->irq_lock); 226 spin_unlock_irqrestore(&irq->irq_lock, flags);
222} 227}
223 228
224/* 229/*
@@ -352,14 +357,15 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
352{ 357{
353 u32 intid = VGIC_ADDR_TO_INTID(addr, 8); 358 u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
354 int i; 359 int i;
360 unsigned long flags;
355 361
356 for (i = 0; i < len; i++) { 362 for (i = 0; i < len; i++) {
357 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 363 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
358 364
359 spin_lock(&irq->irq_lock); 365 spin_lock_irqsave(&irq->irq_lock, flags);
360 /* Narrow the priority range to what we actually support */ 366 /* Narrow the priority range to what we actually support */
361 irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); 367 irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
362 spin_unlock(&irq->irq_lock); 368 spin_unlock_irqrestore(&irq->irq_lock, flags);
363 369
364 vgic_put_irq(vcpu->kvm, irq); 370 vgic_put_irq(vcpu->kvm, irq);
365 } 371 }
@@ -390,6 +396,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
390{ 396{
391 u32 intid = VGIC_ADDR_TO_INTID(addr, 2); 397 u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
392 int i; 398 int i;
399 unsigned long flags;
393 400
394 for (i = 0; i < len * 4; i++) { 401 for (i = 0; i < len * 4; i++) {
395 struct vgic_irq *irq; 402 struct vgic_irq *irq;
@@ -404,14 +411,14 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
404 continue; 411 continue;
405 412
406 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 413 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
407 spin_lock(&irq->irq_lock); 414 spin_lock_irqsave(&irq->irq_lock, flags);
408 415
409 if (test_bit(i * 2 + 1, &val)) 416 if (test_bit(i * 2 + 1, &val))
410 irq->config = VGIC_CONFIG_EDGE; 417 irq->config = VGIC_CONFIG_EDGE;
411 else 418 else
412 irq->config = VGIC_CONFIG_LEVEL; 419 irq->config = VGIC_CONFIG_LEVEL;
413 420
414 spin_unlock(&irq->irq_lock); 421 spin_unlock_irqrestore(&irq->irq_lock, flags);
415 vgic_put_irq(vcpu->kvm, irq); 422 vgic_put_irq(vcpu->kvm, irq);
416 } 423 }
417} 424}
@@ -443,6 +450,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
443{ 450{
444 int i; 451 int i;
445 int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; 452 int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
453 unsigned long flags;
446 454
447 for (i = 0; i < 32; i++) { 455 for (i = 0; i < 32; i++) {
448 struct vgic_irq *irq; 456 struct vgic_irq *irq;
@@ -459,12 +467,12 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
459 * restore irq config before line level. 467 * restore irq config before line level.
460 */ 468 */
461 new_level = !!(val & (1U << i)); 469 new_level = !!(val & (1U << i));
462 spin_lock(&irq->irq_lock); 470 spin_lock_irqsave(&irq->irq_lock, flags);
463 irq->line_level = new_level; 471 irq->line_level = new_level;
464 if (new_level) 472 if (new_level)
465 vgic_queue_irq_unlock(vcpu->kvm, irq); 473 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
466 else 474 else
467 spin_unlock(&irq->irq_lock); 475 spin_unlock_irqrestore(&irq->irq_lock, flags);
468 476
469 vgic_put_irq(vcpu->kvm, irq); 477 vgic_put_irq(vcpu->kvm, irq);
470 } 478 }
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index e4187e52bb26..80897102da26 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -62,6 +62,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
62 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 62 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
63 struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; 63 struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
64 int lr; 64 int lr;
65 unsigned long flags;
65 66
66 cpuif->vgic_hcr &= ~GICH_HCR_UIE; 67 cpuif->vgic_hcr &= ~GICH_HCR_UIE;
67 68
@@ -77,7 +78,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
77 78
78 irq = vgic_get_irq(vcpu->kvm, vcpu, intid); 79 irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
79 80
80 spin_lock(&irq->irq_lock); 81 spin_lock_irqsave(&irq->irq_lock, flags);
81 82
82 /* Always preserve the active bit */ 83 /* Always preserve the active bit */
83 irq->active = !!(val & GICH_LR_ACTIVE_BIT); 84 irq->active = !!(val & GICH_LR_ACTIVE_BIT);
@@ -104,7 +105,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
104 irq->pending_latch = false; 105 irq->pending_latch = false;
105 } 106 }
106 107
107 spin_unlock(&irq->irq_lock); 108 spin_unlock_irqrestore(&irq->irq_lock, flags);
108 vgic_put_irq(vcpu->kvm, irq); 109 vgic_put_irq(vcpu->kvm, irq);
109 } 110 }
110 111
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 96ea597db0e7..863351c090d8 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -44,6 +44,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
44 struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; 44 struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
45 u32 model = vcpu->kvm->arch.vgic.vgic_model; 45 u32 model = vcpu->kvm->arch.vgic.vgic_model;
46 int lr; 46 int lr;
47 unsigned long flags;
47 48
48 cpuif->vgic_hcr &= ~ICH_HCR_UIE; 49 cpuif->vgic_hcr &= ~ICH_HCR_UIE;
49 50
@@ -66,7 +67,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
66 if (!irq) /* An LPI could have been unmapped. */ 67 if (!irq) /* An LPI could have been unmapped. */
67 continue; 68 continue;
68 69
69 spin_lock(&irq->irq_lock); 70 spin_lock_irqsave(&irq->irq_lock, flags);
70 71
71 /* Always preserve the active bit */ 72 /* Always preserve the active bit */
72 irq->active = !!(val & ICH_LR_ACTIVE_BIT); 73 irq->active = !!(val & ICH_LR_ACTIVE_BIT);
@@ -94,7 +95,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
94 irq->pending_latch = false; 95 irq->pending_latch = false;
95 } 96 }
96 97
97 spin_unlock(&irq->irq_lock); 98 spin_unlock_irqrestore(&irq->irq_lock, flags);
98 vgic_put_irq(vcpu->kvm, irq); 99 vgic_put_irq(vcpu->kvm, irq);
99 } 100 }
100 101
@@ -278,6 +279,7 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
278 bool status; 279 bool status;
279 u8 val; 280 u8 val;
280 int ret; 281 int ret;
282 unsigned long flags;
281 283
282retry: 284retry:
283 vcpu = irq->target_vcpu; 285 vcpu = irq->target_vcpu;
@@ -296,13 +298,13 @@ retry:
296 298
297 status = val & (1 << bit_nr); 299 status = val & (1 << bit_nr);
298 300
299 spin_lock(&irq->irq_lock); 301 spin_lock_irqsave(&irq->irq_lock, flags);
300 if (irq->target_vcpu != vcpu) { 302 if (irq->target_vcpu != vcpu) {
301 spin_unlock(&irq->irq_lock); 303 spin_unlock_irqrestore(&irq->irq_lock, flags);
302 goto retry; 304 goto retry;
303 } 305 }
304 irq->pending_latch = status; 306 irq->pending_latch = status;
305 vgic_queue_irq_unlock(vcpu->kvm, irq); 307 vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
306 308
307 if (status) { 309 if (status) {
308 /* clear consumed data */ 310 /* clear consumed data */
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index fed717e07938..e54ef2fdf73d 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -53,6 +53,10 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
53 * vcpuX->vcpu_id < vcpuY->vcpu_id: 53 * vcpuX->vcpu_id < vcpuY->vcpu_id:
54 * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); 54 * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
55 * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); 55 * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
56 *
57 * Since the VGIC must support injecting virtual interrupts from ISRs, we have
58 * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
59 * spinlocks for any lock that may be taken while injecting an interrupt.
56 */ 60 */
57 61
58/* 62/*
@@ -261,7 +265,8 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
261 * Needs to be entered with the IRQ lock already held, but will return 265 * Needs to be entered with the IRQ lock already held, but will return
262 * with all locks dropped. 266 * with all locks dropped.
263 */ 267 */
264bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) 268bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
269 unsigned long flags)
265{ 270{
266 struct kvm_vcpu *vcpu; 271 struct kvm_vcpu *vcpu;
267 272
@@ -279,7 +284,7 @@ retry:
279 * not need to be inserted into an ap_list and there is also 284 * not need to be inserted into an ap_list and there is also
280 * no more work for us to do. 285 * no more work for us to do.
281 */ 286 */
282 spin_unlock(&irq->irq_lock); 287 spin_unlock_irqrestore(&irq->irq_lock, flags);
283 288
284 /* 289 /*
285 * We have to kick the VCPU here, because we could be 290 * We have to kick the VCPU here, because we could be
@@ -301,11 +306,11 @@ retry:
301 * We must unlock the irq lock to take the ap_list_lock where 306 * We must unlock the irq lock to take the ap_list_lock where
302 * we are going to insert this new pending interrupt. 307 * we are going to insert this new pending interrupt.
303 */ 308 */
304 spin_unlock(&irq->irq_lock); 309 spin_unlock_irqrestore(&irq->irq_lock, flags);
305 310
306 /* someone can do stuff here, which we re-check below */ 311 /* someone can do stuff here, which we re-check below */
307 312
308 spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); 313 spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
309 spin_lock(&irq->irq_lock); 314 spin_lock(&irq->irq_lock);
310 315
311 /* 316 /*
@@ -322,9 +327,9 @@ retry:
322 327
323 if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { 328 if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
324 spin_unlock(&irq->irq_lock); 329 spin_unlock(&irq->irq_lock);
325 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); 330 spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
326 331
327 spin_lock(&irq->irq_lock); 332 spin_lock_irqsave(&irq->irq_lock, flags);
328 goto retry; 333 goto retry;
329 } 334 }
330 335
@@ -337,7 +342,7 @@ retry:
337 irq->vcpu = vcpu; 342 irq->vcpu = vcpu;
338 343
339 spin_unlock(&irq->irq_lock); 344 spin_unlock(&irq->irq_lock);
340 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); 345 spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
341 346
342 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); 347 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
343 kvm_vcpu_kick(vcpu); 348 kvm_vcpu_kick(vcpu);
@@ -367,6 +372,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
367{ 372{
368 struct kvm_vcpu *vcpu; 373 struct kvm_vcpu *vcpu;
369 struct vgic_irq *irq; 374 struct vgic_irq *irq;
375 unsigned long flags;
370 int ret; 376 int ret;
371 377
372 trace_vgic_update_irq_pending(cpuid, intid, level); 378 trace_vgic_update_irq_pending(cpuid, intid, level);
@@ -383,11 +389,11 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
383 if (!irq) 389 if (!irq)
384 return -EINVAL; 390 return -EINVAL;
385 391
386 spin_lock(&irq->irq_lock); 392 spin_lock_irqsave(&irq->irq_lock, flags);
387 393
388 if (!vgic_validate_injection(irq, level, owner)) { 394 if (!vgic_validate_injection(irq, level, owner)) {
389 /* Nothing to see here, move along... */ 395 /* Nothing to see here, move along... */
390 spin_unlock(&irq->irq_lock); 396 spin_unlock_irqrestore(&irq->irq_lock, flags);
391 vgic_put_irq(kvm, irq); 397 vgic_put_irq(kvm, irq);
392 return 0; 398 return 0;
393 } 399 }
@@ -397,7 +403,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
397 else 403 else
398 irq->pending_latch = true; 404 irq->pending_latch = true;
399 405
400 vgic_queue_irq_unlock(kvm, irq); 406 vgic_queue_irq_unlock(kvm, irq, flags);
401 vgic_put_irq(kvm, irq); 407 vgic_put_irq(kvm, irq);
402 408
403 return 0; 409 return 0;
@@ -406,15 +412,16 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
406int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) 412int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
407{ 413{
408 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); 414 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
415 unsigned long flags;
409 416
410 BUG_ON(!irq); 417 BUG_ON(!irq);
411 418
412 spin_lock(&irq->irq_lock); 419 spin_lock_irqsave(&irq->irq_lock, flags);
413 420
414 irq->hw = true; 421 irq->hw = true;
415 irq->hwintid = phys_irq; 422 irq->hwintid = phys_irq;
416 423
417 spin_unlock(&irq->irq_lock); 424 spin_unlock_irqrestore(&irq->irq_lock, flags);
418 vgic_put_irq(vcpu->kvm, irq); 425 vgic_put_irq(vcpu->kvm, irq);
419 426
420 return 0; 427 return 0;
@@ -423,6 +430,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
423int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) 430int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
424{ 431{
425 struct vgic_irq *irq; 432 struct vgic_irq *irq;
433 unsigned long flags;
426 434
427 if (!vgic_initialized(vcpu->kvm)) 435 if (!vgic_initialized(vcpu->kvm))
428 return -EAGAIN; 436 return -EAGAIN;
@@ -430,12 +438,12 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
430 irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); 438 irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
431 BUG_ON(!irq); 439 BUG_ON(!irq);
432 440
433 spin_lock(&irq->irq_lock); 441 spin_lock_irqsave(&irq->irq_lock, flags);
434 442
435 irq->hw = false; 443 irq->hw = false;
436 irq->hwintid = 0; 444 irq->hwintid = 0;
437 445
438 spin_unlock(&irq->irq_lock); 446 spin_unlock_irqrestore(&irq->irq_lock, flags);
439 vgic_put_irq(vcpu->kvm, irq); 447 vgic_put_irq(vcpu->kvm, irq);
440 448
441 return 0; 449 return 0;
@@ -486,9 +494,10 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
486{ 494{
487 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 495 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
488 struct vgic_irq *irq, *tmp; 496 struct vgic_irq *irq, *tmp;
497 unsigned long flags;
489 498
490retry: 499retry:
491 spin_lock(&vgic_cpu->ap_list_lock); 500 spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
492 501
493 list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { 502 list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
494 struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; 503 struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
@@ -528,7 +537,7 @@ retry:
528 /* This interrupt looks like it has to be migrated. */ 537 /* This interrupt looks like it has to be migrated. */
529 538
530 spin_unlock(&irq->irq_lock); 539 spin_unlock(&irq->irq_lock);
531 spin_unlock(&vgic_cpu->ap_list_lock); 540 spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
532 541
533 /* 542 /*
534 * Ensure locking order by always locking the smallest 543 * Ensure locking order by always locking the smallest
@@ -542,7 +551,7 @@ retry:
542 vcpuB = vcpu; 551 vcpuB = vcpu;
543 } 552 }
544 553
545 spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); 554 spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
546 spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, 555 spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
547 SINGLE_DEPTH_NESTING); 556 SINGLE_DEPTH_NESTING);
548 spin_lock(&irq->irq_lock); 557 spin_lock(&irq->irq_lock);
@@ -566,11 +575,11 @@ retry:
566 575
567 spin_unlock(&irq->irq_lock); 576 spin_unlock(&irq->irq_lock);
568 spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); 577 spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
569 spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); 578 spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags);
570 goto retry; 579 goto retry;
571 } 580 }
572 581
573 spin_unlock(&vgic_cpu->ap_list_lock); 582 spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
574} 583}
575 584
576static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) 585static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -703,6 +712,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
703 if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) 712 if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
704 return; 713 return;
705 714
715 DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
716
706 spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); 717 spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
707 vgic_flush_lr_state(vcpu); 718 vgic_flush_lr_state(vcpu);
708 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); 719 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
@@ -735,11 +746,12 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
735 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 746 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
736 struct vgic_irq *irq; 747 struct vgic_irq *irq;
737 bool pending = false; 748 bool pending = false;
749 unsigned long flags;
738 750
739 if (!vcpu->kvm->arch.vgic.enabled) 751 if (!vcpu->kvm->arch.vgic.enabled)
740 return false; 752 return false;
741 753
742 spin_lock(&vgic_cpu->ap_list_lock); 754 spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
743 755
744 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { 756 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
745 spin_lock(&irq->irq_lock); 757 spin_lock(&irq->irq_lock);
@@ -750,7 +762,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
750 break; 762 break;
751 } 763 }
752 764
753 spin_unlock(&vgic_cpu->ap_list_lock); 765 spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
754 766
755 return pending; 767 return pending;
756} 768}
@@ -776,10 +788,14 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
776{ 788{
777 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); 789 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
778 bool map_is_active; 790 bool map_is_active;
791 unsigned long flags;
779 792
780 spin_lock(&irq->irq_lock); 793 if (!vgic_initialized(vcpu->kvm))
794 return false;
795
796 spin_lock_irqsave(&irq->irq_lock, flags);
781 map_is_active = irq->hw && irq->active; 797 map_is_active = irq->hw && irq->active;
782 spin_unlock(&irq->irq_lock); 798 spin_unlock_irqrestore(&irq->irq_lock, flags);
783 vgic_put_irq(vcpu->kvm, irq); 799 vgic_put_irq(vcpu->kvm, irq);
784 800
785 return map_is_active; 801 return map_is_active;
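
The new comment at the top of vgic.c spells out the motivation for this file's changes: interrupt injection must be possible from ISR context, so the outer locks are taken with the irqsave variants. As a hedged sketch of the kind of caller this enables, a host hard-IRQ handler could forward a hardware interrupt straight into the guest. The handler name, cookie layout and interrupt numbers below are illustrative, and the trailing level/owner parameters of kvm_vgic_inject_irq() are assumed from context rather than visible in the hunks above.

static irqreturn_t example_forwarded_isr(int host_irq, void *cookie)
{
	struct kvm *kvm = cookie;

	/* Safe in hard-IRQ context now that the irq_lock/ap_list_lock paths
	 * use spin_lock_irqsave()/spin_unlock_irqrestore(). */
	kvm_vgic_inject_irq(kvm, 0 /* cpuid */, 40 /* SPI intid */,
			    true /* level */, NULL /* owner */);
	return IRQ_HANDLED;
}

Note also that kvm_vgic_flush_hwstate() now asserts irqs_disabled() via DEBUG_SPINLOCK_BUG_ON(), so its caller is expected to run the flush with interrupts already off.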
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index bf9ceab67c77..4f8aecb07ae6 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -140,7 +140,8 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
140struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, 140struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
141 u32 intid); 141 u32 intid);
142void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); 142void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
143bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); 143bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
144 unsigned long flags);
144void vgic_kick_vcpus(struct kvm *kvm); 145void vgic_kick_vcpus(struct kvm *kvm);
145 146
146int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, 147int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ce507ae1d4f5..2dd1a9ca4599 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -122,7 +122,6 @@ static void hardware_disable_all(void);
122 122
123static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 123static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
124 124
125static void kvm_release_pfn_dirty(kvm_pfn_t pfn);
126static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 125static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
127 126
128__visible bool kvm_rebooting; 127__visible bool kvm_rebooting;
@@ -1679,11 +1678,12 @@ void kvm_release_page_dirty(struct page *page)
1679} 1678}
1680EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1679EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1681 1680
1682static void kvm_release_pfn_dirty(kvm_pfn_t pfn) 1681void kvm_release_pfn_dirty(kvm_pfn_t pfn)
1683{ 1682{
1684 kvm_set_pfn_dirty(pfn); 1683 kvm_set_pfn_dirty(pfn);
1685 kvm_release_pfn_clean(pfn); 1684 kvm_release_pfn_clean(pfn);
1686} 1685}
1686EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1687 1687
1688void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1688void kvm_set_pfn_dirty(kvm_pfn_t pfn)
1689{ 1689{
@@ -4010,7 +4010,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
4010 if (!vcpu_align) 4010 if (!vcpu_align)
4011 vcpu_align = __alignof__(struct kvm_vcpu); 4011 vcpu_align = __alignof__(struct kvm_vcpu);
4012 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 4012 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
4013 0, NULL); 4013 SLAB_ACCOUNT, NULL);
4014 if (!kvm_vcpu_cache) { 4014 if (!kvm_vcpu_cache) {
4015 r = -ENOMEM; 4015 r = -ENOMEM;
4016 goto out_free_3; 4016 goto out_free_3;
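
Two independent kvm_main.c changes close the series: kvm_release_pfn_dirty() loses its static qualifier and gains an EXPORT_SYMBOL_GPL so arch modules can call it, and the vCPU slab cache is created with SLAB_ACCOUNT so vCPU allocations are charged to the allocating task's memory cgroup. A hedged, module-side sketch of the newly exported helper in use; the function name, gfn handling and error check are placeholders, not taken from the patch.

static void example_touch_guest_page(struct kvm *kvm, gfn_t gfn)
{
	kvm_pfn_t pfn = gfn_to_pfn(kvm, gfn);

	if (is_error_noslot_pfn(pfn))
		return;

	/* ... modify the backing page ... */

	/* Now callable from modules: kvm_set_pfn_dirty() + kvm_release_pfn_clean(). */
	kvm_release_pfn_dirty(pfn);
}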