author     Linus Torvalds <torvalds@linux-foundation.org>  2017-07-06 21:38:31 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-07-06 21:38:31 -0400
commit     c136b84393d4e340e1b53fc7f737dd5827b19ee5 (patch)
tree       985a1bdfafe7ec5ce2d3c738f601cad3998d8ce9
parent     e0f25a3f2d052e36ff67a9b4db835c3e27e950d8 (diff)
parent     1372324b328cd5dabaef5e345e37ad48c63df2a9 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "PPC:
   - Better machine check handling for HV KVM
   - Ability to support guests with threads=2, 4 or 8 on POWER9
   - Fix for a race that could cause delayed recognition of signals
   - Fix for a bug where POWER9 guests could sleep with interrupts pending

  ARM:
   - VCPU request overhaul
   - allow timer and PMU to have their interrupt number selected from userspace
   - workaround for Cavium erratum 30115
   - handling of memory poisoning
   - the usual crop of fixes and cleanups

  s390:
   - initial machine check forwarding
   - migration support for the CMMA page hinting information
   - cleanups and fixes

  x86:
   - nested VMX bugfixes and improvements
   - more reliable NMI window detection on AMD
   - APIC timer optimizations

  Generic:
   - VCPU request overhaul + documentation of common code patterns
   - kvm_stat improvements"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (124 commits)
  Update my email address
  kvm: vmx: allow host to access guest MSR_IA32_BNDCFGS
  x86: kvm: mmu: use ept a/d in vmcs02 iff used in vmcs12
  kvm: x86: mmu: allow A/D bits to be disabled in an mmu
  x86: kvm: mmu: make spte mmio mask more explicit
  x86: kvm: mmu: dead code thanks to access tracking
  KVM: PPC: Book3S: Fix typo in XICS-on-XIVE state saving code
  KVM: PPC: Book3S HV: Close race with testing for signals on guest entry
  KVM: PPC: Book3S HV: Simplify dynamic micro-threading code
  KVM: x86: remove ignored type attribute
  KVM: LAPIC: Fix lapic timer injection delay
  KVM: lapic: reorganize restart_apic_timer
  KVM: lapic: reorganize start_hv_timer
  kvm: nVMX: Check memory operand to INVVPID
  KVM: s390: Inject machine check into the nested guest
  KVM: s390: Inject machine check into the guest
  tools/kvm_stat: add new interactive command 'b'
  tools/kvm_stat: add new command line switch '-i'
  tools/kvm_stat: fix error on interactive command 'g'
  KVM: SVM: suppress unnecessary NMI singlestep on GIF=0 and nested exit
  ...
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 12
-rw-r--r--  Documentation/arm64/silicon-errata.txt | 1
-rw-r--r--  Documentation/virtual/kvm/api.txt | 172
-rw-r--r--  Documentation/virtual/kvm/devices/s390_flic.txt | 15
-rw-r--r--  Documentation/virtual/kvm/devices/vcpu.txt | 41
-rw-r--r--  Documentation/virtual/kvm/devices/vm.txt | 33
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 4
-rw-r--r--  Documentation/virtual/kvm/vcpu-requests.rst | 307
-rw-r--r--  MAINTAINERS | 6
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 28
-rw-r--r--  arch/arm/include/uapi/asm/kvm.h | 8
-rw-r--r--  arch/arm/kvm/guest.c | 51
-rw-r--r--  arch/arm/kvm/handle_exit.c | 1
-rw-r--r--  arch/arm/kvm/hyp/switch.c | 2
-rw-r--r--  arch/arm/kvm/reset.c | 16
-rw-r--r--  arch/arm64/Kconfig | 11
-rw-r--r--  arch/arm64/include/asm/arch_gicv3.h | 2
-rw-r--r--  arch/arm64/include/asm/cpucaps.h | 3
-rw-r--r--  arch/arm64/include/asm/cputype.h | 2
-rw-r--r--  arch/arm64/include/asm/esr.h | 24
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 6
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 1
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 23
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h | 3
-rw-r--r--  arch/arm64/kernel/cpu_errata.c | 21
-rw-r--r--  arch/arm64/kvm/guest.c | 9
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 1
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 15
-rw-r--r--  arch/arm64/kvm/reset.c | 16
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 27
-rw-r--r--  arch/arm64/kvm/vgic-sys-reg-v3.c | 45
-rw-r--r--  arch/mips/kvm/trap_emul.c | 2
-rw-r--r--  arch/mips/kvm/vz.c | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 1
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 13
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 2
-rw-r--r--  arch/powerpc/include/asm/ppc-opcode.h | 2
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 6
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 3
-rw-r--r--  arch/powerpc/kernel/mce.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 511
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 2
-rw-r--r--  arch/powerpc/kvm/book3s_hv_interrupts.S | 8
-rw-r--r--  arch/powerpc/kvm/book3s_hv_ras.c | 18
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 165
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c | 4
-rw-r--r--  arch/powerpc/kvm/booke.c | 2
-rw-r--r--  arch/powerpc/kvm/emulate.c | 4
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 45
-rw-r--r--  arch/s390/include/asm/ctl_reg.h | 4
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 33
-rw-r--r--  arch/s390/include/asm/nmi.h | 6
-rw-r--r--  arch/s390/include/uapi/asm/kvm.h | 12
-rw-r--r--  arch/s390/kvm/gaccess.c | 43
-rw-r--r--  arch/s390/kvm/interrupt.c | 91
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 373
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 2
-rw-r--r--  arch/s390/kvm/priv.c | 103
-rw-r--r--  arch/s390/kvm/vsie.c | 25
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 50
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/x86/kvm/cpuid.h | 8
-rw-r--r--  arch/x86/kvm/emulate.c | 84
-rw-r--r--  arch/x86/kvm/lapic.c | 116
-rw-r--r--  arch/x86/kvm/lapic.h | 2
-rw-r--r--  arch/x86/kvm/mmu.c | 155
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/mmutrace.h | 6
-rw-r--r--  arch/x86/kvm/svm.c | 95
-rw-r--r--  arch/x86/kvm/vmx.c | 83
-rw-r--r--  arch/x86/kvm/x86.c | 14
-rw-r--r--  include/kvm/arm_arch_timer.h | 8
-rw-r--r--  include/kvm/arm_pmu.h | 6
-rw-r--r--  include/kvm/arm_vgic.h | 14
-rw-r--r--  include/linux/irqchip/arm-gic-v3.h | 6
-rw-r--r--  include/linux/kvm_host.h | 12
-rw-r--r--  include/uapi/linux/kvm.h | 35
-rwxr-xr-x  tools/kvm/kvm_stat/kvm_stat | 669
-rw-r--r--  tools/kvm/kvm_stat/kvm_stat.txt | 12
-rw-r--r--  virt/kvm/arm/aarch32.c | 2
-rw-r--r--  virt/kvm/arm/arch_timer.c | 139
-rw-r--r--  virt/kvm/arm/arm.c | 82
-rw-r--r--  virt/kvm/arm/hyp/vgic-v3-sr.c | 823
-rw-r--r--  virt/kvm/arm/mmu.c | 23
-rw-r--r--  virt/kvm/arm/pmu.c | 117
-rw-r--r--  virt/kvm/arm/psci.c | 8
-rw-r--r--  virt/kvm/arm/vgic/vgic-irqfd.c | 2
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v2.c | 24
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v3.c | 22
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.c | 68
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.h | 12
-rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c | 45
-rw-r--r--  virt/kvm/arm/vgic/vgic.c | 68
-rw-r--r--  virt/kvm/kvm_main.c | 12
95 files changed, 4250 insertions, 967 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f24ee1c99412..aa1d4409fe0a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1862,6 +1862,18 @@
1862 for all guests. 1862 for all guests.
1863 Default is 1 (enabled) if in 64-bit or 32-bit PAE mode. 1863 Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
1864 1864
1865 kvm-arm.vgic_v3_group0_trap=
1866 [KVM,ARM] Trap guest accesses to GICv3 group-0
1867 system registers
1868
1869 kvm-arm.vgic_v3_group1_trap=
1870 [KVM,ARM] Trap guest accesses to GICv3 group-1
1871 system registers
1872
1873 kvm-arm.vgic_v3_common_trap=
1874 [KVM,ARM] Trap guest accesses to GICv3 common
1875 system registers
1876
1865 kvm-intel.ept= [KVM,Intel] Disable extended page tables 1877 kvm-intel.ept= [KVM,Intel] Disable extended page tables
1866 (virtualized MMU) support on capable Intel chips. 1878 (virtualized MMU) support on capable Intel chips.
1867 Default is 1 (enabled) 1879 Default is 1 (enabled)
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt
index 10f2dddbf449..f5f93dca54b7 100644
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -62,6 +62,7 @@ stable kernels.
62| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | 62| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
63| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | 63| Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 |
64| Cavium | ThunderX SMMUv2 | #27704 | N/A | 64| Cavium | ThunderX SMMUv2 | #27704 | N/A |
65| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 |
65| | | | | 66| | | | |
66| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | 67| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
67| | | | | 68| | | | |
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4029943887a3..3a9831b72945 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3255,6 +3255,141 @@ Otherwise, if the MCE is a corrected error, KVM will just
3255store it in the corresponding bank (provided this bank is 3255store it in the corresponding bank (provided this bank is
3256not holding a previously reported uncorrected error). 3256not holding a previously reported uncorrected error).
3257 3257
32584.107 KVM_S390_GET_CMMA_BITS
3259
3260Capability: KVM_CAP_S390_CMMA_MIGRATION
3261Architectures: s390
3262Type: vm ioctl
3263Parameters: struct kvm_s390_cmma_log (in, out)
3264Returns: 0 on success, a negative value on error
3265
3266This ioctl is used to get the values of the CMMA bits on the s390
3267architecture. It is meant to be used in two scenarios:
3268- During live migration to save the CMMA values. Live migration needs
3269 to be enabled first via the KVM_S390_VM_MIGRATION_START VM attribute.
3270- To non-destructively peek at the CMMA values, with the flag
3271 KVM_S390_CMMA_PEEK set.
3272
3273The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired
3274values are written to a buffer whose location is indicated via the "values"
3275member in the kvm_s390_cmma_log struct. The values in the input struct are
3276also updated as needed.
3277Each CMMA value takes up one byte.
3278
3279struct kvm_s390_cmma_log {
3280 __u64 start_gfn;
3281 __u32 count;
3282 __u32 flags;
3283 union {
3284 __u64 remaining;
3285 __u64 mask;
3286 };
3287 __u64 values;
3288};
3289
3290start_gfn is the number of the first guest frame whose CMMA values are
3291to be retrieved,
3292
3293count is the length of the buffer in bytes,
3294
3295values points to the buffer where the result will be written to.
3296
3297If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be
3298KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with
3299other ioctls.
3300
3301The result is written in the buffer pointed to by the field values, and
3302the values of the input parameter are updated as follows.
3303
3304Depending on the flags, different actions are performed. The only
3305supported flag so far is KVM_S390_CMMA_PEEK.
3306
3307The default behaviour if KVM_S390_CMMA_PEEK is not set is:
3308start_gfn will indicate the first page frame whose CMMA bits were dirty.
3309It is not necessarily the same as the one passed as input, as clean pages
3310are skipped.
3311
3312count will indicate the number of bytes actually written in the buffer.
3313It can (and very often will) be smaller than the input value, since the
3314buffer is only filled until 16 bytes of clean values are found (which
3315are then not copied in the buffer). Since a CMMA migration block needs
3316the base address and the length, for a total of 16 bytes, we will send
3317back some clean data if there is some dirty data afterwards, as long as
3318the size of the clean data does not exceed the size of the header. This
3319allows minimizing the amount of data to be saved or transferred over
3320the network at the expense of more roundtrips to userspace. The next
3321invocation of the ioctl will skip over all the clean values, saving
3322potentially more than just the 16 bytes we found.
3323
3324If KVM_S390_CMMA_PEEK is set:
3325the existing storage attributes are read even when not in migration
3326mode, and no other action is performed;
3327
3328the output start_gfn will be equal to the input start_gfn,
3329
3330the output count will be equal to the input count, except if the end of
3331memory has been reached.
3332
3333In both cases:
3334the field "remaining" will indicate the total number of dirty CMMA values
3335still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is
3336not enabled.
3337
3338mask is unused.
3339
3340values points to the userspace buffer where the result will be stored.
3341
3342This ioctl can fail with -ENOMEM if not enough memory can be allocated to
3343complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
3344KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
3345-EFAULT if the userspace address is invalid or if no page table is
3346present for the addresses (e.g. when using hugepages).
3347
33484.108 KVM_S390_SET_CMMA_BITS
3349
3350Capability: KVM_CAP_S390_CMMA_MIGRATION
3351Architectures: s390
3352Type: vm ioctl
3353Parameters: struct kvm_s390_cmma_log (in)
3354Returns: 0 on success, a negative value on error
3355
3356This ioctl is used to set the values of the CMMA bits on the s390
3357architecture. It is meant to be used during live migration to restore
3358the CMMA values, but there are no restrictions on its use.
3359The ioctl takes parameters via the kvm_s390_cmma_values struct.
3360Each CMMA value takes up one byte.
3361
3362struct kvm_s390_cmma_log {
3363 __u64 start_gfn;
3364 __u32 count;
3365 __u32 flags;
3366 union {
3367 __u64 remaining;
3368 __u64 mask;
3369 };
3370 __u64 values;
3371};
3372
3373start_gfn indicates the starting guest frame number,
3374
3375count indicates how many values are to be considered in the buffer,
3376
3377flags is not used and must be 0.
3378
3379mask indicates which PGSTE bits are to be considered.
3380
3381remaining is not used.
3382
3383values points to the buffer in userspace where to store the values.
3384
3385This ioctl can fail with -ENOMEM if not enough memory can be allocated to
3386complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
3387the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or
3388if the flags field was not 0, with -EFAULT if the userspace address is
3389invalid, if invalid pages are written to (e.g. after the end of memory)
3390or if no page table is present for the addresses (e.g. when using
3391hugepages).
3392
32585. The kvm_run structure 33935. The kvm_run structure
3259------------------------ 3394------------------------
3260 3395
@@ -3996,6 +4131,34 @@ Parameters: none
3996Allow use of adapter-interruption suppression. 4131Allow use of adapter-interruption suppression.
3997Returns: 0 on success; -EBUSY if a VCPU has already been created. 4132Returns: 0 on success; -EBUSY if a VCPU has already been created.
3998 4133
41347.11 KVM_CAP_PPC_SMT
4135
4136Architectures: ppc
4137Parameters: vsmt_mode, flags
4138
4139Enabling this capability on a VM provides userspace with a way to set
4140the desired virtual SMT mode (i.e. the number of virtual CPUs per
4141virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2
4142between 1 and 8. On POWER8, vsmt_mode must also be no greater than
4143the number of threads per subcore for the host. Currently flags must
4144be 0. A successful call to enable this capability will result in
4145vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is
4146subsequently queried for the VM. This capability is only supported by
4147HV KVM, and can only be set before any VCPUs have been created.
4148The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT
4149modes are available.
4150
41517.12 KVM_CAP_PPC_FWNMI
4152
4153Architectures: ppc
4154Parameters: none
4155
4156With this capability a machine check exception in the guest address
4157space will cause KVM to exit the guest with an NMI exit reason. This
4158enables QEMU to build an error log and branch to the guest kernel's
4159registered machine check handling routine. Without this capability KVM
4160will branch to the guest's 0x200 interrupt vector.
4161
39998. Other capabilities. 41628. Other capabilities.
4000---------------------- 4163----------------------
4001 4164
@@ -4157,3 +4320,12 @@ Currently the following bits are defined for the device_irq_level bitmap:
4157Future versions of kvm may implement additional events. These will get 4320Future versions of kvm may implement additional events. These will get
4158indicated by returning a higher number from KVM_CHECK_EXTENSION and will be 4321indicated by returning a higher number from KVM_CHECK_EXTENSION and will be
4159listed above. 4322listed above.
4323
43248.10 KVM_CAP_PPC_SMT_POSSIBLE
4325
4326Architectures: ppc
4327
4328Querying this capability returns a bitmap indicating the possible
4329virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N
4330(counting from the right) is set, then a virtual SMT mode of 2^N is
4331available.
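A rough userspace sketch of the KVM_S390_GET_CMMA_BITS flow documented above; the vm_fd descriptor, the 4 KiB buffer and the bare-bones error handling are assumptions of the example, not part of the API:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: walk all dirty CMMA values once migration mode is active. */
    static int save_cmma(int vm_fd)
    {
            __u8 buf[4096];
            struct kvm_s390_cmma_log log;
            __u64 next_gfn = 0;

            do {
                    memset(&log, 0, sizeof(log));
                    log.start_gfn = next_gfn;
                    log.count = sizeof(buf);        /* capped at KVM_S390_SKEYS_MAX */
                    log.values = (__u64)(unsigned long)buf;

                    if (ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log) < 0)
                            return -1;

                    /* buf[0..log.count-1] now holds the values for the frames
                     * starting at the (possibly advanced) log.start_gfn. */
                    next_gfn = log.start_gfn + log.count;
            } while (log.remaining);

            return 0;
    }

Setting KVM_S390_CMMA_PEEK in log.flags would instead read the attributes non-destructively, without requiring migration mode.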
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
index c2518cea8ab4..2f1cbf1301d2 100644
--- a/Documentation/virtual/kvm/devices/s390_flic.txt
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -16,6 +16,7 @@ FLIC provides support to
16- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) 16- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
17- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) 17- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM)
18- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) 18- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT)
19- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL)
19 20
20Groups: 21Groups:
21 KVM_DEV_FLIC_ENQUEUE 22 KVM_DEV_FLIC_ENQUEUE
@@ -136,6 +137,20 @@ struct kvm_s390_ais_req {
136 an isc according to the adapter-interruption-suppression mode on condition 137 an isc according to the adapter-interruption-suppression mode on condition
137 that the AIS capability is enabled. 138 that the AIS capability is enabled.
138 139
140 KVM_DEV_FLIC_AISM_ALL
141 Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes
142 a kvm_s390_ais_all describing:
143
144struct kvm_s390_ais_all {
145 __u8 simm; /* Single-Interruption-Mode mask */
146 __u8 nimm; /* No-Interruption-Mode mask */
147};
148
149 simm contains Single-Interruption-Mode mask for all ISCs, nimm contains
150 No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds
151 to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of the simm and
152 nimm bits gives the AIS mode for an ISC.
153
139Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on 154Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on
140FLIC with an unknown group or attribute gives the error code EINVAL (instead of 155FLIC with an unknown group or attribute gives the error code EINVAL (instead of
141ENXIO, as specified in the API documentation). It is not possible to conclude 156ENXIO, as specified in the API documentation). It is not possible to conclude
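A minimal sketch of reading this attribute group, assuming flic_fd is the file descriptor returned by KVM_CREATE_DEVICE for the FLIC (the helper name is ours):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: dump the AIS masks covering all eight ISCs. */
    static void dump_ais_all(int flic_fd)
    {
            struct kvm_s390_ais_all ais = {};
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_FLIC_AISM_ALL,
                    .addr  = (__u64)(unsigned long)&ais,
            };

            if (ioctl(flic_fd, KVM_GET_DEVICE_ATTR, &attr) == 0)
                    printf("simm=%#x nimm=%#x\n", ais.simm, ais.nimm);
    }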
diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt
index 02f50686c418..2b5dab16c4f2 100644
--- a/Documentation/virtual/kvm/devices/vcpu.txt
+++ b/Documentation/virtual/kvm/devices/vcpu.txt
@@ -16,7 +16,9 @@ Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a
16Returns: -EBUSY: The PMU overflow interrupt is already set 16Returns: -EBUSY: The PMU overflow interrupt is already set
17 -ENXIO: The overflow interrupt not set when attempting to get it 17 -ENXIO: The overflow interrupt not set when attempting to get it
18 -ENODEV: PMUv3 not supported 18 -ENODEV: PMUv3 not supported
19 -EINVAL: Invalid PMU overflow interrupt number supplied 19 -EINVAL: Invalid PMU overflow interrupt number supplied or
20 trying to set the IRQ number without using an in-kernel
21 irqchip.
20 22
21A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt 23A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt
22number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt 24number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt
@@ -25,11 +27,36 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
25 27
261.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT 281.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT
27Parameters: no additional parameter in kvm_device_attr.addr 29Parameters: no additional parameter in kvm_device_attr.addr
28Returns: -ENODEV: PMUv3 not supported 30Returns: -ENODEV: PMUv3 not supported or GIC not initialized
29 -ENXIO: PMUv3 not properly configured as required prior to calling this 31 -ENXIO: PMUv3 not properly configured or in-kernel irqchip not
30 attribute 32 configured as required prior to calling this attribute
31 -EBUSY: PMUv3 already initialized 33 -EBUSY: PMUv3 already initialized
32 34
33Request the initialization of the PMUv3. This must be done after creating the 35Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel
34in-kernel irqchip. Creating a PMU with a userspace irqchip is currently not 36virtual GIC implementation, this must be done after initializing the in-kernel
35supported. 37irqchip.
38
39
402. GROUP: KVM_ARM_VCPU_TIMER_CTRL
41Architectures: ARM,ARM64
42
432.1. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER
442.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER
45Parameters: in kvm_device_attr.addr the address for the timer interrupt is a
46 pointer to an int
47Returns: -EINVAL: Invalid timer interrupt number
48 -EBUSY: One or more VCPUs has already run
49
50A value describing the architected timer interrupt number when connected to an
51in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the
52attribute overrides the default values (see below).
53
54KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27)
55KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30)
56
57Setting the same PPI for different timers will prevent the VCPUs from running.
58Setting the interrupt number on a VCPU configures all VCPUs created at that
59time to use the number provided for a given timer, overwriting any previously
60configured values on other VCPUs. Userspace should configure the interrupt
61numbers on at least one VCPU after creating all VCPUs and before running any
62VCPUs.
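A hedged userspace sketch of selecting a non-default virtual timer PPI via the group above; vcpu_fd and the chosen intid of 28 are assumptions of the example:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: move the EL1 virtual timer to PPI 28 before any VCPU runs. */
    static void set_vtimer_irq(int vcpu_fd)
    {
            int intid = 28;                 /* must satisfy 16 <= intid < 32 */
            struct kvm_device_attr attr = {
                    .group = KVM_ARM_VCPU_TIMER_CTRL,
                    .attr  = KVM_ARM_VCPU_TIMER_IRQ_VTIMER,
                    .addr  = (__u64)(unsigned long)&intid,
            };

            if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                    perror("KVM_ARM_VCPU_TIMER_IRQ_VTIMER");
    }

As described above, doing this once, after all VCPUs are created and before any of them run, propagates the chosen number to every VCPU.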
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index 575ccb022aac..903fc926860b 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -222,3 +222,36 @@ Allows user space to disable dea key wrapping, clearing the wrapping key.
222 222
223Parameters: none 223Parameters: none
224Returns: 0 224Returns: 0
225
2265. GROUP: KVM_S390_VM_MIGRATION
227Architectures: s390
228
2295.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o)
230
231Allows userspace to stop migration mode, needed for PGSTE migration.
232Setting this attribute when migration mode is not active will have no
233effect.
234
235Parameters: none
236Returns: 0
237
2385.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o)
239
240Allows userspace to start migration mode, needed for PGSTE migration.
241Setting this attribute when migration mode is already active will have
242no effect.
243
244Parameters: none
245Returns: -ENOMEM if there is not enough free memory to start migration mode
246 -EINVAL if the state of the VM is invalid (e.g. no memory defined)
247 0 in case of success.
248
2495.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o)
250
251Allows userspace to query the status of migration mode.
252
253Parameters: address of a buffer in user space to store the data (u64) to;
254 the data itself is either 0 if migration mode is disabled or 1
255 if it is enabled
256Returns: -EFAULT if the given address is not accessible from kernel space
257 0 in case of success.
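For illustration only, starting migration mode and reading back its status might look like the following sketch (vm_fd is the VM file descriptor; error handling is omitted):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: enter migration mode, then query whether it is enabled. */
    static void start_and_check_migration(int vm_fd)
    {
            __u64 state = 0;
            struct kvm_device_attr start = {
                    .group = KVM_S390_VM_MIGRATION,
                    .attr  = KVM_S390_VM_MIGRATION_START,
            };
            struct kvm_device_attr status = {
                    .group = KVM_S390_VM_MIGRATION,
                    .attr  = KVM_S390_VM_MIGRATION_STATUS,
                    .addr  = (__u64)(unsigned long)&state,
            };

            ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &start);
            ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &status);
            printf("migration mode: %s\n", state ? "enabled" : "disabled");
    }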
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 481b6a9c25d5..f50d45b1e967 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -179,6 +179,10 @@ Shadow pages contain the following information:
179 shadow page; it is also used to go back from a struct kvm_mmu_page 179 shadow page; it is also used to go back from a struct kvm_mmu_page
180 to a memslot, through the kvm_memslots_for_spte_role macro and 180 to a memslot, through the kvm_memslots_for_spte_role macro and
181 __gfn_to_memslot. 181 __gfn_to_memslot.
182 role.ad_disabled:
183 Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D
184 bits before Haswell; shadow EPT page tables also cannot use A/D bits
185 if the L1 hypervisor does not enable them.
182 gfn: 186 gfn:
183 Either the guest page table containing the translations shadowed by this 187 Either the guest page table containing the translations shadowed by this
184 page, or the base page frame for linear translations. See role.direct. 188 page, or the base page frame for linear translations. See role.direct.
diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virtual/kvm/vcpu-requests.rst
new file mode 100644
index 000000000000..5feb3706a7ae
--- /dev/null
+++ b/Documentation/virtual/kvm/vcpu-requests.rst
@@ -0,0 +1,307 @@
1=================
2KVM VCPU Requests
3=================
4
5Overview
6========
7
8KVM supports an internal API enabling threads to request a VCPU thread to
9perform some activity. For example, a thread may request a VCPU to flush
10its TLB with a VCPU request. The API consists of the following functions::
11
12 /* Check if any requests are pending for VCPU @vcpu. */
13 bool kvm_request_pending(struct kvm_vcpu *vcpu);
14
15 /* Check if VCPU @vcpu has request @req pending. */
16 bool kvm_test_request(int req, struct kvm_vcpu *vcpu);
17
18 /* Clear request @req for VCPU @vcpu. */
19 void kvm_clear_request(int req, struct kvm_vcpu *vcpu);
20
21 /*
22 * Check if VCPU @vcpu has request @req pending. When the request is
23 * pending it will be cleared and a memory barrier, which pairs with
24 * another in kvm_make_request(), will be issued.
25 */
26 bool kvm_check_request(int req, struct kvm_vcpu *vcpu);
27
28 /*
29 * Make request @req of VCPU @vcpu. Issues a memory barrier, which pairs
30 * with another in kvm_check_request(), prior to setting the request.
31 */
32 void kvm_make_request(int req, struct kvm_vcpu *vcpu);
33
34 /* Make request @req of all VCPUs of the VM with struct kvm @kvm. */
35 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
36
37Typically a requester wants the VCPU to perform the activity as soon
38as possible after making the request. This means most requests
39(kvm_make_request() calls) are followed by a call to kvm_vcpu_kick(),
40and kvm_make_all_cpus_request() has the kicking of all VCPUs built
41into it.
42
43VCPU Kicks
44----------
45
46The goal of a VCPU kick is to bring a VCPU thread out of guest mode in
47order to perform some KVM maintenance. To do so, an IPI is sent, forcing
48a guest mode exit. However, a VCPU thread may not be in guest mode at the
49time of the kick. Therefore, depending on the mode and state of the VCPU
50thread, there are two other actions a kick may take. All three actions
51are listed below:
52
531) Send an IPI. This forces a guest mode exit.
542) Wake a sleeping VCPU. Sleeping VCPUs are VCPU threads outside guest
55 mode that wait on waitqueues. Waking them removes the threads from
56 the waitqueues, allowing the threads to run again. This behavior
57 may be suppressed, see KVM_REQUEST_NO_WAKEUP below.
583) Nothing. When the VCPU is not in guest mode and the VCPU thread is not
59 sleeping, then there is nothing to do.
60
61VCPU Mode
62---------
63
64VCPUs have a mode state, ``vcpu->mode``, that is used to track whether the
65guest is running in guest mode or not, as well as some specific
66outside guest mode states. The architecture may use ``vcpu->mode`` to
67ensure VCPU requests are seen by VCPUs (see "Ensuring Requests Are Seen"),
68as well as to avoid sending unnecessary IPIs (see "IPI Reduction"), and
69even to ensure IPI acknowledgements are waited upon (see "Waiting for
70Acknowledgements"). The following modes are defined:
71
72OUTSIDE_GUEST_MODE
73
74 The VCPU thread is outside guest mode.
75
76IN_GUEST_MODE
77
78 The VCPU thread is in guest mode.
79
80EXITING_GUEST_MODE
81
82 The VCPU thread is transitioning from IN_GUEST_MODE to
83 OUTSIDE_GUEST_MODE.
84
85READING_SHADOW_PAGE_TABLES
86
87 The VCPU thread is outside guest mode, but it wants the sender of
88 certain VCPU requests, namely KVM_REQ_TLB_FLUSH, to wait until the VCPU
89 thread is done reading the page tables.
90
91VCPU Request Internals
92======================
93
94VCPU requests are simply bit indices of the ``vcpu->requests`` bitmap.
95This means general bitops, like those documented in [atomic-ops]_ could
96also be used, e.g. ::
97
98 clear_bit(KVM_REQ_UNHALT & KVM_REQUEST_MASK, &vcpu->requests);
99
100However, VCPU request users should refrain from doing so, as it would
101break the abstraction. The first 8 bits are reserved for architecture
102independent requests, all additional bits are available for architecture
103dependent requests.
104
105Architecture Independent Requests
106---------------------------------
107
108KVM_REQ_TLB_FLUSH
109
110 KVM's common MMU notifier may need to flush all of a guest's TLB
111 entries, calling kvm_flush_remote_tlbs() to do so. Architectures that
112 choose to use the common kvm_flush_remote_tlbs() implementation will
113 need to handle this VCPU request.
114
115KVM_REQ_MMU_RELOAD
116
117 When shadow page tables are used and memory slots are removed it's
118 necessary to inform each VCPU to completely refresh the tables. This
119 request is used for that.
120
121KVM_REQ_PENDING_TIMER
122
123 This request may be made from a timer handler run on the host on behalf
124 of a VCPU. It informs the VCPU thread to inject a timer interrupt.
125
126KVM_REQ_UNHALT
127
128 This request may be made from the KVM common function kvm_vcpu_block(),
129 which is used to emulate an instruction that causes a CPU to halt until
130 one of an architecture-specific set of events and/or interrupts is
131 received (determined by checking kvm_arch_vcpu_runnable()). When that
132 event or interrupt arrives kvm_vcpu_block() makes the request. This is
133 in contrast to when kvm_vcpu_block() returns due to any other reason,
134 such as a pending signal, which does not indicate the VCPU's halt
135 emulation should stop, and therefore does not make the request.
136
137KVM_REQUEST_MASK
138----------------
139
140VCPU requests should be masked by KVM_REQUEST_MASK before using them with
141bitops. This is because only the lower 8 bits are used to represent the
142request's number. The upper bits are used as flags. Currently only two
143flags are defined.
144
145VCPU Request Flags
146------------------
147
148KVM_REQUEST_NO_WAKEUP
149
150 This flag is applied to requests that only need immediate attention
151 from VCPUs running in guest mode. That is, sleeping VCPUs do not need
152 to be awaken for these requests. Sleeping VCPUs will handle the
153 requests when they are awaken later for some other reason.
154
155KVM_REQUEST_WAIT
156
157 When requests with this flag are made with kvm_make_all_cpus_request(),
158 then the caller will wait for each VCPU to acknowledge its IPI before
159 proceeding. This flag only applies to VCPUs that would receive IPIs.
160 If, for example, the VCPU is sleeping, so no IPI is necessary, then
161 the requesting thread does not wait. This means that this flag may be
162 safely combined with KVM_REQUEST_NO_WAKEUP. See "Waiting for
163 Acknowledgements" for more information about requests with
164 KVM_REQUEST_WAIT.
165
166VCPU Requests with Associated State
167===================================
168
169Requesters that want the receiving VCPU to handle new state need to ensure
170the newly written state is observable to the receiving VCPU thread's CPU
171by the time it observes the request. This means a write memory barrier
172must be inserted after writing the new state and before setting the VCPU
173request bit. Additionally, on the receiving VCPU thread's side, a
174corresponding read barrier must be inserted after reading the request bit
175and before proceeding to read the new state associated with it. See
176scenario 3, Message and Flag, of [lwn-mb]_ and the kernel documentation
177[memory-barriers]_.
178
179The pair of functions, kvm_check_request() and kvm_make_request(), provide
180the memory barriers, allowing this requirement to be handled internally by
181the API.
182
183Ensuring Requests Are Seen
184==========================
185
186When making requests to VCPUs, we want to avoid the receiving VCPU
187 executing in guest mode for an arbitrarily long time without handling the
188request. We can be sure this won't happen as long as we ensure the VCPU
189thread checks kvm_request_pending() before entering guest mode and that a
190kick will send an IPI to force an exit from guest mode when necessary.
191Extra care must be taken to cover the period after the VCPU thread's last
192kvm_request_pending() check and before it has entered guest mode, as kick
193IPIs will only trigger guest mode exits for VCPU threads that are in guest
194mode or at least have already disabled interrupts in order to prepare to
195enter guest mode. This means that an optimized implementation (see "IPI
196Reduction") must be certain when it's safe to not send the IPI. One
197solution, which all architectures except s390 apply, is to:
198
199- set ``vcpu->mode`` to IN_GUEST_MODE between disabling the interrupts and
200 the last kvm_request_pending() check;
201- enable interrupts atomically when entering the guest.
202
203This solution also requires memory barriers to be placed carefully in both
204the requesting thread and the receiving VCPU. With the memory barriers we
205can exclude the possibility of a VCPU thread observing
206!kvm_request_pending() on its last check and then not receiving an IPI for
207the next request made of it, even if the request is made immediately after
208the check. This is done by way of the Dekker memory barrier pattern
209(scenario 10 of [lwn-mb]_). As the Dekker pattern requires two variables,
210this solution pairs ``vcpu->mode`` with ``vcpu->requests``. Substituting
211them into the pattern gives::
212
213 CPU1 CPU2
214 ================= =================
215 local_irq_disable();
216 WRITE_ONCE(vcpu->mode, IN_GUEST_MODE); kvm_make_request(REQ, vcpu);
217 smp_mb(); smp_mb();
218 if (kvm_request_pending(vcpu)) { if (READ_ONCE(vcpu->mode) ==
219 IN_GUEST_MODE) {
220 ...abort guest entry... ...send IPI...
221 } }
222
223As stated above, the IPI is only useful for VCPU threads in guest mode or
224that have already disabled interrupts. This is why this specific case of
225the Dekker pattern has been extended to disable interrupts before setting
226``vcpu->mode`` to IN_GUEST_MODE. WRITE_ONCE() and READ_ONCE() are used to
227pedantically implement the memory barrier pattern, guaranteeing the
228compiler doesn't interfere with ``vcpu->mode``'s carefully planned
229accesses.
230
231IPI Reduction
232-------------
233
234As only one IPI is needed to get a VCPU to check for any/all requests,
235they may be coalesced. This is easily done by having the first IPI
236sending kick also change the VCPU mode to something !IN_GUEST_MODE. The
237transitional state, EXITING_GUEST_MODE, is used for this purpose.
238
239Waiting for Acknowledgements
240----------------------------
241
242Some requests, those with the KVM_REQUEST_WAIT flag set, require IPIs to
243be sent, and the acknowledgements to be waited upon, even when the target
244VCPU threads are in modes other than IN_GUEST_MODE. For example, one case
245is when a target VCPU thread is in READING_SHADOW_PAGE_TABLES mode, which
246is set after disabling interrupts. To support these cases, the
247KVM_REQUEST_WAIT flag changes the condition for sending an IPI from
248checking that the VCPU is IN_GUEST_MODE to checking that it is not
249OUTSIDE_GUEST_MODE.
250
251Request-less VCPU Kicks
252-----------------------
253
254As the determination of whether or not to send an IPI depends on the
255two-variable Dekker memory barrier pattern, it's clear that
256request-less VCPU kicks are almost never correct. Without the assurance
257that a non-IPI generating kick will still result in an action by the
258receiving VCPU, as the final kvm_request_pending() check does for
259request-accompanying kicks, then the kick may not do anything useful at
260all. If, for instance, a request-less kick was made to a VCPU that was
261just about to set its mode to IN_GUEST_MODE, meaning no IPI is sent, then
262the VCPU thread may continue its entry without actually having done
263whatever it was the kick was meant to initiate.
264
265One exception is x86's posted interrupt mechanism. In this case, however,
266even the request-less VCPU kick is coupled with the same
267local_irq_disable() + smp_mb() pattern described above; the ON bit
268(Outstanding Notification) in the posted interrupt descriptor takes the
269role of ``vcpu->requests``. When sending a posted interrupt, PIR.ON is
270set before reading ``vcpu->mode``; dually, in the VCPU thread,
271vmx_sync_pir_to_irr() reads PIR after setting ``vcpu->mode`` to
272IN_GUEST_MODE.
273
274Additional Considerations
275=========================
276
277Sleeping VCPUs
278--------------
279
280VCPU threads may need to consider requests before and/or after calling
281functions that may put them to sleep, e.g. kvm_vcpu_block(). Whether they
282do or not, and, if they do, which requests need consideration, is
283architecture dependent. kvm_vcpu_block() calls kvm_arch_vcpu_runnable()
284to check if it should awaken. One reason to do so is to provide
285architectures a function where requests may be checked if necessary.
286
287Clearing Requests
288-----------------
289
290Generally it only makes sense for the receiving VCPU thread to clear a
291request. However, in some circumstances, such as when the requesting
292thread and the receiving VCPU thread are executed serially, such as when
293they are the same thread, or when they are using some form of concurrency
294control to temporarily execute synchronously, then it's possible to know
295that the request may be cleared immediately, rather than waiting for the
296receiving VCPU thread to handle the request in VCPU RUN. The only current
297examples of this are kvm_vcpu_block() calls made by VCPUs to block
298themselves. A possible side-effect of that call is to make the
299KVM_REQ_UNHALT request, which may then be cleared immediately when the
300VCPU returns from the call.
301
302References
303==========
304
305.. [atomic-ops] Documentation/core-api/atomic_ops.rst
306.. [memory-barriers] Documentation/memory-barriers.txt
307.. [lwn-mb] https://lwn.net/Articles/573436/
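As a kernel-side sketch of the request-with-state pattern described in "VCPU Requests with Associated State"; KVM_REQ_EXAMPLE, the new_state field and consume() are invented for illustration and are not existing KVM symbols:

    /* Requesting thread: publish the state, then make the request and kick. */
    vcpu->arch.new_state = state;              /* hypothetical per-VCPU field */
    kvm_make_request(KVM_REQ_EXAMPLE, vcpu);   /* barrier pairs with kvm_check_request() */
    kvm_vcpu_kick(vcpu);

    /* VCPU thread, checked before every guest entry: */
    if (kvm_request_pending(vcpu)) {
            if (kvm_check_request(KVM_REQ_EXAMPLE, vcpu))
                    consume(vcpu->arch.new_state); /* read barrier already issued */
    }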
diff --git a/MAINTAINERS b/MAINTAINERS
index 75ac9dc85804..1c1d106a3347 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7350,7 +7350,7 @@ F: arch/powerpc/kvm/
7350 7350
7351KERNEL VIRTUAL MACHINE for s390 (KVM/s390) 7351KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
7352M: Christian Borntraeger <borntraeger@de.ibm.com> 7352M: Christian Borntraeger <borntraeger@de.ibm.com>
7353M: Cornelia Huck <cornelia.huck@de.ibm.com> 7353M: Cornelia Huck <cohuck@redhat.com>
7354L: linux-s390@vger.kernel.org 7354L: linux-s390@vger.kernel.org
7355W: http://www.ibm.com/developerworks/linux/linux390/ 7355W: http://www.ibm.com/developerworks/linux/linux390/
7356T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git 7356T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
@@ -11268,7 +11268,7 @@ S: Supported
11268F: drivers/iommu/s390-iommu.c 11268F: drivers/iommu/s390-iommu.c
11269 11269
11270S390 VFIO-CCW DRIVER 11270S390 VFIO-CCW DRIVER
11271M: Cornelia Huck <cornelia.huck@de.ibm.com> 11271M: Cornelia Huck <cohuck@redhat.com>
11272M: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> 11272M: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
11273L: linux-s390@vger.kernel.org 11273L: linux-s390@vger.kernel.org
11274L: kvm@vger.kernel.org 11274L: kvm@vger.kernel.org
@@ -13814,7 +13814,7 @@ F: include/uapi/linux/virtio_*.h
13814F: drivers/crypto/virtio/ 13814F: drivers/crypto/virtio/
13815 13815
13816VIRTIO DRIVERS FOR S390 13816VIRTIO DRIVERS FOR S390
13817M: Cornelia Huck <cornelia.huck@de.ibm.com> 13817M: Cornelia Huck <cohuck@redhat.com>
13818M: Halil Pasic <pasic@linux.vnet.ibm.com> 13818M: Halil Pasic <pasic@linux.vnet.ibm.com>
13819L: linux-s390@vger.kernel.org 13819L: linux-s390@vger.kernel.org
13820L: virtualization@lists.linux-foundation.org 13820L: virtualization@lists.linux-foundation.org
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index f0e66577ce05..127e2dd2e21c 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -44,7 +44,9 @@
44#define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS 44#define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS
45#endif 45#endif
46 46
47#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 47#define KVM_REQ_SLEEP \
48 KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
49#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
48 50
49u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); 51u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
50int __attribute_const__ kvm_target_cpu(void); 52int __attribute_const__ kvm_target_cpu(void);
@@ -233,8 +235,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
233struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); 235struct kvm_vcpu __percpu **kvm_get_running_vcpus(void);
234void kvm_arm_halt_guest(struct kvm *kvm); 236void kvm_arm_halt_guest(struct kvm *kvm);
235void kvm_arm_resume_guest(struct kvm *kvm); 237void kvm_arm_resume_guest(struct kvm *kvm);
236void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
237void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
238 238
239int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); 239int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
240unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); 240unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu);
@@ -291,20 +291,12 @@ static inline void kvm_arm_init_debug(void) {}
291static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} 291static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
292static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} 292static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
293static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} 293static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
294static inline int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, 294
295 struct kvm_device_attr *attr) 295int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
296{ 296 struct kvm_device_attr *attr);
297 return -ENXIO; 297int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
298} 298 struct kvm_device_attr *attr);
299static inline int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, 299int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
300 struct kvm_device_attr *attr) 300 struct kvm_device_attr *attr);
301{
302 return -ENXIO;
303}
304static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
305 struct kvm_device_attr *attr)
306{
307 return -ENXIO;
308}
309 301
310#endif /* __ARM_KVM_HOST_H__ */ 302#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 5e3c673fa3f4..5db2d4c6a55f 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -203,6 +203,14 @@ struct kvm_arch_memory_slot {
203#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff 203#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
204#define VGIC_LEVEL_INFO_LINE_LEVEL 0 204#define VGIC_LEVEL_INFO_LINE_LEVEL 0
205 205
206/* Device Control API on vcpu fd */
207#define KVM_ARM_VCPU_PMU_V3_CTRL 0
208#define KVM_ARM_VCPU_PMU_V3_IRQ 0
209#define KVM_ARM_VCPU_PMU_V3_INIT 1
210#define KVM_ARM_VCPU_TIMER_CTRL 1
211#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
212#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
213
206#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 214#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
207#define KVM_DEV_ARM_ITS_SAVE_TABLES 1 215#define KVM_DEV_ARM_ITS_SAVE_TABLES 1
208#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 216#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index fa6182a40941..1e0784ebbfd6 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -301,3 +301,54 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
301{ 301{
302 return -EINVAL; 302 return -EINVAL;
303} 303}
304
305int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
306 struct kvm_device_attr *attr)
307{
308 int ret;
309
310 switch (attr->group) {
311 case KVM_ARM_VCPU_TIMER_CTRL:
312 ret = kvm_arm_timer_set_attr(vcpu, attr);
313 break;
314 default:
315 ret = -ENXIO;
316 break;
317 }
318
319 return ret;
320}
321
322int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
323 struct kvm_device_attr *attr)
324{
325 int ret;
326
327 switch (attr->group) {
328 case KVM_ARM_VCPU_TIMER_CTRL:
329 ret = kvm_arm_timer_get_attr(vcpu, attr);
330 break;
331 default:
332 ret = -ENXIO;
333 break;
334 }
335
336 return ret;
337}
338
339int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
340 struct kvm_device_attr *attr)
341{
342 int ret;
343
344 switch (attr->group) {
345 case KVM_ARM_VCPU_TIMER_CTRL:
346 ret = kvm_arm_timer_has_attr(vcpu, attr);
347 break;
348 default:
349 ret = -ENXIO;
350 break;
351 }
352
353 return ret;
354}
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index f86a9aaef462..54442e375354 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -72,6 +72,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
72 trace_kvm_wfx(*vcpu_pc(vcpu), false); 72 trace_kvm_wfx(*vcpu_pc(vcpu), false);
73 vcpu->stat.wfi_exit_stat++; 73 vcpu->stat.wfi_exit_stat++;
74 kvm_vcpu_block(vcpu); 74 kvm_vcpu_block(vcpu);
75 kvm_clear_request(KVM_REQ_UNHALT, vcpu);
75 } 76 }
76 77
77 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 78 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index 624a510d31df..ebd2dd46adf7 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -237,8 +237,10 @@ void __hyp_text __noreturn __hyp_panic(int cause)
237 237
238 vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); 238 vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR);
239 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); 239 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
240 __timer_save_state(vcpu);
240 __deactivate_traps(vcpu); 241 __deactivate_traps(vcpu);
241 __deactivate_vm(vcpu); 242 __deactivate_vm(vcpu);
243 __banked_restore_state(host_ctxt);
242 __sysreg_restore_state(host_ctxt); 244 __sysreg_restore_state(host_ctxt);
243 } 245 }
244 246
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 1da8b2d14550..5ed0c3ee33d6 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -37,16 +37,6 @@ static struct kvm_regs cortexa_regs_reset = {
37 .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, 37 .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
38}; 38};
39 39
40static const struct kvm_irq_level cortexa_ptimer_irq = {
41 { .irq = 30 },
42 .level = 1,
43};
44
45static const struct kvm_irq_level cortexa_vtimer_irq = {
46 { .irq = 27 },
47 .level = 1,
48};
49
50 40
51/******************************************************************************* 41/*******************************************************************************
52 * Exported reset function 42 * Exported reset function
@@ -62,16 +52,12 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
62int kvm_reset_vcpu(struct kvm_vcpu *vcpu) 52int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
63{ 53{
64 struct kvm_regs *reset_regs; 54 struct kvm_regs *reset_regs;
65 const struct kvm_irq_level *cpu_vtimer_irq;
66 const struct kvm_irq_level *cpu_ptimer_irq;
67 55
68 switch (vcpu->arch.target) { 56 switch (vcpu->arch.target) {
69 case KVM_ARM_TARGET_CORTEX_A7: 57 case KVM_ARM_TARGET_CORTEX_A7:
70 case KVM_ARM_TARGET_CORTEX_A15: 58 case KVM_ARM_TARGET_CORTEX_A15:
71 reset_regs = &cortexa_regs_reset; 59 reset_regs = &cortexa_regs_reset;
72 vcpu->arch.midr = read_cpuid_id(); 60 vcpu->arch.midr = read_cpuid_id();
73 cpu_vtimer_irq = &cortexa_vtimer_irq;
74 cpu_ptimer_irq = &cortexa_ptimer_irq;
75 break; 61 break;
76 default: 62 default:
77 return -ENODEV; 63 return -ENODEV;
@@ -84,5 +70,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
84 kvm_reset_coprocs(vcpu); 70 kvm_reset_coprocs(vcpu);
85 71
86 /* Reset arch_timer context */ 72 /* Reset arch_timer context */
87 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); 73 return kvm_timer_vcpu_reset(vcpu);
88} 74}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f7a934ff707..192208ea2842 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -488,6 +488,17 @@ config CAVIUM_ERRATUM_27456
488 488
489 If unsure, say Y. 489 If unsure, say Y.
490 490
491config CAVIUM_ERRATUM_30115
492 bool "Cavium erratum 30115: Guest may disable interrupts in host"
493 default y
494 help
495 On ThunderX T88 pass 1.x through 2.2, T81 pass 1.0 through
496 1.2, and T83 Pass 1.0, KVM guest execution may disable
497 interrupts in host. Trapping both GICv3 group-0 and group-1
498 accesses sidesteps the issue.
499
500 If unsure, say Y.
501
491config QCOM_FALKOR_ERRATUM_1003 502config QCOM_FALKOR_ERRATUM_1003
492 bool "Falkor E1003: Incorrect translation due to ASID change" 503 bool "Falkor E1003: Incorrect translation due to ASID change"
493 default y 504 default y
diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 1a98bc8602a2..8cef47fa2218 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -89,7 +89,7 @@ static inline void gic_write_ctlr(u32 val)
89 89
90static inline void gic_write_grpen1(u32 val) 90static inline void gic_write_grpen1(u32 val)
91{ 91{
92 write_sysreg_s(val, SYS_ICC_GRPEN1_EL1); 92 write_sysreg_s(val, SYS_ICC_IGRPEN1_EL1);
93 isb(); 93 isb();
94} 94}
95 95
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index b3aab8a17868..8d2272c6822c 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -38,7 +38,8 @@
38#define ARM64_WORKAROUND_REPEAT_TLBI 17 38#define ARM64_WORKAROUND_REPEAT_TLBI 17
39#define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 39#define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18
40#define ARM64_WORKAROUND_858921 19 40#define ARM64_WORKAROUND_858921 19
41#define ARM64_WORKAROUND_CAVIUM_30115 20
41 42
42#define ARM64_NCAPS 20 43#define ARM64_NCAPS 21
43 44
44#endif /* __ASM_CPUCAPS_H */ 45#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 0984d1b3a8f2..235e77d98261 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -86,6 +86,7 @@
86 86
87#define CAVIUM_CPU_PART_THUNDERX 0x0A1 87#define CAVIUM_CPU_PART_THUNDERX 0x0A1
88#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 88#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
89#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
89 90
90#define BRCM_CPU_PART_VULCAN 0x516 91#define BRCM_CPU_PART_VULCAN 0x516
91 92
@@ -96,6 +97,7 @@
96#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) 97#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
97#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) 98#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
98#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) 99#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
100#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
99#define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) 101#define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
100 102
101#ifndef __ASSEMBLY__ 103#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 28bf02efce76..8cabd57b6348 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -19,6 +19,7 @@
19#define __ASM_ESR_H 19#define __ASM_ESR_H
20 20
21#include <asm/memory.h> 21#include <asm/memory.h>
22#include <asm/sysreg.h>
22 23
23#define ESR_ELx_EC_UNKNOWN (0x00) 24#define ESR_ELx_EC_UNKNOWN (0x00)
24#define ESR_ELx_EC_WFx (0x01) 25#define ESR_ELx_EC_WFx (0x01)
@@ -182,6 +183,29 @@
182#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ 183#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \
183 ESR_ELx_SYS64_ISS_DIR_READ) 184 ESR_ELx_SYS64_ISS_DIR_READ)
184 185
186#define esr_sys64_to_sysreg(e) \
187 sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \
188 ESR_ELx_SYS64_ISS_OP0_SHIFT), \
189 (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \
190 ESR_ELx_SYS64_ISS_OP1_SHIFT), \
191 (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \
192 ESR_ELx_SYS64_ISS_CRN_SHIFT), \
193 (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \
194 ESR_ELx_SYS64_ISS_CRM_SHIFT), \
195 (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \
196 ESR_ELx_SYS64_ISS_OP2_SHIFT))
197
198#define esr_cp15_to_sysreg(e) \
199 sys_reg(3, \
200 (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \
201 ESR_ELx_SYS64_ISS_OP1_SHIFT), \
202 (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \
203 ESR_ELx_SYS64_ISS_CRN_SHIFT), \
204 (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \
205 ESR_ELx_SYS64_ISS_CRM_SHIFT), \
206 (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \
207 ESR_ELx_SYS64_ISS_OP2_SHIFT))
208
185#ifndef __ASSEMBLY__ 209#ifndef __ASSEMBLY__
186#include <asm/types.h> 210#include <asm/types.h>
187 211
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1f252a95bc02..d68630007b14 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -42,7 +42,9 @@
42 42
43#define KVM_VCPU_MAX_FEATURES 4 43#define KVM_VCPU_MAX_FEATURES 4
44 44
45#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 45#define KVM_REQ_SLEEP \
46 KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
47#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
46 48
47int __attribute_const__ kvm_target_cpu(void); 49int __attribute_const__ kvm_target_cpu(void);
48int kvm_reset_vcpu(struct kvm_vcpu *vcpu); 50int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -334,8 +336,6 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
334struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); 336struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
335void kvm_arm_halt_guest(struct kvm *kvm); 337void kvm_arm_halt_guest(struct kvm *kvm);
336void kvm_arm_resume_guest(struct kvm *kvm); 338void kvm_arm_resume_guest(struct kvm *kvm);
337void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu);
338void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu);
339 339
340u64 __kvm_call_hyp(void *hypfn, ...); 340u64 __kvm_call_hyp(void *hypfn, ...);
341#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) 341#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index b18e852d27e8..4572a9b560fa 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -127,6 +127,7 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
127 127
128void __vgic_v3_save_state(struct kvm_vcpu *vcpu); 128void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
129void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); 129void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
130int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
130 131
131void __timer_save_state(struct kvm_vcpu *vcpu); 132void __timer_save_state(struct kvm_vcpu *vcpu);
132void __timer_restore_state(struct kvm_vcpu *vcpu); 133void __timer_restore_state(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index b4d13d9267ff..16e44fa9b3b6 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -180,14 +180,31 @@
180 180
181#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) 181#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0)
182 182
183#define SYS_ICC_IAR0_EL1 sys_reg(3, 0, 12, 8, 0)
184#define SYS_ICC_EOIR0_EL1 sys_reg(3, 0, 12, 8, 1)
185#define SYS_ICC_HPPIR0_EL1 sys_reg(3, 0, 12, 8, 2)
186#define SYS_ICC_BPR0_EL1 sys_reg(3, 0, 12, 8, 3)
187#define SYS_ICC_AP0Rn_EL1(n) sys_reg(3, 0, 12, 8, 4 | n)
188#define SYS_ICC_AP0R0_EL1 SYS_ICC_AP0Rn_EL1(0)
189#define SYS_ICC_AP0R1_EL1 SYS_ICC_AP0Rn_EL1(1)
190#define SYS_ICC_AP0R2_EL1 SYS_ICC_AP0Rn_EL1(2)
191#define SYS_ICC_AP0R3_EL1 SYS_ICC_AP0Rn_EL1(3)
192#define SYS_ICC_AP1Rn_EL1(n) sys_reg(3, 0, 12, 9, n)
193#define SYS_ICC_AP1R0_EL1 SYS_ICC_AP1Rn_EL1(0)
194#define SYS_ICC_AP1R1_EL1 SYS_ICC_AP1Rn_EL1(1)
195#define SYS_ICC_AP1R2_EL1 SYS_ICC_AP1Rn_EL1(2)
196#define SYS_ICC_AP1R3_EL1 SYS_ICC_AP1Rn_EL1(3)
183#define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) 197#define SYS_ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1)
198#define SYS_ICC_RPR_EL1 sys_reg(3, 0, 12, 11, 3)
184#define SYS_ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5) 199#define SYS_ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5)
185#define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) 200#define SYS_ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0)
186#define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) 201#define SYS_ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1)
202#define SYS_ICC_HPPIR1_EL1 sys_reg(3, 0, 12, 12, 2)
187#define SYS_ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3) 203#define SYS_ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3)
188#define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) 204#define SYS_ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4)
189#define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) 205#define SYS_ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5)
190#define SYS_ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) 206#define SYS_ICC_IGRPEN0_EL1 sys_reg(3, 0, 12, 12, 6)
207#define SYS_ICC_IGRPEN1_EL1 sys_reg(3, 0, 12, 12, 7)
191 208
192#define SYS_CONTEXTIDR_EL1 sys_reg(3, 0, 13, 0, 1) 209#define SYS_CONTEXTIDR_EL1 sys_reg(3, 0, 13, 0, 1)
193#define SYS_TPIDR_EL1 sys_reg(3, 0, 13, 0, 4) 210#define SYS_TPIDR_EL1 sys_reg(3, 0, 13, 0, 4)
@@ -287,8 +304,8 @@
287#define SCTLR_ELx_M 1 304#define SCTLR_ELx_M 1
288 305
289#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ 306#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \
290 (1 << 16) | (1 << 18) | (1 << 22) | (1 << 23) | \ 307 (1 << 18) | (1 << 22) | (1 << 23) | (1 << 28) | \
291 (1 << 28) | (1 << 29)) 308 (1 << 29))
292 309
293#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ 310#define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
294 SCTLR_ELx_SA | SCTLR_ELx_I) 311 SCTLR_ELx_SA | SCTLR_ELx_I)
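
The SYS_ICC_* additions above are all produced by the sys_reg() helper, which packs an (Op0, Op1, CRn, CRm, Op2) tuple into one integer. A minimal stand-alone sketch of that packing, assuming the usual arm64 MRS/MSR field positions (Op0 at bit 19, Op1 at 16, CRn at 12, CRm at 8, Op2 at 5), shows how SYS_ICC_AP0Rn_EL1(n) lands on Op2 values 4..7:

    /* sketch: packing Op0/Op1/CRn/CRm/Op2 the way sys_reg() is assumed to */
    #include <stdio.h>

    #define SR_OP0_SHIFT 19
    #define SR_OP1_SHIFT 16
    #define SR_CRN_SHIFT 12
    #define SR_CRM_SHIFT  8
    #define SR_OP2_SHIFT  5

    static unsigned int sys_reg(unsigned int op0, unsigned int op1,
                                unsigned int crn, unsigned int crm,
                                unsigned int op2)
    {
            return (op0 << SR_OP0_SHIFT) | (op1 << SR_OP1_SHIFT) |
                   (crn << SR_CRN_SHIFT) | (crm << SR_CRM_SHIFT) |
                   (op2 << SR_OP2_SHIFT);
    }

    int main(void)
    {
            int n;

            /* SYS_ICC_AP0Rn_EL1(n) = sys_reg(3, 0, 12, 8, 4 | n): n = 0..3 -> Op2 = 4..7 */
            for (n = 0; n < 4; n++)
                    printf("SYS_ICC_AP0R%d_EL1 = 0x%x\n", n, sys_reg(3, 0, 12, 8, 4 | n));
            /* SYS_ICC_IGRPEN1_EL1 keeps the old GRPEN1 encoding (3, 0, 12, 12, 7) */
            printf("SYS_ICC_IGRPEN1_EL1 = 0x%x\n", sys_reg(3, 0, 12, 12, 7));
            return 0;
    }
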
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 70eea2ecc663..9f3ca24bbcc6 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -232,6 +232,9 @@ struct kvm_arch_memory_slot {
232#define KVM_ARM_VCPU_PMU_V3_CTRL 0 232#define KVM_ARM_VCPU_PMU_V3_CTRL 0
233#define KVM_ARM_VCPU_PMU_V3_IRQ 0 233#define KVM_ARM_VCPU_PMU_V3_IRQ 0
234#define KVM_ARM_VCPU_PMU_V3_INIT 1 234#define KVM_ARM_VCPU_PMU_V3_INIT 1
235#define KVM_ARM_VCPU_TIMER_CTRL 1
236#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
237#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
235 238
236/* KVM_IRQ_LINE irq field index values */ 239/* KVM_IRQ_LINE irq field index values */
237#define KVM_ARM_IRQ_TYPE_SHIFT 24 240#define KVM_ARM_IRQ_TYPE_SHIFT 24
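
The three KVM_ARM_VCPU_TIMER_* constants above are consumed through the per-vCPU device-attribute ioctls, as described in Documentation/virtual/kvm/devices/vcpu.txt. A hedged userspace sketch (set_vtimer_irq() and its error handling are illustrative, not taken from any VMM; the attribute payload is assumed to be a pointer to an int holding the PPI number):

    /* sketch: picking the virtual timer PPI for a vCPU via KVM_SET_DEVICE_ATTR */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    int set_vtimer_irq(int vcpu_fd, int ppi)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_ARM_VCPU_TIMER_CTRL,
                    .attr  = KVM_ARM_VCPU_TIMER_IRQ_VTIMER,
                    .addr  = (__u64)(unsigned long)&ppi,   /* pointer to the IRQ number */
            };

            if (ioctl(vcpu_fd, KVM_HAS_DEVICE_ATTR, &attr))
                    return -1;      /* kernel without this attribute */
            return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
    }

The same shape with KVM_ARM_VCPU_TIMER_IRQ_PTIMER selects the physical timer interrupt; configuring either is only expected to succeed before the vCPU has first run.
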
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 2ed2a7657711..0e27f86ee709 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -133,6 +133,27 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
133 MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), 133 MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00),
134 }, 134 },
135#endif 135#endif
136#ifdef CONFIG_CAVIUM_ERRATUM_30115
137 {
138 /* Cavium ThunderX, T88 pass 1.x - 2.2 */
139 .desc = "Cavium erratum 30115",
140 .capability = ARM64_WORKAROUND_CAVIUM_30115,
141 MIDR_RANGE(MIDR_THUNDERX, 0x00,
142 (1 << MIDR_VARIANT_SHIFT) | 2),
143 },
144 {
145 /* Cavium ThunderX, T81 pass 1.0 - 1.2 */
146 .desc = "Cavium erratum 30115",
147 .capability = ARM64_WORKAROUND_CAVIUM_30115,
148 MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02),
149 },
150 {
151 /* Cavium ThunderX, T83 pass 1.0 */
152 .desc = "Cavium erratum 30115",
153 .capability = ARM64_WORKAROUND_CAVIUM_30115,
154 MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00),
155 },
156#endif
136 { 157 {
137 .desc = "Mismatched cache line size", 158 .desc = "Mismatched cache line size",
138 .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE, 159 .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE,
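
The MIDR_RANGE() bounds in the new erratum entries are composite (variant, revision) values; "(1 << MIDR_VARIANT_SHIFT) | 2" is ThunderX T88 pass 2.2. A stand-alone sketch of the comparison the capability matching effectively performs, assuming the architected MIDR_EL1 layout (variant in bits [23:20], revision in bits [3:0]) and an illustrative model value:

    /* sketch: is this MIDR within a model's (variant,revision) window? */
    #include <stdbool.h>
    #include <stdio.h>

    #define MIDR_REV_MASK       0x0000000fu
    #define MIDR_VARIANT_SHIFT  20
    #define MIDR_VARIANT_MASK   (0xfu << MIDR_VARIANT_SHIFT)

    static bool midr_in_range(unsigned int midr, unsigned int model,
                              unsigned int rv_min, unsigned int rv_max)
    {
            unsigned int rv = midr & (MIDR_VARIANT_MASK | MIDR_REV_MASK);

            if ((midr & ~(MIDR_VARIANT_MASK | MIDR_REV_MASK)) != model)
                    return false;
            return rv >= rv_min && rv <= rv_max;
    }

    int main(void)
    {
            /* illustrative ThunderX T88 model value (implementer 0x43, part 0x0a1); assumed */
            unsigned int t88      = 0x430f0a10;
            unsigned int pass_2_1 = t88 | (1u << MIDR_VARIANT_SHIFT) | 1;  /* variant 1, rev 1 */

            /* erratum 30115, T88: passes 1.x - 2.2, i.e. 0x00 .. variant 1 / revision 2 */
            printf("pass 2.1 affected: %d\n",
                   midr_in_range(pass_2_1, t88, 0x00, (1u << MIDR_VARIANT_SHIFT) | 2));
            return 0;
    }
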
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index b37446a8ffdb..5c7f657dd207 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -390,6 +390,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
390 case KVM_ARM_VCPU_PMU_V3_CTRL: 390 case KVM_ARM_VCPU_PMU_V3_CTRL:
391 ret = kvm_arm_pmu_v3_set_attr(vcpu, attr); 391 ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
392 break; 392 break;
393 case KVM_ARM_VCPU_TIMER_CTRL:
394 ret = kvm_arm_timer_set_attr(vcpu, attr);
395 break;
393 default: 396 default:
394 ret = -ENXIO; 397 ret = -ENXIO;
395 break; 398 break;
@@ -407,6 +410,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
407 case KVM_ARM_VCPU_PMU_V3_CTRL: 410 case KVM_ARM_VCPU_PMU_V3_CTRL:
408 ret = kvm_arm_pmu_v3_get_attr(vcpu, attr); 411 ret = kvm_arm_pmu_v3_get_attr(vcpu, attr);
409 break; 412 break;
413 case KVM_ARM_VCPU_TIMER_CTRL:
414 ret = kvm_arm_timer_get_attr(vcpu, attr);
415 break;
410 default: 416 default:
411 ret = -ENXIO; 417 ret = -ENXIO;
412 break; 418 break;
@@ -424,6 +430,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
424 case KVM_ARM_VCPU_PMU_V3_CTRL: 430 case KVM_ARM_VCPU_PMU_V3_CTRL:
425 ret = kvm_arm_pmu_v3_has_attr(vcpu, attr); 431 ret = kvm_arm_pmu_v3_has_attr(vcpu, attr);
426 break; 432 break;
433 case KVM_ARM_VCPU_TIMER_CTRL:
434 ret = kvm_arm_timer_has_attr(vcpu, attr);
435 break;
427 default: 436 default:
428 ret = -ENXIO; 437 ret = -ENXIO;
429 break; 438 break;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index fa1b18e364fc..17d8a1677a0b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -89,6 +89,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
89 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); 89 trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false);
90 vcpu->stat.wfi_exit_stat++; 90 vcpu->stat.wfi_exit_stat++;
91 kvm_vcpu_block(vcpu); 91 kvm_vcpu_block(vcpu);
92 kvm_clear_request(KVM_REQ_UNHALT, vcpu);
92 } 93 }
93 94
94 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); 95 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
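
kvm_vcpu_block() can leave KVM_REQ_UNHALT set when the vCPU is woken, and arm/arm64 never consume that request, so the added kvm_clear_request() simply discards it before re-entering the guest. A compact stand-alone model of the request-bit pattern this relies on (a sketch in the spirit of the vcpu-requests.rst documentation added by this merge, not the kernel's actual helpers):

    /* sketch: VCPU requests as bits in one atomic word */
    #include <stdatomic.h>
    #include <stdbool.h>

    struct vcpu { _Atomic unsigned long requests; };

    #define REQ_UNHALT 0    /* bit number; real request macros may also carry flag bits */

    static void req_make(struct vcpu *v, int req)
    {
            /* release: publish any payload written before raising the request */
            atomic_fetch_or_explicit(&v->requests, 1UL << req, memory_order_release);
    }

    static bool req_check(struct vcpu *v, int req)
    {
            /* consume side: test and clear (only the vCPU thread does this) */
            unsigned long old = atomic_fetch_and_explicit(&v->requests, ~(1UL << req),
                                                          memory_order_acq_rel);
            return old & (1UL << req);
    }

    static void req_clear(struct vcpu *v, int req)
    {
            /* what kvm_handle_wfx() now does with KVM_REQ_UNHALT: drop it unseen */
            atomic_fetch_and_explicit(&v->requests, ~(1UL << req), memory_order_relaxed);
    }

    int main(void)
    {
            struct vcpu v = { .requests = 0 };

            req_make(&v, REQ_UNHALT);
            req_clear(&v, REQ_UNHALT);
            return req_check(&v, REQ_UNHALT);   /* 0: nothing left pending */
    }
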
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index aede1658aeda..945e79c641c4 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -350,6 +350,20 @@ again:
350 } 350 }
351 } 351 }
352 352
353 if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
354 exit_code == ARM_EXCEPTION_TRAP &&
355 (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 ||
356 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
357 int ret = __vgic_v3_perform_cpuif_access(vcpu);
358
359 if (ret == 1) {
360 __skip_instr(vcpu);
361 goto again;
362 }
363
364 /* 0 falls through to be handled out of EL2 */
365 }
366
353 fp_enabled = __fpsimd_enabled(); 367 fp_enabled = __fpsimd_enabled();
354 368
355 __sysreg_save_guest_state(guest_ctxt); 369 __sysreg_save_guest_state(guest_ctxt);
@@ -422,6 +436,7 @@ void __hyp_text __noreturn __hyp_panic(void)
422 436
423 vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2); 437 vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
424 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); 438 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
439 __timer_save_state(vcpu);
425 __deactivate_traps(vcpu); 440 __deactivate_traps(vcpu);
426 __deactivate_vm(vcpu); 441 __deactivate_vm(vcpu);
427 __sysreg_restore_host_state(host_ctxt); 442 __sysreg_restore_host_state(host_ctxt);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 561badf93de8..3256b9228e75 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -46,16 +46,6 @@ static const struct kvm_regs default_regs_reset32 = {
46 COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), 46 COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT),
47}; 47};
48 48
49static const struct kvm_irq_level default_ptimer_irq = {
50 .irq = 30,
51 .level = 1,
52};
53
54static const struct kvm_irq_level default_vtimer_irq = {
55 .irq = 27,
56 .level = 1,
57};
58
59static bool cpu_has_32bit_el1(void) 49static bool cpu_has_32bit_el1(void)
60{ 50{
61 u64 pfr0; 51 u64 pfr0;
@@ -108,8 +98,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
108 */ 98 */
109int kvm_reset_vcpu(struct kvm_vcpu *vcpu) 99int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
110{ 100{
111 const struct kvm_irq_level *cpu_vtimer_irq;
112 const struct kvm_irq_level *cpu_ptimer_irq;
113 const struct kvm_regs *cpu_reset; 101 const struct kvm_regs *cpu_reset;
114 102
115 switch (vcpu->arch.target) { 103 switch (vcpu->arch.target) {
@@ -122,8 +110,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
122 cpu_reset = &default_regs_reset; 110 cpu_reset = &default_regs_reset;
123 } 111 }
124 112
125 cpu_vtimer_irq = &default_vtimer_irq;
126 cpu_ptimer_irq = &default_ptimer_irq;
127 break; 113 break;
128 } 114 }
129 115
@@ -137,5 +123,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
137 kvm_pmu_vcpu_reset(vcpu); 123 kvm_pmu_vcpu_reset(vcpu);
138 124
139 /* Reset timer */ 125 /* Reset timer */
140 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq); 126 return kvm_timer_vcpu_reset(vcpu);
141} 127}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 0fe27024a2e1..77862881ae86 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -56,7 +56,8 @@
56 */ 56 */
57 57
58static bool read_from_write_only(struct kvm_vcpu *vcpu, 58static bool read_from_write_only(struct kvm_vcpu *vcpu,
59 const struct sys_reg_params *params) 59 struct sys_reg_params *params,
60 const struct sys_reg_desc *r)
60{ 61{
61 WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); 62 WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n");
62 print_sys_reg_instr(params); 63 print_sys_reg_instr(params);
@@ -64,6 +65,16 @@ static bool read_from_write_only(struct kvm_vcpu *vcpu,
64 return false; 65 return false;
65} 66}
66 67
68static bool write_to_read_only(struct kvm_vcpu *vcpu,
69 struct sys_reg_params *params,
70 const struct sys_reg_desc *r)
71{
72 WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n");
73 print_sys_reg_instr(params);
74 kvm_inject_undefined(vcpu);
75 return false;
76}
77
67/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ 78/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
68static u32 cache_levels; 79static u32 cache_levels;
69 80
@@ -93,7 +104,7 @@ static bool access_dcsw(struct kvm_vcpu *vcpu,
93 const struct sys_reg_desc *r) 104 const struct sys_reg_desc *r)
94{ 105{
95 if (!p->is_write) 106 if (!p->is_write)
96 return read_from_write_only(vcpu, p); 107 return read_from_write_only(vcpu, p, r);
97 108
98 kvm_set_way_flush(vcpu); 109 kvm_set_way_flush(vcpu);
99 return true; 110 return true;
@@ -135,7 +146,7 @@ static bool access_gic_sgi(struct kvm_vcpu *vcpu,
135 const struct sys_reg_desc *r) 146 const struct sys_reg_desc *r)
136{ 147{
137 if (!p->is_write) 148 if (!p->is_write)
138 return read_from_write_only(vcpu, p); 149 return read_from_write_only(vcpu, p, r);
139 150
140 vgic_v3_dispatch_sgi(vcpu, p->regval); 151 vgic_v3_dispatch_sgi(vcpu, p->regval);
141 152
@@ -773,7 +784,7 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
773 return trap_raz_wi(vcpu, p, r); 784 return trap_raz_wi(vcpu, p, r);
774 785
775 if (!p->is_write) 786 if (!p->is_write)
776 return read_from_write_only(vcpu, p); 787 return read_from_write_only(vcpu, p, r);
777 788
778 if (pmu_write_swinc_el0_disabled(vcpu)) 789 if (pmu_write_swinc_el0_disabled(vcpu))
779 return false; 790 return false;
@@ -953,7 +964,15 @@ static const struct sys_reg_desc sys_reg_descs[] = {
953 964
954 { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, 965 { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
955 966
967 { SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
968 { SYS_DESC(SYS_ICC_EOIR0_EL1), read_from_write_only },
969 { SYS_DESC(SYS_ICC_HPPIR0_EL1), write_to_read_only },
970 { SYS_DESC(SYS_ICC_DIR_EL1), read_from_write_only },
971 { SYS_DESC(SYS_ICC_RPR_EL1), write_to_read_only },
956 { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, 972 { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
973 { SYS_DESC(SYS_ICC_IAR1_EL1), write_to_read_only },
974 { SYS_DESC(SYS_ICC_EOIR1_EL1), read_from_write_only },
975 { SYS_DESC(SYS_ICC_HPPIR1_EL1), write_to_read_only },
957 { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, 976 { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
958 977
959 { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, 978 { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index 6260b69e5622..116786d2e8e8 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -268,36 +268,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
268 return true; 268 return true;
269} 269}
270static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { 270static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
271 /* ICC_PMR_EL1 */ 271 { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr },
272 { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr }, 272 { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 },
273 /* ICC_BPR0_EL1 */ 273 { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r },
274 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 }, 274 { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r },
275 /* ICC_AP0R0_EL1 */ 275 { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r },
276 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r }, 276 { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r },
277 /* ICC_AP0R1_EL1 */ 277 { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r },
278 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r }, 278 { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r },
279 /* ICC_AP0R2_EL1 */ 279 { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r },
280 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r }, 280 { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r },
281 /* ICC_AP0R3_EL1 */ 281 { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 },
282 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r }, 282 { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr },
283 /* ICC_AP1R0_EL1 */ 283 { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
284 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r }, 284 { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 },
285 /* ICC_AP1R1_EL1 */ 285 { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 },
286 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r },
287 /* ICC_AP1R2_EL1 */
288 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r },
289 /* ICC_AP1R3_EL1 */
290 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r },
291 /* ICC_BPR1_EL1 */
292 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 },
293 /* ICC_CTLR_EL1 */
294 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr },
295 /* ICC_SRE_EL1 */
296 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre },
297 /* ICC_IGRPEN0_EL1 */
298 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 },
299 /* ICC_GRPEN1_EL1 */
300 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 },
301}; 286};
302 287
303int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, 288int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index a563759fd142..6a0d7040d882 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -1094,7 +1094,7 @@ static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu,
1094 struct mm_struct *mm; 1094 struct mm_struct *mm;
1095 int i; 1095 int i;
1096 1096
1097 if (likely(!vcpu->requests)) 1097 if (likely(!kvm_request_pending(vcpu)))
1098 return; 1098 return;
1099 1099
1100 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { 1100 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 71d8856ade64..74805035edc8 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -2337,7 +2337,7 @@ static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu)
2337 int ret = 0; 2337 int ret = 0;
2338 int i; 2338 int i;
2339 2339
2340 if (!vcpu->requests) 2340 if (!kvm_request_pending(vcpu))
2341 return 0; 2341 return 0;
2342 2342
2343 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { 2343 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
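
Both MIPS call sites stop peeking at vcpu->requests directly and go through the new kvm_request_pending() accessor from the VCPU-request rework, so the "any request at all?" test is a single, stable read. A trivial stand-alone sketch of what such an accessor amounts to:

    /* sketch: one volatile load of the request word, then act on that snapshot */
    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu { unsigned long requests; };

    static bool request_pending(struct vcpu *vcpu)
    {
            return *(volatile unsigned long *)&vcpu->requests != 0;
    }

    int main(void)
    {
            struct vcpu v = { .requests = 1UL << 3 };

            if (request_pending(&v))
                    printf("something is pending, walk the individual request bits\n");
            return 0;
    }
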
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 2bf35017ffc0..b8d5b8e35244 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -86,7 +86,6 @@ struct kvmppc_vcore {
86 u16 last_cpu; 86 u16 last_cpu;
87 u8 vcore_state; 87 u8 vcore_state;
88 u8 in_guest; 88 u8 in_guest;
89 struct kvmppc_vcore *master_vcore;
90 struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; 89 struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
91 struct list_head preempt_list; 90 struct list_head preempt_list;
92 spinlock_t lock; 91 spinlock_t lock;
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index b148496ffe36..7cea76f11c26 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -81,7 +81,7 @@ struct kvm_split_mode {
81 u8 subcore_size; 81 u8 subcore_size;
82 u8 do_nap; 82 u8 do_nap;
83 u8 napped[MAX_SMT_THREADS]; 83 u8 napped[MAX_SMT_THREADS];
84 struct kvmppc_vcore *master_vcs[MAX_SUBCORES]; 84 struct kvmppc_vcore *vc[MAX_SUBCORES];
85}; 85};
86 86
87/* 87/*
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 9c51ac4b8f36..8b3f1238d07f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
35#include <asm/page.h> 35#include <asm/page.h>
36#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
37#include <asm/hvcall.h> 37#include <asm/hvcall.h>
38#include <asm/mce.h>
38 39
39#define KVM_MAX_VCPUS NR_CPUS 40#define KVM_MAX_VCPUS NR_CPUS
40#define KVM_MAX_VCORES NR_CPUS 41#define KVM_MAX_VCORES NR_CPUS
@@ -52,8 +53,8 @@
52#define KVM_IRQCHIP_NUM_PINS 256 53#define KVM_IRQCHIP_NUM_PINS 256
53 54
54/* PPC-specific vcpu->requests bit members */ 55/* PPC-specific vcpu->requests bit members */
55#define KVM_REQ_WATCHDOG 8 56#define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0)
56#define KVM_REQ_EPR_EXIT 9 57#define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1)
57 58
58#include <linux/mmu_notifier.h> 59#include <linux/mmu_notifier.h>
59 60
@@ -267,6 +268,8 @@ struct kvm_resize_hpt;
267 268
268struct kvm_arch { 269struct kvm_arch {
269 unsigned int lpid; 270 unsigned int lpid;
271 unsigned int smt_mode; /* # vcpus per virtual core */
 272	unsigned int emul_smt_mode;	/* emulated SMT mode, on P9 */
270#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 273#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
271 unsigned int tlb_sets; 274 unsigned int tlb_sets;
272 struct kvm_hpt_info hpt; 275 struct kvm_hpt_info hpt;
@@ -285,6 +288,7 @@ struct kvm_arch {
285 cpumask_t need_tlb_flush; 288 cpumask_t need_tlb_flush;
286 cpumask_t cpu_in_guest; 289 cpumask_t cpu_in_guest;
287 u8 radix; 290 u8 radix;
291 u8 fwnmi_enabled;
288 pgd_t *pgtable; 292 pgd_t *pgtable;
289 u64 process_table; 293 u64 process_table;
290 struct dentry *debugfs_dir; 294 struct dentry *debugfs_dir;
@@ -566,6 +570,7 @@ struct kvm_vcpu_arch {
566 ulong wort; 570 ulong wort;
567 ulong tid; 571 ulong tid;
568 ulong psscr; 572 ulong psscr;
573 ulong hfscr;
569 ulong shadow_srr1; 574 ulong shadow_srr1;
570#endif 575#endif
571 u32 vrsave; /* also USPRG0 */ 576 u32 vrsave; /* also USPRG0 */
@@ -579,7 +584,7 @@ struct kvm_vcpu_arch {
579 ulong mcsrr0; 584 ulong mcsrr0;
580 ulong mcsrr1; 585 ulong mcsrr1;
581 ulong mcsr; 586 ulong mcsr;
582 u32 dec; 587 ulong dec;
583#ifdef CONFIG_BOOKE 588#ifdef CONFIG_BOOKE
584 u32 decar; 589 u32 decar;
585#endif 590#endif
@@ -710,6 +715,7 @@ struct kvm_vcpu_arch {
710 unsigned long pending_exceptions; 715 unsigned long pending_exceptions;
711 u8 ceded; 716 u8 ceded;
712 u8 prodded; 717 u8 prodded;
718 u8 doorbell_request;
713 u32 last_inst; 719 u32 last_inst;
714 720
715 struct swait_queue_head *wqp; 721 struct swait_queue_head *wqp;
@@ -722,6 +728,7 @@ struct kvm_vcpu_arch {
722 int prev_cpu; 728 int prev_cpu;
723 bool timer_running; 729 bool timer_running;
724 wait_queue_head_t cpu_run; 730 wait_queue_head_t cpu_run;
731 struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
725 732
726 struct kvm_vcpu_arch_shared *shared; 733 struct kvm_vcpu_arch_shared *shared;
727#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) 734#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
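
The request renumbering here mirrors the arm64 hunk earlier in the diff: architecture-specific requests are now expressed as an index above a common base, optionally OR'd with behaviour flags, instead of hand-picked raw bit numbers. A sketch of that encoding; the base of 8 and the two flag bits below follow how this series describes the common code, but treat the exact constants as assumptions:

    /* sketch: arch request = common base + index, with optional flag bits on top */
    #include <stdio.h>

    #define REQUEST_MASK       0xff          /* low byte: the request's bit number       */
    #define REQUEST_NO_WAKEUP  (1u << 8)     /* don't wake a sleeping vCPU for this       */
    #define REQUEST_WAIT       (1u << 9)     /* kicker waits until the vCPU acknowledges  */
    #define REQUEST_ARCH_BASE  8             /* bits 0..7 reserved for common requests    */

    #define ARCH_REQ_FLAGS(nr, flags)  (((nr) + REQUEST_ARCH_BASE) | (flags))
    #define ARCH_REQ(nr)               ARCH_REQ_FLAGS(nr, 0)

    int main(void)
    {
            /* PPC: same bit numbers as the old literals 8 and 9, minus the magic */
            printf("KVM_REQ_WATCHDOG -> bit %d\n", ARCH_REQ(0) & REQUEST_MASK);
            printf("KVM_REQ_EPR_EXIT -> bit %d\n", ARCH_REQ(1) & REQUEST_MASK);
            /* arm64: KVM_REQ_SLEEP keeps bit 8 but also carries WAIT|NO_WAKEUP */
            printf("KVM_REQ_SLEEP    -> 0x%x\n",
                   (unsigned int)ARCH_REQ_FLAGS(0, REQUEST_WAIT | REQUEST_NO_WAKEUP));
            return 0;
    }
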
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e0d88c38602b..ba5fadd6f3c9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -315,6 +315,8 @@ struct kvmppc_ops {
315 struct irq_bypass_producer *); 315 struct irq_bypass_producer *);
316 int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); 316 int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
317 int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); 317 int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
318 int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
319 unsigned long flags);
318}; 320};
319 321
320extern struct kvmppc_ops *kvmppc_hv_ops; 322extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 3a8d278e7421..1a9b45198c06 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -103,6 +103,8 @@
103#define OP_31_XOP_STBUX 247 103#define OP_31_XOP_STBUX 247
104#define OP_31_XOP_LHZX 279 104#define OP_31_XOP_LHZX 279
105#define OP_31_XOP_LHZUX 311 105#define OP_31_XOP_LHZUX 311
106#define OP_31_XOP_MSGSNDP 142
107#define OP_31_XOP_MSGCLRP 174
106#define OP_31_XOP_MFSPR 339 108#define OP_31_XOP_MFSPR 339
107#define OP_31_XOP_LWAX 341 109#define OP_31_XOP_LWAX 341
108#define OP_31_XOP_LHAX 343 110#define OP_31_XOP_LHAX 343
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 07fbeb927834..8cf8f0c96906 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -60,6 +60,12 @@ struct kvm_regs {
60 60
61#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ 61#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */
62 62
63/* flags for kvm_run.flags */
64#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0)
65#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0)
66#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0)
67#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0)
68
63/* 69/*
64 * Feature bits indicate which sections of the sregs struct are valid, 70 * Feature bits indicate which sections of the sregs struct are valid,
65 * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers 71 * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
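
These kvm_run.flags bits are filled in by the HV machine-check exit path further down in this diff, together with exit_reason = KVM_EXIT_NMI. A hedged sketch of the consuming side in userspace (the reporting strings are purely illustrative):

    /* sketch: userspace reading the NMI disposition after a KVM_EXIT_NMI on PPC */
    #include <linux/kvm.h>
    #include <stdio.h>

    void handle_exit_nmi(struct kvm_run *run)
    {
            if (run->exit_reason != KVM_EXIT_NMI)
                    return;

            switch (run->flags & KVM_RUN_PPC_NMI_DISP_MASK) {
            case KVM_RUN_PPC_NMI_DISP_FULLY_RECOV:
                    printf("machine check fully recovered, guest may continue\n");
                    break;
            case KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV:
                    printf("machine check recovered with limits\n");
                    break;
            default:    /* KVM_RUN_PPC_NMI_DISP_NOT_RECOV */
                    printf("unrecovered machine check, decide whether to stop the guest\n");
                    break;
            }
    }
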
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 709e23425317..ae8e89e0d083 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -485,6 +485,7 @@ int main(void)
485 OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); 485 OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
486 OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); 486 OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
487 OFFSET(KVM_RADIX, kvm, arch.radix); 487 OFFSET(KVM_RADIX, kvm, arch.radix);
488 OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
488 OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); 489 OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
489 OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); 490 OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
490 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); 491 OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
@@ -513,6 +514,7 @@ int main(void)
513 OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); 514 OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
514 OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); 515 OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
515 OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); 516 OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
517 OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
516 OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); 518 OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
517 OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); 519 OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
518 OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); 520 OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
@@ -542,6 +544,7 @@ int main(void)
542 OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); 544 OFFSET(VCPU_WORT, kvm_vcpu, arch.wort);
543 OFFSET(VCPU_TID, kvm_vcpu, arch.tid); 545 OFFSET(VCPU_TID, kvm_vcpu, arch.tid);
544 OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); 546 OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr);
547 OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr);
545 OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); 548 OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map);
546 OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); 549 OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest);
547 OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); 550 OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 5f9eada3519b..a9bfa49f3698 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -405,6 +405,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
405 break; 405 break;
406 } 406 }
407} 407}
408EXPORT_SYMBOL_GPL(machine_check_print_event_info);
408 409
409uint64_t get_mce_fault_addr(struct machine_check_event *evt) 410uint64_t get_mce_fault_addr(struct machine_check_event *evt)
410{ 411{
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 773b35d16a0b..0b436df746fc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -46,6 +46,8 @@
46#include <linux/of.h> 46#include <linux/of.h>
47 47
48#include <asm/reg.h> 48#include <asm/reg.h>
49#include <asm/ppc-opcode.h>
50#include <asm/disassemble.h>
49#include <asm/cputable.h> 51#include <asm/cputable.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
51#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
@@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
645 unsigned long stolen; 647 unsigned long stolen;
646 unsigned long core_stolen; 648 unsigned long core_stolen;
647 u64 now; 649 u64 now;
650 unsigned long flags;
648 651
649 dt = vcpu->arch.dtl_ptr; 652 dt = vcpu->arch.dtl_ptr;
650 vpa = vcpu->arch.vpa.pinned_addr; 653 vpa = vcpu->arch.vpa.pinned_addr;
@@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
652 core_stolen = vcore_stolen_time(vc, now); 655 core_stolen = vcore_stolen_time(vc, now);
653 stolen = core_stolen - vcpu->arch.stolen_logged; 656 stolen = core_stolen - vcpu->arch.stolen_logged;
654 vcpu->arch.stolen_logged = core_stolen; 657 vcpu->arch.stolen_logged = core_stolen;
655 spin_lock_irq(&vcpu->arch.tbacct_lock); 658 spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
656 stolen += vcpu->arch.busy_stolen; 659 stolen += vcpu->arch.busy_stolen;
657 vcpu->arch.busy_stolen = 0; 660 vcpu->arch.busy_stolen = 0;
658 spin_unlock_irq(&vcpu->arch.tbacct_lock); 661 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
659 if (!dt || !vpa) 662 if (!dt || !vpa)
660 return; 663 return;
661 memset(dt, 0, sizeof(struct dtl_entry)); 664 memset(dt, 0, sizeof(struct dtl_entry));
@@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
675 vcpu->arch.dtl.dirty = true; 678 vcpu->arch.dtl.dirty = true;
676} 679}
677 680
681/* See if there is a doorbell interrupt pending for a vcpu */
682static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
683{
684 int thr;
685 struct kvmppc_vcore *vc;
686
687 if (vcpu->arch.doorbell_request)
688 return true;
689 /*
690 * Ensure that the read of vcore->dpdes comes after the read
691 * of vcpu->doorbell_request. This barrier matches the
692 * lwsync in book3s_hv_rmhandlers.S just before the
693 * fast_guest_return label.
694 */
695 smp_rmb();
696 vc = vcpu->arch.vcore;
697 thr = vcpu->vcpu_id - vc->first_vcpuid;
698 return !!(vc->dpdes & (1 << thr));
699}
700
678static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) 701static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
679{ 702{
680 if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) 703 if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
@@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run,
926 } 949 }
927} 950}
928 951
952static void do_nothing(void *x)
953{
954}
955
956static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
957{
958 int thr, cpu, pcpu, nthreads;
959 struct kvm_vcpu *v;
960 unsigned long dpdes;
961
962 nthreads = vcpu->kvm->arch.emul_smt_mode;
963 dpdes = 0;
964 cpu = vcpu->vcpu_id & ~(nthreads - 1);
965 for (thr = 0; thr < nthreads; ++thr, ++cpu) {
966 v = kvmppc_find_vcpu(vcpu->kvm, cpu);
967 if (!v)
968 continue;
969 /*
970 * If the vcpu is currently running on a physical cpu thread,
971 * interrupt it in order to pull it out of the guest briefly,
972 * which will update its vcore->dpdes value.
973 */
974 pcpu = READ_ONCE(v->cpu);
975 if (pcpu >= 0)
976 smp_call_function_single(pcpu, do_nothing, NULL, 1);
977 if (kvmppc_doorbell_pending(v))
978 dpdes |= 1 << thr;
979 }
980 return dpdes;
981}
982
983/*
984 * On POWER9, emulate doorbell-related instructions in order to
985 * give the guest the illusion of running on a multi-threaded core.
986 * The instructions emulated are msgsndp, msgclrp, mfspr TIR,
987 * and mfspr DPDES.
988 */
989static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
990{
991 u32 inst, rb, thr;
992 unsigned long arg;
993 struct kvm *kvm = vcpu->kvm;
994 struct kvm_vcpu *tvcpu;
995
996 if (!cpu_has_feature(CPU_FTR_ARCH_300))
997 return EMULATE_FAIL;
998 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
999 return RESUME_GUEST;
1000 if (get_op(inst) != 31)
1001 return EMULATE_FAIL;
1002 rb = get_rb(inst);
1003 thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
1004 switch (get_xop(inst)) {
1005 case OP_31_XOP_MSGSNDP:
1006 arg = kvmppc_get_gpr(vcpu, rb);
1007 if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
1008 break;
1009 arg &= 0x3f;
1010 if (arg >= kvm->arch.emul_smt_mode)
1011 break;
1012 tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
1013 if (!tvcpu)
1014 break;
1015 if (!tvcpu->arch.doorbell_request) {
1016 tvcpu->arch.doorbell_request = 1;
1017 kvmppc_fast_vcpu_kick_hv(tvcpu);
1018 }
1019 break;
1020 case OP_31_XOP_MSGCLRP:
1021 arg = kvmppc_get_gpr(vcpu, rb);
1022 if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
1023 break;
1024 vcpu->arch.vcore->dpdes = 0;
1025 vcpu->arch.doorbell_request = 0;
1026 break;
1027 case OP_31_XOP_MFSPR:
1028 switch (get_sprn(inst)) {
1029 case SPRN_TIR:
1030 arg = thr;
1031 break;
1032 case SPRN_DPDES:
1033 arg = kvmppc_read_dpdes(vcpu);
1034 break;
1035 default:
1036 return EMULATE_FAIL;
1037 }
1038 kvmppc_set_gpr(vcpu, get_rt(inst), arg);
1039 break;
1040 default:
1041 return EMULATE_FAIL;
1042 }
1043 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
1044 return RESUME_GUEST;
1045}
1046
929static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, 1047static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
930 struct task_struct *tsk) 1048 struct task_struct *tsk)
931{ 1049{
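
The msgsndp emulation above works entirely in terms of the emulated SMT geometry: the sender's thread number is the low bits of its vcpu_id, and the msgsndp operand names the destination thread within that faked core. A tiny worked example of the same arithmetic (illustration only, using the formulas from the code above):

    /* sketch: mapping a msgsndp thread operand to the target vcpu_id */
    #include <stdio.h>

    int main(void)
    {
            unsigned int emul_smt_mode = 4;   /* guest was told threads=4           */
            unsigned int vcpu_id       = 6;   /* sending vcpu                       */
            unsigned int arg           = 1;   /* low 6 bits of the msgsndp RB value */

            unsigned int thr    = vcpu_id & (emul_smt_mode - 1);  /* sender is thread 2 */
            unsigned int target = vcpu_id - thr + arg;            /* 6 - 2 + 1 = 5      */

            printf("vcpu %u (thread %u of its emulated core) raises a doorbell on vcpu %u\n",
                   vcpu_id, thr, target);
            /* kvmppc_emulate_doorbell_instr() then sets the target's doorbell_request
             * and kicks it, so kvmppc_doorbell_pending() reports it on next entry */
            return 0;
    }
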
@@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
971 r = RESUME_GUEST; 1089 r = RESUME_GUEST;
972 break; 1090 break;
973 case BOOK3S_INTERRUPT_MACHINE_CHECK: 1091 case BOOK3S_INTERRUPT_MACHINE_CHECK:
974 /* 1092 /* Exit to guest with KVM_EXIT_NMI as exit reason */
975 * Deliver a machine check interrupt to the guest. 1093 run->exit_reason = KVM_EXIT_NMI;
976 * We have to do this, even if the host has handled the 1094 run->hw.hardware_exit_reason = vcpu->arch.trap;
977 * machine check, because machine checks use SRR0/1 and 1095 /* Clear out the old NMI status from run->flags */
978 * the interrupt might have trashed guest state in them. 1096 run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
979 */ 1097 /* Now set the NMI status */
980 kvmppc_book3s_queue_irqprio(vcpu, 1098 if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
981 BOOK3S_INTERRUPT_MACHINE_CHECK); 1099 run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
982 r = RESUME_GUEST; 1100 else
1101 run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
1102
1103 r = RESUME_HOST;
1104 /* Print the MCE event to host console. */
1105 machine_check_print_event_info(&vcpu->arch.mce_evt, false);
983 break; 1106 break;
984 case BOOK3S_INTERRUPT_PROGRAM: 1107 case BOOK3S_INTERRUPT_PROGRAM:
985 { 1108 {
@@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1048 break; 1171 break;
1049 /* 1172 /*
1050 * This occurs if the guest (kernel or userspace), does something that 1173 * This occurs if the guest (kernel or userspace), does something that
1051 * is prohibited by HFSCR. We just generate a program interrupt to 1174 * is prohibited by HFSCR.
1052 * the guest. 1175 * On POWER9, this could be a doorbell instruction that we need
1176 * to emulate.
1177 * Otherwise, we just generate a program interrupt to the guest.
1053 */ 1178 */
1054 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: 1179 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
1055 kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 1180 r = EMULATE_FAIL;
1056 r = RESUME_GUEST; 1181 if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
1182 r = kvmppc_emulate_doorbell_instr(vcpu);
1183 if (r == EMULATE_FAIL) {
1184 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
1185 r = RESUME_GUEST;
1186 }
1057 break; 1187 break;
1058 case BOOK3S_INTERRUPT_HV_RM_HARD: 1188 case BOOK3S_INTERRUPT_HV_RM_HARD:
1059 r = RESUME_PASSTHROUGH; 1189 r = RESUME_PASSTHROUGH;
@@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
1143 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; 1273 mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
1144 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 1274 if (cpu_has_feature(CPU_FTR_ARCH_207S))
1145 mask |= LPCR_AIL; 1275 mask |= LPCR_AIL;
1276 /*
1277 * On POWER9, allow userspace to enable large decrementer for the
1278 * guest, whether or not the host has it enabled.
1279 */
1280 if (cpu_has_feature(CPU_FTR_ARCH_300))
1281 mask |= LPCR_LD;
1146 1282
1147 /* Broken 32-bit version of LPCR must not clear top bits */ 1283 /* Broken 32-bit version of LPCR must not clear top bits */
1148 if (preserve_top32) 1284 if (preserve_top32)
@@ -1611,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1611 init_swait_queue_head(&vcore->wq); 1747 init_swait_queue_head(&vcore->wq);
1612 vcore->preempt_tb = TB_NIL; 1748 vcore->preempt_tb = TB_NIL;
1613 vcore->lpcr = kvm->arch.lpcr; 1749 vcore->lpcr = kvm->arch.lpcr;
1614 vcore->first_vcpuid = core * threads_per_vcore(); 1750 vcore->first_vcpuid = core * kvm->arch.smt_mode;
1615 vcore->kvm = kvm; 1751 vcore->kvm = kvm;
1616 INIT_LIST_HEAD(&vcore->preempt_list); 1752 INIT_LIST_HEAD(&vcore->preempt_list);
1617 1753
@@ -1770,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1770 unsigned int id) 1906 unsigned int id)
1771{ 1907{
1772 struct kvm_vcpu *vcpu; 1908 struct kvm_vcpu *vcpu;
1773 int err = -EINVAL; 1909 int err;
1774 int core; 1910 int core;
1775 struct kvmppc_vcore *vcore; 1911 struct kvmppc_vcore *vcore;
1776 1912
1777 core = id / threads_per_vcore();
1778 if (core >= KVM_MAX_VCORES)
1779 goto out;
1780
1781 err = -ENOMEM; 1913 err = -ENOMEM;
1782 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 1914 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1783 if (!vcpu) 1915 if (!vcpu)
@@ -1808,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1808 vcpu->arch.busy_preempt = TB_NIL; 1940 vcpu->arch.busy_preempt = TB_NIL;
1809 vcpu->arch.intr_msr = MSR_SF | MSR_ME; 1941 vcpu->arch.intr_msr = MSR_SF | MSR_ME;
1810 1942
1943 /*
1944 * Set the default HFSCR for the guest from the host value.
1945 * This value is only used on POWER9.
1946 * On POWER9 DD1, TM doesn't work, so we make sure to
1947 * prevent the guest from using it.
1948 * On POWER9, we want to virtualize the doorbell facility, so we
1949 * turn off the HFSCR bit, which causes those instructions to trap.
1950 */
1951 vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
1952 if (!cpu_has_feature(CPU_FTR_TM))
1953 vcpu->arch.hfscr &= ~HFSCR_TM;
1954 if (cpu_has_feature(CPU_FTR_ARCH_300))
1955 vcpu->arch.hfscr &= ~HFSCR_MSGP;
1956
1811 kvmppc_mmu_book3s_hv_init(vcpu); 1957 kvmppc_mmu_book3s_hv_init(vcpu);
1812 1958
1813 vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 1959 vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -1815,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1815 init_waitqueue_head(&vcpu->arch.cpu_run); 1961 init_waitqueue_head(&vcpu->arch.cpu_run);
1816 1962
1817 mutex_lock(&kvm->lock); 1963 mutex_lock(&kvm->lock);
1818 vcore = kvm->arch.vcores[core]; 1964 vcore = NULL;
1819 if (!vcore) { 1965 err = -EINVAL;
1820 vcore = kvmppc_vcore_create(kvm, core); 1966 core = id / kvm->arch.smt_mode;
1821 kvm->arch.vcores[core] = vcore; 1967 if (core < KVM_MAX_VCORES) {
1822 kvm->arch.online_vcores++; 1968 vcore = kvm->arch.vcores[core];
1969 if (!vcore) {
1970 err = -ENOMEM;
1971 vcore = kvmppc_vcore_create(kvm, core);
1972 kvm->arch.vcores[core] = vcore;
1973 kvm->arch.online_vcores++;
1974 }
1823 } 1975 }
1824 mutex_unlock(&kvm->lock); 1976 mutex_unlock(&kvm->lock);
1825 1977
@@ -1847,6 +1999,43 @@ out:
1847 return ERR_PTR(err); 1999 return ERR_PTR(err);
1848} 2000}
1849 2001
2002static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
2003 unsigned long flags)
2004{
2005 int err;
2006 int esmt = 0;
2007
2008 if (flags)
2009 return -EINVAL;
2010 if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
2011 return -EINVAL;
2012 if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
2013 /*
2014 * On POWER8 (or POWER7), the threading mode is "strict",
2015 * so we pack smt_mode vcpus per vcore.
2016 */
2017 if (smt_mode > threads_per_subcore)
2018 return -EINVAL;
2019 } else {
2020 /*
2021 * On POWER9, the threading mode is "loose",
2022 * so each vcpu gets its own vcore.
2023 */
2024 esmt = smt_mode;
2025 smt_mode = 1;
2026 }
2027 mutex_lock(&kvm->lock);
2028 err = -EBUSY;
2029 if (!kvm->arch.online_vcores) {
2030 kvm->arch.smt_mode = smt_mode;
2031 kvm->arch.emul_smt_mode = esmt;
2032 err = 0;
2033 }
2034 mutex_unlock(&kvm->lock);
2035
2036 return err;
2037}
2038
1850static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) 2039static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
1851{ 2040{
1852 if (vpa->pinned_addr) 2041 if (vpa->pinned_addr)
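
With kvmhv_set_smt_mode() in place, vcore membership is derived from the per-VM smt_mode rather than the host's threads_per_vcore(): core index = vcpu_id / smt_mode, and first_vcpuid = core * smt_mode as the vcpu-create hunk above shows. A small illustration of how POWER8 "strict" and POWER9 "loose" threading differ under that formula:

    /* sketch: vcpu_id -> vcore mapping for strict (P8) vs loose (P9) threading */
    #include <stdio.h>

    static void show(const char *label, unsigned int smt_mode)
    {
            unsigned int id;

            printf("%s (smt_mode=%u):\n", label, smt_mode);
            for (id = 0; id < 8; id++)
                    printf("  vcpu %u -> vcore %u (first_vcpuid %u)\n",
                           id, id / smt_mode, (id / smt_mode) * smt_mode);
    }

    int main(void)
    {
            show("POWER8 strict, threads=4", 4);   /* four vcpus share one real vcore        */
            show("POWER9 loose,  threads=4", 1);   /* each vcpu its own vcore, SMT emulated  */
            return 0;
    }
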
@@ -1897,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
1897 } 2086 }
1898} 2087}
1899 2088
1900extern void __kvmppc_vcore_entry(void); 2089extern int __kvmppc_vcore_entry(void);
1901 2090
1902static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 2091static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
1903 struct kvm_vcpu *vcpu) 2092 struct kvm_vcpu *vcpu)
@@ -1962,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu)
1962 tpaca->kvm_hstate.kvm_split_mode = NULL; 2151 tpaca->kvm_hstate.kvm_split_mode = NULL;
1963} 2152}
1964 2153
1965static void do_nothing(void *x)
1966{
1967}
1968
1969static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) 2154static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
1970{ 2155{
1971 int i; 2156 int i;
@@ -1983,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
1983 smp_call_function_single(cpu + i, do_nothing, NULL, 1); 2168 smp_call_function_single(cpu + i, do_nothing, NULL, 1);
1984} 2169}
1985 2170
2171static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
2172{
2173 struct kvm *kvm = vcpu->kvm;
2174
2175 /*
2176 * With radix, the guest can do TLB invalidations itself,
2177 * and it could choose to use the local form (tlbiel) if
2178 * it is invalidating a translation that has only ever been
2179 * used on one vcpu. However, that doesn't mean it has
2180 * only ever been used on one physical cpu, since vcpus
2181 * can move around between pcpus. To cope with this, when
2182 * a vcpu moves from one pcpu to another, we need to tell
2183 * any vcpus running on the same core as this vcpu previously
2184 * ran to flush the TLB. The TLB is shared between threads,
2185 * so we use a single bit in .need_tlb_flush for all 4 threads.
2186 */
2187 if (vcpu->arch.prev_cpu != pcpu) {
2188 if (vcpu->arch.prev_cpu >= 0 &&
2189 cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2190 cpu_first_thread_sibling(pcpu))
2191 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2192 vcpu->arch.prev_cpu = pcpu;
2193 }
2194}
2195
1986static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) 2196static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1987{ 2197{
1988 int cpu; 2198 int cpu;
1989 struct paca_struct *tpaca; 2199 struct paca_struct *tpaca;
1990 struct kvmppc_vcore *mvc = vc->master_vcore;
1991 struct kvm *kvm = vc->kvm; 2200 struct kvm *kvm = vc->kvm;
1992 2201
1993 cpu = vc->pcpu; 2202 cpu = vc->pcpu;
@@ -1997,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1997 vcpu->arch.timer_running = 0; 2206 vcpu->arch.timer_running = 0;
1998 } 2207 }
1999 cpu += vcpu->arch.ptid; 2208 cpu += vcpu->arch.ptid;
2000 vcpu->cpu = mvc->pcpu; 2209 vcpu->cpu = vc->pcpu;
2001 vcpu->arch.thread_cpu = cpu; 2210 vcpu->arch.thread_cpu = cpu;
2002
2003 /*
2004 * With radix, the guest can do TLB invalidations itself,
2005 * and it could choose to use the local form (tlbiel) if
2006 * it is invalidating a translation that has only ever been
2007 * used on one vcpu. However, that doesn't mean it has
2008 * only ever been used on one physical cpu, since vcpus
2009 * can move around between pcpus. To cope with this, when
2010 * a vcpu moves from one pcpu to another, we need to tell
2011 * any vcpus running on the same core as this vcpu previously
2012 * ran to flush the TLB. The TLB is shared between threads,
2013 * so we use a single bit in .need_tlb_flush for all 4 threads.
2014 */
2015 if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
2016 if (vcpu->arch.prev_cpu >= 0 &&
2017 cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2018 cpu_first_thread_sibling(cpu))
2019 radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2020 vcpu->arch.prev_cpu = cpu;
2021 }
2022 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); 2211 cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
2023 } 2212 }
2024 tpaca = &paca[cpu]; 2213 tpaca = &paca[cpu];
2025 tpaca->kvm_hstate.kvm_vcpu = vcpu; 2214 tpaca->kvm_hstate.kvm_vcpu = vcpu;
2026 tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; 2215 tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
2027 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ 2216 /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
2028 smp_wmb(); 2217 smp_wmb();
2029 tpaca->kvm_hstate.kvm_vcore = mvc; 2218 tpaca->kvm_hstate.kvm_vcore = vc;
2030 if (cpu != smp_processor_id()) 2219 if (cpu != smp_processor_id())
2031 kvmppc_ipi_thread(cpu); 2220 kvmppc_ipi_thread(cpu);
2032} 2221}
@@ -2155,8 +2344,7 @@ struct core_info {
2155 int max_subcore_threads; 2344 int max_subcore_threads;
2156 int total_threads; 2345 int total_threads;
2157 int subcore_threads[MAX_SUBCORES]; 2346 int subcore_threads[MAX_SUBCORES];
2158 struct kvm *subcore_vm[MAX_SUBCORES]; 2347 struct kvmppc_vcore *vc[MAX_SUBCORES];
2159 struct list_head vcs[MAX_SUBCORES];
2160}; 2348};
2161 2349
2162/* 2350/*
@@ -2167,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
2167 2355
2168static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) 2356static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
2169{ 2357{
2170 int sub;
2171
2172 memset(cip, 0, sizeof(*cip)); 2358 memset(cip, 0, sizeof(*cip));
2173 cip->n_subcores = 1; 2359 cip->n_subcores = 1;
2174 cip->max_subcore_threads = vc->num_threads; 2360 cip->max_subcore_threads = vc->num_threads;
2175 cip->total_threads = vc->num_threads; 2361 cip->total_threads = vc->num_threads;
2176 cip->subcore_threads[0] = vc->num_threads; 2362 cip->subcore_threads[0] = vc->num_threads;
2177 cip->subcore_vm[0] = vc->kvm; 2363 cip->vc[0] = vc;
2178 for (sub = 0; sub < MAX_SUBCORES; ++sub)
2179 INIT_LIST_HEAD(&cip->vcs[sub]);
2180 list_add_tail(&vc->preempt_list, &cip->vcs[0]);
2181} 2364}
2182 2365
2183static bool subcore_config_ok(int n_subcores, int n_threads) 2366static bool subcore_config_ok(int n_subcores, int n_threads)
@@ -2197,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads)
2197 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; 2380 return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
2198} 2381}
2199 2382
2200static void init_master_vcore(struct kvmppc_vcore *vc) 2383static void init_vcore_to_run(struct kvmppc_vcore *vc)
2201{ 2384{
2202 vc->master_vcore = vc;
2203 vc->entry_exit_map = 0; 2385 vc->entry_exit_map = 0;
2204 vc->in_guest = 0; 2386 vc->in_guest = 0;
2205 vc->napping_threads = 0; 2387 vc->napping_threads = 0;
@@ -2224,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2224 ++cip->n_subcores; 2406 ++cip->n_subcores;
2225 cip->total_threads += vc->num_threads; 2407 cip->total_threads += vc->num_threads;
2226 cip->subcore_threads[sub] = vc->num_threads; 2408 cip->subcore_threads[sub] = vc->num_threads;
2227 cip->subcore_vm[sub] = vc->kvm; 2409 cip->vc[sub] = vc;
2228 init_master_vcore(vc); 2410 init_vcore_to_run(vc);
2229 list_move_tail(&vc->preempt_list, &cip->vcs[sub]); 2411 list_del_init(&vc->preempt_list);
2230 2412
2231 return true; 2413 return true;
2232} 2414}
@@ -2294,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
2294 spin_unlock(&lp->lock); 2476 spin_unlock(&lp->lock);
2295} 2477}
2296 2478
2479static bool recheck_signals(struct core_info *cip)
2480{
2481 int sub, i;
2482 struct kvm_vcpu *vcpu;
2483
2484 for (sub = 0; sub < cip->n_subcores; ++sub)
2485 for_each_runnable_thread(i, vcpu, cip->vc[sub])
2486 if (signal_pending(vcpu->arch.run_task))
2487 return true;
2488 return false;
2489}
2490
2297static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) 2491static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2298{ 2492{
2299 int still_running = 0, i; 2493 int still_running = 0, i;
@@ -2331,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2331 wake_up(&vcpu->arch.cpu_run); 2525 wake_up(&vcpu->arch.cpu_run);
2332 } 2526 }
2333 } 2527 }
2334 list_del_init(&vc->preempt_list);
2335 if (!is_master) { 2528 if (!is_master) {
2336 if (still_running > 0) { 2529 if (still_running > 0) {
2337 kvmppc_vcore_preempt(vc); 2530 kvmppc_vcore_preempt(vc);
@@ -2393,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu)
2393 return 0; 2586 return 0;
2394} 2587}
2395 2588
2589static void set_irq_happened(int trap)
2590{
2591 switch (trap) {
2592 case BOOK3S_INTERRUPT_EXTERNAL:
2593 local_paca->irq_happened |= PACA_IRQ_EE;
2594 break;
2595 case BOOK3S_INTERRUPT_H_DOORBELL:
2596 local_paca->irq_happened |= PACA_IRQ_DBELL;
2597 break;
2598 case BOOK3S_INTERRUPT_HMI:
2599 local_paca->irq_happened |= PACA_IRQ_HMI;
2600 break;
2601 }
2602}
2603
2396/* 2604/*
2397 * Run a set of guest threads on a physical core. 2605 * Run a set of guest threads on a physical core.
2398 * Called with vc->lock held. 2606 * Called with vc->lock held.
@@ -2403,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2403 int i; 2611 int i;
2404 int srcu_idx; 2612 int srcu_idx;
2405 struct core_info core_info; 2613 struct core_info core_info;
2406 struct kvmppc_vcore *pvc, *vcnext; 2614 struct kvmppc_vcore *pvc;
2407 struct kvm_split_mode split_info, *sip; 2615 struct kvm_split_mode split_info, *sip;
2408 int split, subcore_size, active; 2616 int split, subcore_size, active;
2409 int sub; 2617 int sub;
@@ -2412,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2412 int pcpu, thr; 2620 int pcpu, thr;
2413 int target_threads; 2621 int target_threads;
2414 int controlled_threads; 2622 int controlled_threads;
2623 int trap;
2415 2624
2416 /* 2625 /*
2417 * Remove from the list any threads that have a signal pending 2626 * Remove from the list any threads that have a signal pending
@@ -2426,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2426 /* 2635 /*
2427 * Initialize *vc. 2636 * Initialize *vc.
2428 */ 2637 */
2429 init_master_vcore(vc); 2638 init_vcore_to_run(vc);
2430 vc->preempt_tb = TB_NIL; 2639 vc->preempt_tb = TB_NIL;
2431 2640
2432 /* 2641 /*
@@ -2463,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2463 if (vc->num_threads < target_threads) 2672 if (vc->num_threads < target_threads)
2464 collect_piggybacks(&core_info, target_threads); 2673 collect_piggybacks(&core_info, target_threads);
2465 2674
2675 /*
2676 * On radix, arrange for TLB flushing if necessary.
2677 * This has to be done before disabling interrupts since
2678 * it uses smp_call_function().
2679 */
2680 pcpu = smp_processor_id();
2681 if (kvm_is_radix(vc->kvm)) {
2682 for (sub = 0; sub < core_info.n_subcores; ++sub)
2683 for_each_runnable_thread(i, vcpu, core_info.vc[sub])
2684 kvmppc_prepare_radix_vcpu(vcpu, pcpu);
2685 }
2686
2687 /*
2688 * Hard-disable interrupts, and check resched flag and signals.
2689 * If we need to reschedule or deliver a signal, clean up
2690 * and return without going into the guest(s).
2691 */
2692 local_irq_disable();
2693 hard_irq_disable();
2694 if (lazy_irq_pending() || need_resched() ||
2695 recheck_signals(&core_info)) {
2696 local_irq_enable();
2697 vc->vcore_state = VCORE_INACTIVE;
2698 /* Unlock all except the primary vcore */
2699 for (sub = 1; sub < core_info.n_subcores; ++sub) {
2700 pvc = core_info.vc[sub];
2701 /* Put back on to the preempted vcores list */
2702 kvmppc_vcore_preempt(pvc);
2703 spin_unlock(&pvc->lock);
2704 }
2705 for (i = 0; i < controlled_threads; ++i)
2706 kvmppc_release_hwthread(pcpu + i);
2707 return;
2708 }
2709
2710 kvmppc_clear_host_core(pcpu);
2711
2466 /* Decide on micro-threading (split-core) mode */ 2712 /* Decide on micro-threading (split-core) mode */
2467 subcore_size = threads_per_subcore; 2713 subcore_size = threads_per_subcore;
2468 cmd_bit = stat_bit = 0; 2714 cmd_bit = stat_bit = 0;
@@ -2486,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2486 split_info.ldbar = mfspr(SPRN_LDBAR); 2732 split_info.ldbar = mfspr(SPRN_LDBAR);
2487 split_info.subcore_size = subcore_size; 2733 split_info.subcore_size = subcore_size;
2488 for (sub = 0; sub < core_info.n_subcores; ++sub) 2734 for (sub = 0; sub < core_info.n_subcores; ++sub)
2489 split_info.master_vcs[sub] = 2735 split_info.vc[sub] = core_info.vc[sub];
2490 list_first_entry(&core_info.vcs[sub],
2491 struct kvmppc_vcore, preempt_list);
2492 /* order writes to split_info before kvm_split_mode pointer */ 2736 /* order writes to split_info before kvm_split_mode pointer */
2493 smp_wmb(); 2737 smp_wmb();
2494 } 2738 }
2495 pcpu = smp_processor_id();
2496 for (thr = 0; thr < controlled_threads; ++thr) 2739 for (thr = 0; thr < controlled_threads; ++thr)
2497 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; 2740 paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
2498 2741
@@ -2512,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2512 } 2755 }
2513 } 2756 }
2514 2757
2515 kvmppc_clear_host_core(pcpu);
2516
2517 /* Start all the threads */ 2758 /* Start all the threads */
2518 active = 0; 2759 active = 0;
2519 for (sub = 0; sub < core_info.n_subcores; ++sub) { 2760 for (sub = 0; sub < core_info.n_subcores; ++sub) {
2520 thr = subcore_thread_map[sub]; 2761 thr = subcore_thread_map[sub];
2521 thr0_done = false; 2762 thr0_done = false;
2522 active |= 1 << thr; 2763 active |= 1 << thr;
2523 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { 2764 pvc = core_info.vc[sub];
2524 pvc->pcpu = pcpu + thr; 2765 pvc->pcpu = pcpu + thr;
2525 for_each_runnable_thread(i, vcpu, pvc) { 2766 for_each_runnable_thread(i, vcpu, pvc) {
2526 kvmppc_start_thread(vcpu, pvc); 2767 kvmppc_start_thread(vcpu, pvc);
2527 kvmppc_create_dtl_entry(vcpu, pvc); 2768 kvmppc_create_dtl_entry(vcpu, pvc);
2528 trace_kvm_guest_enter(vcpu); 2769 trace_kvm_guest_enter(vcpu);
2529 if (!vcpu->arch.ptid) 2770 if (!vcpu->arch.ptid)
2530 thr0_done = true; 2771 thr0_done = true;
2531 active |= 1 << (thr + vcpu->arch.ptid); 2772 active |= 1 << (thr + vcpu->arch.ptid);
2532 }
2533 /*
2534 * We need to start the first thread of each subcore
2535 * even if it doesn't have a vcpu.
2536 */
2537 if (pvc->master_vcore == pvc && !thr0_done)
2538 kvmppc_start_thread(NULL, pvc);
2539 thr += pvc->num_threads;
2540 } 2773 }
2774 /*
2775 * We need to start the first thread of each subcore
2776 * even if it doesn't have a vcpu.
2777 */
2778 if (!thr0_done)
2779 kvmppc_start_thread(NULL, pvc);
2780 thr += pvc->num_threads;
2541 } 2781 }
2542 2782
2543 /* 2783 /*
@@ -2564,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2564 trace_kvmppc_run_core(vc, 0); 2804 trace_kvmppc_run_core(vc, 0);
2565 2805
2566 for (sub = 0; sub < core_info.n_subcores; ++sub) 2806 for (sub = 0; sub < core_info.n_subcores; ++sub)
2567 list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) 2807 spin_unlock(&core_info.vc[sub]->lock);
2568 spin_unlock(&pvc->lock); 2808
2809 /*
2810 * Interrupts will be enabled once we get into the guest,
2811 * so tell lockdep that we're about to enable interrupts.
2812 */
2813 trace_hardirqs_on();
2569 2814
2570 guest_enter(); 2815 guest_enter();
2571 2816
2572 srcu_idx = srcu_read_lock(&vc->kvm->srcu); 2817 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
2573 2818
2574 __kvmppc_vcore_entry(); 2819 trap = __kvmppc_vcore_entry();
2575 2820
2576 srcu_read_unlock(&vc->kvm->srcu, srcu_idx); 2821 srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
2577 2822
2823 guest_exit();
2824
2825 trace_hardirqs_off();
2826 set_irq_happened(trap);
2827
2578 spin_lock(&vc->lock); 2828 spin_lock(&vc->lock);
2579 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 2829 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
2580 vc->vcore_state = VCORE_EXITING; 2830 vc->vcore_state = VCORE_EXITING;
@@ -2602,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2602 split_info.do_nap = 0; 2852 split_info.do_nap = 0;
2603 } 2853 }
2604 2854
2855 kvmppc_set_host_core(pcpu);
2856
2857 local_irq_enable();
2858
2605 /* Let secondaries go back to the offline loop */ 2859 /* Let secondaries go back to the offline loop */
2606 for (i = 0; i < controlled_threads; ++i) { 2860 for (i = 0; i < controlled_threads; ++i) {
2607 kvmppc_release_hwthread(pcpu + i); 2861 kvmppc_release_hwthread(pcpu + i);
@@ -2610,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2610 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); 2864 cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
2611 } 2865 }
2612 2866
2613 kvmppc_set_host_core(pcpu);
2614
2615 spin_unlock(&vc->lock); 2867 spin_unlock(&vc->lock);
2616 2868
2617 /* make sure updates to secondary vcpu structs are visible now */ 2869 /* make sure updates to secondary vcpu structs are visible now */
2618 smp_mb(); 2870 smp_mb();
2619 guest_exit();
2620 2871
2621 for (sub = 0; sub < core_info.n_subcores; ++sub) 2872 for (sub = 0; sub < core_info.n_subcores; ++sub) {
2622 list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], 2873 pvc = core_info.vc[sub];
2623 preempt_list) 2874 post_guest_process(pvc, pvc == vc);
2624 post_guest_process(pvc, pvc == vc); 2875 }
2625 2876
2626 spin_lock(&vc->lock); 2877 spin_lock(&vc->lock);
2627 preempt_enable(); 2878 preempt_enable();
@@ -2666,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
2666 vc->halt_poll_ns /= halt_poll_ns_shrink; 2917 vc->halt_poll_ns /= halt_poll_ns_shrink;
2667} 2918}
2668 2919
2920#ifdef CONFIG_KVM_XICS
2921static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
2922{
2923 if (!xive_enabled())
2924 return false;
2925 return vcpu->arch.xive_saved_state.pipr <
2926 vcpu->arch.xive_saved_state.cppr;
2927}
2928#else
2929static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
2930{
2931 return false;
2932}
2933#endif /* CONFIG_KVM_XICS */
2934
2935static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
2936{
2937 if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
2938 kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
2939 return true;
2940
2941 return false;
2942}
2943
2669/* 2944/*
2670 * Check to see if any of the runnable vcpus on the vcore have pending 2945 * Check to see if any of the runnable vcpus on the vcore have pending
2671 * exceptions or are no longer ceded 2946 * exceptions or are no longer ceded
@@ -2676,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
2676 int i; 2951 int i;
2677 2952
2678 for_each_runnable_thread(i, vcpu, vc) { 2953 for_each_runnable_thread(i, vcpu, vc) {
2679 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || 2954 if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
2680 vcpu->arch.prodded)
2681 return 1; 2955 return 1;
2682 } 2956 }
2683 2957
@@ -2819,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2819 */ 3093 */
2820 if (!signal_pending(current)) { 3094 if (!signal_pending(current)) {
2821 if (vc->vcore_state == VCORE_PIGGYBACK) { 3095 if (vc->vcore_state == VCORE_PIGGYBACK) {
2822 struct kvmppc_vcore *mvc = vc->master_vcore; 3096 if (spin_trylock(&vc->lock)) {
2823 if (spin_trylock(&mvc->lock)) { 3097 if (vc->vcore_state == VCORE_RUNNING &&
2824 if (mvc->vcore_state == VCORE_RUNNING && 3098 !VCORE_IS_EXITING(vc)) {
2825 !VCORE_IS_EXITING(mvc)) {
2826 kvmppc_create_dtl_entry(vcpu, vc); 3099 kvmppc_create_dtl_entry(vcpu, vc);
2827 kvmppc_start_thread(vcpu, vc); 3100 kvmppc_start_thread(vcpu, vc);
2828 trace_kvm_guest_enter(vcpu); 3101 trace_kvm_guest_enter(vcpu);
2829 } 3102 }
2830 spin_unlock(&mvc->lock); 3103 spin_unlock(&vc->lock);
2831 } 3104 }
2832 } else if (vc->vcore_state == VCORE_RUNNING && 3105 } else if (vc->vcore_state == VCORE_RUNNING &&
2833 !VCORE_IS_EXITING(vc)) { 3106 !VCORE_IS_EXITING(vc)) {
@@ -2863,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2863 break; 3136 break;
2864 n_ceded = 0; 3137 n_ceded = 0;
2865 for_each_runnable_thread(i, v, vc) { 3138 for_each_runnable_thread(i, v, vc) {
2866 if (!v->arch.pending_exceptions && !v->arch.prodded) 3139 if (!kvmppc_vcpu_woken(v))
2867 n_ceded += v->arch.ceded; 3140 n_ceded += v->arch.ceded;
2868 else 3141 else
2869 v->arch.ceded = 0; 3142 v->arch.ceded = 0;
@@ -3519,6 +3792,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3519 kvm_hv_vm_activated(); 3792 kvm_hv_vm_activated();
3520 3793
3521 /* 3794 /*
3795 * Initialize smt_mode depending on processor.
3796 * POWER8 and earlier have to use "strict" threading, where
3797 * all vCPUs in a vcore have to run on the same (sub)core,
3798 * whereas on POWER9 the threads can each run a different
3799 * guest.
3800 */
3801 if (!cpu_has_feature(CPU_FTR_ARCH_300))
3802 kvm->arch.smt_mode = threads_per_subcore;
3803 else
3804 kvm->arch.smt_mode = 1;
3805 kvm->arch.emul_smt_mode = 1;
3806
3807 /*
3522 * Create a debugfs directory for the VM 3808 * Create a debugfs directory for the VM
3523 */ 3809 */
3524 snprintf(buf, sizeof(buf), "vm%d", current->pid); 3810 snprintf(buf, sizeof(buf), "vm%d", current->pid);
@@ -3947,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = {
3947#endif 4233#endif
3948 .configure_mmu = kvmhv_configure_mmu, 4234 .configure_mmu = kvmhv_configure_mmu,
3949 .get_rmmu_info = kvmhv_get_rmmu_info, 4235 .get_rmmu_info = kvmhv_get_rmmu_info,
4236 .set_smt_mode = kvmhv_set_smt_mode,
3950}; 4237};
3951 4238
3952static int kvm_init_subcore_bitmap(void) 4239static int kvm_init_subcore_bitmap(void)
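The xive_interrupt_pending() helper added above encodes the XIVE priority convention: numerically lower values are more favoured, so an interrupt is deliverable when the saved pending priority (PIPR) is strictly below the vCPU's current priority ceiling (CPPR). kvmppc_vcpu_woken() then folds that test into the single wakeup predicate used both when deciding whether a vcore may block and when counting ceded vCPUs in kvmppc_run_vcpu(). A minimal illustration of the comparison, with a hypothetical helper name and not taken from the patch:

/*
 * Illustrative sketch only -- mirrors the PIPR/CPPR comparison made by
 * xive_interrupt_pending().  In XIVE, priority 0 is the most favoured
 * and 0xff the least, so a pending priority strictly lower than the
 * current CPPR would be presented to the vCPU.
 */
static bool xive_prio_deliverable(unsigned char pipr, unsigned char cppr)
{
	return pipr < cppr;
}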
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index ee4c2558c305..90644db9d38e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -307,7 +307,7 @@ void kvmhv_commence_exit(int trap)
307 return; 307 return;
308 308
309 for (i = 0; i < MAX_SUBCORES; ++i) { 309 for (i = 0; i < MAX_SUBCORES; ++i) {
310 vc = sip->master_vcs[i]; 310 vc = sip->vc[i];
311 if (!vc) 311 if (!vc)
312 break; 312 break;
313 do { 313 do {
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 404deb512844..dc54373c8780 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -61,13 +61,6 @@ BEGIN_FTR_SECTION
61 std r3, HSTATE_DABR(r13) 61 std r3, HSTATE_DABR(r13)
62END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 62END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
63 63
64 /* Hard-disable interrupts */
65 mfmsr r10
66 std r10, HSTATE_HOST_MSR(r13)
67 rldicl r10,r10,48,1
68 rotldi r10,r10,16
69 mtmsrd r10,1
70
71 /* Save host PMU registers */ 64 /* Save host PMU registers */
72BEGIN_FTR_SECTION 65BEGIN_FTR_SECTION
73 /* Work around P8 PMAE bug */ 66 /* Work around P8 PMAE bug */
@@ -153,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
153 * 146 *
154 * R1 = host R1 147 * R1 = host R1
155 * R2 = host R2 148 * R2 = host R2
149 * R3 = trap number on this thread
156 * R12 = exit handler id 150 * R12 = exit handler id
157 * R13 = PACA 151 * R13 = PACA
158 */ 152 */
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 7ef0993214f3..c356f9a40b24 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
130 130
131out: 131out:
132 /* 132 /*
133 * For guest that supports FWNMI capability, hook the MCE event into
134 * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI
135 * exit reason. On our way to exit we will pull this event from vcpu
136 * structure and print it from thread 0 of the core/subcore.
137 *
138 * For guest that does not support FWNMI capability (old QEMU):
133 * We are now going enter guest either through machine check 139 * We are now going enter guest either through machine check
134 * interrupt (for unhandled errors) or will continue from 140 * interrupt (for unhandled errors) or will continue from
135 * current HSRR0 (for handled errors) in guest. Hence 141 * current HSRR0 (for handled errors) in guest. Hence
136 * queue up the event so that we can log it from host console later. 142 * queue up the event so that we can log it from host console later.
137 */ 143 */
138 machine_check_queue_event(); 144 if (vcpu->kvm->arch.fwnmi_enabled) {
145 /*
146 * Hook up the mce event on to vcpu structure.
147 * First clear the old event.
148 */
149 memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
150 if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
151 vcpu->arch.mce_evt = mce_evt;
152 }
153 } else
154 machine_check_queue_event();
139 155
140 return handled; 156 return handled;
141} 157}
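With the FWNMI path above, a capable guest no longer has its machine checks silently queued on the host: the event is attached to the vcpu and the run loop exits with KVM_EXIT_NMI so userspace can report it to the guest (as an RTAS event, per the comments elsewhere in this series). A rough userspace sketch of opting in and reacting, assuming the usual KVM file descriptors are already set up and eliding all error handling (names such as enable_fwnmi are illustrative, not from the patch):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch only: per-vCPU opt-in, matching kvm_vcpu_ioctl_enable_cap(). */
static void enable_fwnmi(int vcpu_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_FWNMI };

	ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}

static void handle_exit(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_NMI:
		/* Machine check forwarded from the guest; a real VMM would
		 * turn the vcpu state into an RTAS error log and inject it. */
		break;
	/* ... other exit reasons ... */
	}
}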
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4888dd494604..6ea4b53f4b16 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -45,7 +45,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
45#define NAPPING_NOVCPU 2 45#define NAPPING_NOVCPU 2
46 46
47/* Stack frame offsets for kvmppc_hv_entry */ 47/* Stack frame offsets for kvmppc_hv_entry */
48#define SFS 144 48#define SFS 160
49#define STACK_SLOT_TRAP (SFS-4) 49#define STACK_SLOT_TRAP (SFS-4)
50#define STACK_SLOT_TID (SFS-16) 50#define STACK_SLOT_TID (SFS-16)
51#define STACK_SLOT_PSSCR (SFS-24) 51#define STACK_SLOT_PSSCR (SFS-24)
@@ -54,6 +54,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
54#define STACK_SLOT_CIABR (SFS-48) 54#define STACK_SLOT_CIABR (SFS-48)
55#define STACK_SLOT_DAWR (SFS-56) 55#define STACK_SLOT_DAWR (SFS-56)
56#define STACK_SLOT_DAWRX (SFS-64) 56#define STACK_SLOT_DAWRX (SFS-64)
57#define STACK_SLOT_HFSCR (SFS-72)
57 58
58/* 59/*
59 * Call kvmppc_hv_entry in real mode. 60 * Call kvmppc_hv_entry in real mode.
@@ -68,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
68 std r0, PPC_LR_STKOFF(r1) 69 std r0, PPC_LR_STKOFF(r1)
69 stdu r1, -112(r1) 70 stdu r1, -112(r1)
70 mfmsr r10 71 mfmsr r10
72 std r10, HSTATE_HOST_MSR(r13)
71 LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) 73 LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
72 li r0,MSR_RI 74 li r0,MSR_RI
73 andc r0,r10,r0 75 andc r0,r10,r0
@@ -152,20 +154,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
152 stb r0, HSTATE_HWTHREAD_REQ(r13) 154 stb r0, HSTATE_HWTHREAD_REQ(r13)
153 155
154 /* 156 /*
155 * For external and machine check interrupts, we need 157 * For external interrupts we need to call the Linux
156 * to call the Linux handler to process the interrupt. 158 * handler to process the interrupt. We do that by jumping
157 * We do that by jumping to absolute address 0x500 for 159 * to absolute address 0x500 for external interrupts.
158 * external interrupts, or the machine_check_fwnmi label 160 * The [h]rfid at the end of the handler will return to
159 * for machine checks (since firmware might have patched 161 * the book3s_hv_interrupts.S code. For other interrupts
160 * the vector area at 0x200). The [h]rfid at the end of the 162 * we do the rfid to get back to the book3s_hv_interrupts.S
161 * handler will return to the book3s_hv_interrupts.S code. 163 * code here.
162 * For other interrupts we do the rfid to get back
163 * to the book3s_hv_interrupts.S code here.
164 */ 164 */
165 ld r8, 112+PPC_LR_STKOFF(r1) 165 ld r8, 112+PPC_LR_STKOFF(r1)
166 addi r1, r1, 112 166 addi r1, r1, 112
167 ld r7, HSTATE_HOST_MSR(r13) 167 ld r7, HSTATE_HOST_MSR(r13)
168 168
169 /* Return the trap number on this thread as the return value */
170 mr r3, r12
171
169 /* 172 /*
170 * If we came back from the guest via a relocation-on interrupt, 173 * If we came back from the guest via a relocation-on interrupt,
171 * we will be in virtual mode at this point, which makes it a 174 * we will be in virtual mode at this point, which makes it a
@@ -175,59 +178,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
175 andi. r0, r0, MSR_IR /* in real mode? */ 178 andi. r0, r0, MSR_IR /* in real mode? */
176 bne .Lvirt_return 179 bne .Lvirt_return
177 180
178 cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 181 /* RFI into the highmem handler */
179 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
180 beq 11f
181 cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
182 beq 15f /* Invoke the H_DOORBELL handler */
183 cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
184 beq cr2, 14f /* HMI check */
185
186 /* RFI into the highmem handler, or branch to interrupt handler */
187 mfmsr r6 182 mfmsr r6
188 li r0, MSR_RI 183 li r0, MSR_RI
189 andc r6, r6, r0 184 andc r6, r6, r0
190 mtmsrd r6, 1 /* Clear RI in MSR */ 185 mtmsrd r6, 1 /* Clear RI in MSR */
191 mtsrr0 r8 186 mtsrr0 r8
192 mtsrr1 r7 187 mtsrr1 r7
193 beq cr1, 13f /* machine check */
194 RFI 188 RFI
195 189
196 /* On POWER7, we have external interrupts set to use HSRR0/1 */ 190 /* Virtual-mode return */
19711: mtspr SPRN_HSRR0, r8
198 mtspr SPRN_HSRR1, r7
199 ba 0x500
200
20113: b machine_check_fwnmi
202
20314: mtspr SPRN_HSRR0, r8
204 mtspr SPRN_HSRR1, r7
205 b hmi_exception_after_realmode
206
20715: mtspr SPRN_HSRR0, r8
208 mtspr SPRN_HSRR1, r7
209 ba 0xe80
210
211 /* Virtual-mode return - can't get here for HMI or machine check */
212.Lvirt_return: 191.Lvirt_return:
213 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 192 mtlr r8
214 beq 16f
215 cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
216 beq 17f
217 andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
218 beq 18f
219 mtmsrd r7, 1 /* if so then re-enable them */
22018: mtlr r8
221 blr 193 blr
222 194
22316: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
224 mtspr SPRN_HSRR1, r7
225 b exc_virt_0x4500_hardware_interrupt
226
22717: mtspr SPRN_HSRR0, r8
228 mtspr SPRN_HSRR1, r7
229 b exc_virt_0x4e80_h_doorbell
230
231kvmppc_primary_no_guest: 195kvmppc_primary_no_guest:
232 /* We handle this much like a ceded vcpu */ 196 /* We handle this much like a ceded vcpu */
233 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 197 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -769,6 +733,8 @@ BEGIN_FTR_SECTION
769 std r6, STACK_SLOT_PSSCR(r1) 733 std r6, STACK_SLOT_PSSCR(r1)
770 std r7, STACK_SLOT_PID(r1) 734 std r7, STACK_SLOT_PID(r1)
771 std r8, STACK_SLOT_IAMR(r1) 735 std r8, STACK_SLOT_IAMR(r1)
736 mfspr r5, SPRN_HFSCR
737 std r5, STACK_SLOT_HFSCR(r1)
772END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 738END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
773BEGIN_FTR_SECTION 739BEGIN_FTR_SECTION
774 mfspr r5, SPRN_CIABR 740 mfspr r5, SPRN_CIABR
@@ -920,8 +886,10 @@ FTR_SECTION_ELSE
920 ld r5, VCPU_TID(r4) 886 ld r5, VCPU_TID(r4)
921 ld r6, VCPU_PSSCR(r4) 887 ld r6, VCPU_PSSCR(r4)
922 oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ 888 oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */
889 ld r7, VCPU_HFSCR(r4)
923 mtspr SPRN_TIDR, r5 890 mtspr SPRN_TIDR, r5
924 mtspr SPRN_PSSCR, r6 891 mtspr SPRN_PSSCR, r6
892 mtspr SPRN_HFSCR, r7
925ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 893ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
9268: 8948:
927 895
@@ -936,7 +904,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
936 mftb r7 904 mftb r7
937 subf r3,r7,r8 905 subf r3,r7,r8
938 mtspr SPRN_DEC,r3 906 mtspr SPRN_DEC,r3
939 stw r3,VCPU_DEC(r4) 907 std r3,VCPU_DEC(r4)
940 908
941 ld r5, VCPU_SPRG0(r4) 909 ld r5, VCPU_SPRG0(r4)
942 ld r6, VCPU_SPRG1(r4) 910 ld r6, VCPU_SPRG1(r4)
@@ -1048,7 +1016,13 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
1048 li r0, BOOK3S_INTERRUPT_EXTERNAL 1016 li r0, BOOK3S_INTERRUPT_EXTERNAL
1049 bne cr1, 12f 1017 bne cr1, 12f
1050 mfspr r0, SPRN_DEC 1018 mfspr r0, SPRN_DEC
1051 cmpwi r0, 0 1019BEGIN_FTR_SECTION
1020 /* On POWER9 check whether the guest has large decrementer enabled */
1021 andis. r8, r8, LPCR_LD@h
1022 bne 15f
1023END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1024 extsw r0, r0
102515: cmpdi r0, 0
1052 li r0, BOOK3S_INTERRUPT_DECREMENTER 1026 li r0, BOOK3S_INTERRUPT_DECREMENTER
1053 bge 5f 1027 bge 5f
1054 1028
@@ -1058,6 +1032,23 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
1058 mr r9, r4 1032 mr r9, r4
1059 bl kvmppc_msr_interrupt 1033 bl kvmppc_msr_interrupt
10605: 10345:
1035BEGIN_FTR_SECTION
1036 b fast_guest_return
1037END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
1038 /* On POWER9, check for pending doorbell requests */
1039 lbz r0, VCPU_DBELL_REQ(r4)
1040 cmpwi r0, 0
1041 beq fast_guest_return
1042 ld r5, HSTATE_KVM_VCORE(r13)
1043 /* Set DPDES register so the CPU will take a doorbell interrupt */
1044 li r0, 1
1045 mtspr SPRN_DPDES, r0
1046 std r0, VCORE_DPDES(r5)
1047 /* Make sure other cpus see vcore->dpdes set before dbell req clear */
1048 lwsync
1049 /* Clear the pending doorbell request */
1050 li r0, 0
1051 stb r0, VCPU_DBELL_REQ(r4)
1061 1052
1062/* 1053/*
1063 * Required state: 1054 * Required state:
@@ -1232,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1232 1223
1233 stw r12,VCPU_TRAP(r9) 1224 stw r12,VCPU_TRAP(r9)
1234 1225
1226 /*
1227 * Now that we have saved away SRR0/1 and HSRR0/1,
1228 * interrupts are recoverable in principle, so set MSR_RI.
1229 * This becomes important for relocation-on interrupts from
1230 * the guest, which we can get in radix mode on POWER9.
1231 */
1232 li r0, MSR_RI
1233 mtmsrd r0, 1
1234
1235#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1235#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
1236 addi r3, r9, VCPU_TB_RMINTR 1236 addi r3, r9, VCPU_TB_RMINTR
1237 mr r4, r9 1237 mr r4, r9
@@ -1288,6 +1288,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
1288 beq 4f 1288 beq 4f
1289 b guest_exit_cont 1289 b guest_exit_cont
12903: 12903:
1291 /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
1292 cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL
1293 bne 14f
1294 mfspr r3, SPRN_HFSCR
1295 std r3, VCPU_HFSCR(r9)
1296 b guest_exit_cont
129714:
1291 /* External interrupt ? */ 1298 /* External interrupt ? */
1292 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1299 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
1293 bne+ guest_exit_cont 1300 bne+ guest_exit_cont
@@ -1475,12 +1482,18 @@ mc_cont:
1475 mtspr SPRN_SPURR,r4 1482 mtspr SPRN_SPURR,r4
1476 1483
1477 /* Save DEC */ 1484 /* Save DEC */
1485 ld r3, HSTATE_KVM_VCORE(r13)
1478 mfspr r5,SPRN_DEC 1486 mfspr r5,SPRN_DEC
1479 mftb r6 1487 mftb r6
1488 /* On P9, if the guest has large decr enabled, don't sign extend */
1489BEGIN_FTR_SECTION
1490 ld r4, VCORE_LPCR(r3)
1491 andis. r4, r4, LPCR_LD@h
1492 bne 16f
1493END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
1480 extsw r5,r5 1494 extsw r5,r5
1481 add r5,r5,r6 149516: add r5,r5,r6
1482 /* r5 is a guest timebase value here, convert to host TB */ 1496 /* r5 is a guest timebase value here, convert to host TB */
1483 ld r3,HSTATE_KVM_VCORE(r13)
1484 ld r4,VCORE_TB_OFFSET(r3) 1497 ld r4,VCORE_TB_OFFSET(r3)
1485 subf r5,r4,r5 1498 subf r5,r4,r5
1486 std r5,VCPU_DEC_EXPIRES(r9) 1499 std r5,VCPU_DEC_EXPIRES(r9)
@@ -1525,6 +1538,9 @@ FTR_SECTION_ELSE
1525 rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ 1538 rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */
1526 rotldi r6, r6, 60 1539 rotldi r6, r6, 60
1527 std r6, VCPU_PSSCR(r9) 1540 std r6, VCPU_PSSCR(r9)
1541 /* Restore host HFSCR value */
1542 ld r7, STACK_SLOT_HFSCR(r1)
1543 mtspr SPRN_HFSCR, r7
1528ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 1544ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
1529 /* 1545 /*
1530 * Restore various registers to 0, where non-zero values 1546 * Restore various registers to 0, where non-zero values
@@ -2402,8 +2418,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
2402 mfspr r3, SPRN_DEC 2418 mfspr r3, SPRN_DEC
2403 mfspr r4, SPRN_HDEC 2419 mfspr r4, SPRN_HDEC
2404 mftb r5 2420 mftb r5
2421BEGIN_FTR_SECTION
2422 /* On P9 check whether the guest has large decrementer mode enabled */
2423 ld r6, HSTATE_KVM_VCORE(r13)
2424 ld r6, VCORE_LPCR(r6)
2425 andis. r6, r6, LPCR_LD@h
2426 bne 68f
2427END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
2405 extsw r3, r3 2428 extsw r3, r3
2406 EXTEND_HDEC(r4) 242968: EXTEND_HDEC(r4)
2407 cmpd r3, r4 2430 cmpd r3, r4
2408 ble 67f 2431 ble 67f
2409 mtspr SPRN_DEC, r4 2432 mtspr SPRN_DEC, r4
@@ -2589,22 +2612,32 @@ machine_check_realmode:
2589 ld r9, HSTATE_KVM_VCPU(r13) 2612 ld r9, HSTATE_KVM_VCPU(r13)
2590 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 2613 li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
2591 /* 2614 /*
2592 * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through 2615 * For the guest that is FWNMI capable, deliver all the MCE errors
2593 * machine check interrupt (set HSRR0 to 0x200). And for handled 2616 * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit
2594 * errors (no-fatal), just go back to guest execution with current 2617 * reason. This new approach injects machine check errors in guest
2595 * HSRR0 instead of exiting guest. This new approach will inject 2618 * address space to guest with additional information in the form
2596 * machine check to guest for fatal error causing guest to crash. 2619 * of RTAS event, thus enabling guest kernel to suitably handle
2597 * 2620 * such errors.
2598 * The old code used to return to host for unhandled errors which
2599 * was causing guest to hang with soft lockups inside guest and
2600 * makes it difficult to recover guest instance.
2601 * 2621 *
2622 * For the guest that is not FWNMI capable (old QEMU) fallback
2623 * to old behaviour for backward compatibility:
2624 * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either
2625 * through machine check interrupt (set HSRR0 to 0x200).
2626 * For handled errors (no-fatal), just go back to guest execution
2627 * with current HSRR0.
2602 * if we receive machine check with MSR(RI=0) then deliver it to 2628 * if we receive machine check with MSR(RI=0) then deliver it to
2603 * guest as machine check causing guest to crash. 2629 * guest as machine check causing guest to crash.
2604 */ 2630 */
2605 ld r11, VCPU_MSR(r9) 2631 ld r11, VCPU_MSR(r9)
2606 rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ 2632 rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
2607 bne mc_cont /* if so, exit to host */ 2633 bne mc_cont /* if so, exit to host */
2634 /* Check if guest is capable of handling NMI exit */
2635 ld r10, VCPU_KVM(r9)
2636 lbz r10, KVM_FWNMI(r10)
2637 cmpdi r10, 1 /* FWNMI capable? */
2638 beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
2639
2640 /* if not, fall through for backward compatibility. */
2608 andi. r10, r11, MSR_RI /* check for unrecoverable exception */ 2641 andi. r10, r11, MSR_RI /* check for unrecoverable exception */
2609 beq 1f /* Deliver a machine check to guest */ 2642 beq 1f /* Deliver a machine check to guest */
2610 ld r10, VCPU_PC(r9) 2643 ld r10, VCPU_PC(r9)
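Several hunks above guard the old "extsw" with a test of LPCR[LD]: with the POWER9 large decrementer enabled, DEC is already a full-width signed quantity, while on earlier processors (or with LD clear) the 32-bit DEC value must be sign-extended before it can be compared with the HDEC or added to the timebase. The same rule in C, as a hedged sketch with a hypothetical helper:

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: widen a DEC sample before 64-bit arithmetic, matching
 * the LPCR_LD checks added around the extsw instructions. */
static inline int64_t dec_to_signed(uint64_t dec, bool large_dec_enabled)
{
	if (large_dec_enabled)
		return (int64_t)dec;		/* already full width */
	return (int64_t)(int32_t)dec;		/* classic 32-bit DEC */
}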
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index ffe1da95033a..08b200a0bbce 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1257,8 +1257,8 @@ static void xive_pre_save_scan(struct kvmppc_xive *xive)
1257 if (!xc) 1257 if (!xc)
1258 continue; 1258 continue;
1259 for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { 1259 for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
1260 if (xc->queues[i].qpage) 1260 if (xc->queues[j].qpage)
1261 xive_pre_save_queue(xive, &xc->queues[i]); 1261 xive_pre_save_queue(xive, &xc->queues[j]);
1262 } 1262 }
1263 } 1263 }
1264 1264
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 3eaac3809977..071b87ee682f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -687,7 +687,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
687 687
688 kvmppc_core_check_exceptions(vcpu); 688 kvmppc_core_check_exceptions(vcpu);
689 689
690 if (vcpu->requests) { 690 if (kvm_request_pending(vcpu)) {
691 /* Exception delivery raised request; start over */ 691 /* Exception delivery raised request; start over */
692 return 1; 692 return 1;
693 } 693 }
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index c873ffe55362..4d8b4d6cebff 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -39,7 +39,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
39 unsigned long dec_nsec; 39 unsigned long dec_nsec;
40 unsigned long long dec_time; 40 unsigned long long dec_time;
41 41
42 pr_debug("mtDEC: %x\n", vcpu->arch.dec); 42 pr_debug("mtDEC: %lx\n", vcpu->arch.dec);
43 hrtimer_try_to_cancel(&vcpu->arch.dec_timer); 43 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
44 44
45#ifdef CONFIG_PPC_BOOK3S 45#ifdef CONFIG_PPC_BOOK3S
@@ -109,7 +109,7 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
109 case SPRN_TBWU: break; 109 case SPRN_TBWU: break;
110 110
111 case SPRN_DEC: 111 case SPRN_DEC:
112 vcpu->arch.dec = spr_val; 112 vcpu->arch.dec = (u32) spr_val;
113 kvmppc_emulate_dec(vcpu); 113 kvmppc_emulate_dec(vcpu);
114 break; 114 break;
115 115
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7f71ab5fcad1..1a75c0b5f4ca 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -55,8 +55,7 @@ EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
55 55
56int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 56int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
57{ 57{
58 return !!(v->arch.pending_exceptions) || 58 return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
59 v->requests;
60} 59}
61 60
62int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 61int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
@@ -108,7 +107,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
108 */ 107 */
109 smp_mb(); 108 smp_mb();
110 109
111 if (vcpu->requests) { 110 if (kvm_request_pending(vcpu)) {
112 /* Make sure we process requests preemptable */ 111 /* Make sure we process requests preemptable */
113 local_irq_enable(); 112 local_irq_enable();
114 trace_kvm_check_requests(vcpu); 113 trace_kvm_check_requests(vcpu);
@@ -554,13 +553,28 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
554#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 553#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
555 case KVM_CAP_PPC_SMT: 554 case KVM_CAP_PPC_SMT:
556 r = 0; 555 r = 0;
557 if (hv_enabled) { 556 if (kvm) {
557 if (kvm->arch.emul_smt_mode > 1)
558 r = kvm->arch.emul_smt_mode;
559 else
560 r = kvm->arch.smt_mode;
561 } else if (hv_enabled) {
558 if (cpu_has_feature(CPU_FTR_ARCH_300)) 562 if (cpu_has_feature(CPU_FTR_ARCH_300))
559 r = 1; 563 r = 1;
560 else 564 else
561 r = threads_per_subcore; 565 r = threads_per_subcore;
562 } 566 }
563 break; 567 break;
568 case KVM_CAP_PPC_SMT_POSSIBLE:
569 r = 1;
570 if (hv_enabled) {
571 if (!cpu_has_feature(CPU_FTR_ARCH_300))
572 r = ((threads_per_subcore << 1) - 1);
573 else
574 /* P9 can emulate dbells, so allow any mode */
575 r = 8 | 4 | 2 | 1;
576 }
577 break;
564 case KVM_CAP_PPC_RMA: 578 case KVM_CAP_PPC_RMA:
565 r = 0; 579 r = 0;
566 break; 580 break;
@@ -619,6 +633,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
619 r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); 633 r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
620 break; 634 break;
621#endif 635#endif
636#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
637 case KVM_CAP_PPC_FWNMI:
638 r = hv_enabled;
639 break;
640#endif
622 case KVM_CAP_PPC_HTM: 641 case KVM_CAP_PPC_HTM:
623 r = cpu_has_feature(CPU_FTR_TM_COMP) && 642 r = cpu_has_feature(CPU_FTR_TM_COMP) &&
624 is_kvmppc_hv_enabled(kvm); 643 is_kvmppc_hv_enabled(kvm);
@@ -1538,6 +1557,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
1538 break; 1557 break;
1539 } 1558 }
1540#endif /* CONFIG_KVM_XICS */ 1559#endif /* CONFIG_KVM_XICS */
1560#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
1561 case KVM_CAP_PPC_FWNMI:
1562 r = -EINVAL;
1563 if (!is_kvmppc_hv_enabled(vcpu->kvm))
1564 break;
1565 r = 0;
1566 vcpu->kvm->arch.fwnmi_enabled = true;
1567 break;
1568#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
1541 default: 1569 default:
1542 r = -EINVAL; 1570 r = -EINVAL;
1543 break; 1571 break;
@@ -1712,6 +1740,15 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
1712 r = 0; 1740 r = 0;
1713 break; 1741 break;
1714 } 1742 }
1743 case KVM_CAP_PPC_SMT: {
1744 unsigned long mode = cap->args[0];
1745 unsigned long flags = cap->args[1];
1746
1747 r = -EINVAL;
1748 if (kvm->arch.kvm_ops->set_smt_mode)
1749 r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
1750 break;
1751 }
1715#endif 1752#endif
1716 default: 1753 default:
1717 r = -EINVAL; 1754 r = -EINVAL;
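With the KVM_CAP_PPC_SMT handling above, the SMT mode becomes a per-VM property that userspace negotiates: KVM_CAP_PPC_SMT_POSSIBLE reports which modes the host can provide and KVM_ENABLE_CAP on the VM requests one of them. A hedged sketch of that sequence (vm_fd is assumed to be an open VM descriptor, error handling is elided, and the flags word in args[1] is left at zero since its semantics are not shown in this hunk):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch only, not from this patch. */
static int configure_smt(int vm_fd, unsigned long mode)
{
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_PPC_SMT,
		.args = { mode, 0 },		/* args[0] = mode, args[1] = flags */
	};
	int possible;

	/* Bitmask of usable modes, e.g. 8|4|2|1 on POWER9 per the hunk above. */
	possible = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT_POSSIBLE);
	if (possible <= 0 || !(possible & mode))
		return -1;

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}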
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index d0441ad2a990..e508dff92535 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -59,7 +59,9 @@ union ctlreg0 {
59 unsigned long lap : 1; /* Low-address-protection control */ 59 unsigned long lap : 1; /* Low-address-protection control */
60 unsigned long : 4; 60 unsigned long : 4;
61 unsigned long edat : 1; /* Enhanced-DAT-enablement control */ 61 unsigned long edat : 1; /* Enhanced-DAT-enablement control */
62 unsigned long : 4; 62 unsigned long : 2;
63 unsigned long iep : 1; /* Instruction-Execution-Protection */
64 unsigned long : 1;
63 unsigned long afp : 1; /* AFP-register control */ 65 unsigned long afp : 1; /* AFP-register control */
64 unsigned long vx : 1; /* Vector enablement control */ 66 unsigned long vx : 1; /* Vector enablement control */
65 unsigned long : 7; 67 unsigned long : 7;
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 6baae236f461..a409d5991934 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -42,9 +42,11 @@
42#define KVM_HALT_POLL_NS_DEFAULT 80000 42#define KVM_HALT_POLL_NS_DEFAULT 80000
43 43
44/* s390-specific vcpu->requests bit members */ 44/* s390-specific vcpu->requests bit members */
45#define KVM_REQ_ENABLE_IBS 8 45#define KVM_REQ_ENABLE_IBS KVM_ARCH_REQ(0)
46#define KVM_REQ_DISABLE_IBS 9 46#define KVM_REQ_DISABLE_IBS KVM_ARCH_REQ(1)
47#define KVM_REQ_ICPT_OPEREXC 10 47#define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2)
48#define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3)
49#define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4)
48 50
49#define SIGP_CTRL_C 0x80 51#define SIGP_CTRL_C 0x80
50#define SIGP_CTRL_SCN_MASK 0x3f 52#define SIGP_CTRL_SCN_MASK 0x3f
@@ -56,7 +58,7 @@ union bsca_sigp_ctrl {
56 __u8 r : 1; 58 __u8 r : 1;
57 __u8 scn : 6; 59 __u8 scn : 6;
58 }; 60 };
59} __packed; 61};
60 62
61union esca_sigp_ctrl { 63union esca_sigp_ctrl {
62 __u16 value; 64 __u16 value;
@@ -65,14 +67,14 @@ union esca_sigp_ctrl {
65 __u8 reserved: 7; 67 __u8 reserved: 7;
66 __u8 scn; 68 __u8 scn;
67 }; 69 };
68} __packed; 70};
69 71
70struct esca_entry { 72struct esca_entry {
71 union esca_sigp_ctrl sigp_ctrl; 73 union esca_sigp_ctrl sigp_ctrl;
72 __u16 reserved1[3]; 74 __u16 reserved1[3];
73 __u64 sda; 75 __u64 sda;
74 __u64 reserved2[6]; 76 __u64 reserved2[6];
75} __packed; 77};
76 78
77struct bsca_entry { 79struct bsca_entry {
78 __u8 reserved0; 80 __u8 reserved0;
@@ -80,7 +82,7 @@ struct bsca_entry {
80 __u16 reserved[3]; 82 __u16 reserved[3];
81 __u64 sda; 83 __u64 sda;
82 __u64 reserved2[2]; 84 __u64 reserved2[2];
83} __attribute__((packed)); 85};
84 86
85union ipte_control { 87union ipte_control {
86 unsigned long val; 88 unsigned long val;
@@ -97,7 +99,7 @@ struct bsca_block {
97 __u64 mcn; 99 __u64 mcn;
98 __u64 reserved2; 100 __u64 reserved2;
99 struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; 101 struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
100} __attribute__((packed)); 102};
101 103
102struct esca_block { 104struct esca_block {
103 union ipte_control ipte_control; 105 union ipte_control ipte_control;
@@ -105,7 +107,7 @@ struct esca_block {
105 __u64 mcn[4]; 107 __u64 mcn[4];
106 __u64 reserved2[20]; 108 __u64 reserved2[20];
107 struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; 109 struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
108} __packed; 110};
109 111
110/* 112/*
111 * This struct is used to store some machine check info from lowcore 113 * This struct is used to store some machine check info from lowcore
@@ -274,7 +276,7 @@ struct kvm_s390_sie_block {
274 276
275struct kvm_s390_itdb { 277struct kvm_s390_itdb {
276 __u8 data[256]; 278 __u8 data[256];
277} __packed; 279};
278 280
279struct sie_page { 281struct sie_page {
280 struct kvm_s390_sie_block sie_block; 282 struct kvm_s390_sie_block sie_block;
@@ -282,7 +284,7 @@ struct sie_page {
282 __u8 reserved218[1000]; /* 0x0218 */ 284 __u8 reserved218[1000]; /* 0x0218 */
283 struct kvm_s390_itdb itdb; /* 0x0600 */ 285 struct kvm_s390_itdb itdb; /* 0x0600 */
284 __u8 reserved700[2304]; /* 0x0700 */ 286 __u8 reserved700[2304]; /* 0x0700 */
285} __packed; 287};
286 288
287struct kvm_vcpu_stat { 289struct kvm_vcpu_stat {
288 u64 exit_userspace; 290 u64 exit_userspace;
@@ -695,7 +697,7 @@ struct sie_page2 {
695 __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ 697 __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */
696 struct kvm_s390_crypto_cb crycb; /* 0x0800 */ 698 struct kvm_s390_crypto_cb crycb; /* 0x0800 */
697 u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ 699 u8 reserved900[0x1000 - 0x900]; /* 0x0900 */
698} __packed; 700};
699 701
700struct kvm_s390_vsie { 702struct kvm_s390_vsie {
701 struct mutex mutex; 703 struct mutex mutex;
@@ -705,6 +707,12 @@ struct kvm_s390_vsie {
705 struct page *pages[KVM_MAX_VCPUS]; 707 struct page *pages[KVM_MAX_VCPUS];
706}; 708};
707 709
710struct kvm_s390_migration_state {
711 unsigned long bitmap_size; /* in bits (number of guest pages) */
712 atomic64_t dirty_pages; /* number of dirty pages */
713 unsigned long *pgste_bitmap;
714};
715
708struct kvm_arch{ 716struct kvm_arch{
709 void *sca; 717 void *sca;
710 int use_esca; 718 int use_esca;
@@ -732,6 +740,7 @@ struct kvm_arch{
732 struct kvm_s390_crypto crypto; 740 struct kvm_s390_crypto crypto;
733 struct kvm_s390_vsie vsie; 741 struct kvm_s390_vsie vsie;
734 u64 epoch; 742 u64 epoch;
743 struct kvm_s390_migration_state *migration_state;
735 /* subset of available cpu features enabled by user space */ 744 /* subset of available cpu features enabled by user space */
736 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); 745 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
737}; 746};
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 13623b9991d4..9d91cf3e427f 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -26,6 +26,12 @@
26#define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20) 26#define MCCK_CODE_PSW_MWP_VALID _BITUL(63 - 20)
27#define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23) 27#define MCCK_CODE_PSW_IA_VALID _BITUL(63 - 23)
28 28
29#define MCCK_CR14_CR_PENDING_SUB_MASK (1 << 28)
30#define MCCK_CR14_RECOVERY_SUB_MASK (1 << 27)
31#define MCCK_CR14_DEGRAD_SUB_MASK (1 << 26)
32#define MCCK_CR14_EXT_DAMAGE_SUB_MASK (1 << 25)
33#define MCCK_CR14_WARN_SUB_MASK (1 << 24)
34
29#ifndef __ASSEMBLY__ 35#ifndef __ASSEMBLY__
30 36
31union mci { 37union mci {
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3dd2a1d308dd..69d09c39bbcd 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -28,6 +28,7 @@
28#define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 28#define KVM_DEV_FLIC_CLEAR_IO_IRQ 8
29#define KVM_DEV_FLIC_AISM 9 29#define KVM_DEV_FLIC_AISM 9
30#define KVM_DEV_FLIC_AIRQ_INJECT 10 30#define KVM_DEV_FLIC_AIRQ_INJECT 10
31#define KVM_DEV_FLIC_AISM_ALL 11
31/* 32/*
32 * We can have up to 4*64k pending subchannels + 8 adapter interrupts, 33 * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
33 * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. 34 * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
@@ -53,6 +54,11 @@ struct kvm_s390_ais_req {
53 __u16 mode; 54 __u16 mode;
54}; 55};
55 56
57struct kvm_s390_ais_all {
58 __u8 simm;
59 __u8 nimm;
60};
61
56#define KVM_S390_IO_ADAPTER_MASK 1 62#define KVM_S390_IO_ADAPTER_MASK 1
57#define KVM_S390_IO_ADAPTER_MAP 2 63#define KVM_S390_IO_ADAPTER_MAP 2
58#define KVM_S390_IO_ADAPTER_UNMAP 3 64#define KVM_S390_IO_ADAPTER_UNMAP 3
@@ -70,6 +76,7 @@ struct kvm_s390_io_adapter_req {
70#define KVM_S390_VM_TOD 1 76#define KVM_S390_VM_TOD 1
71#define KVM_S390_VM_CRYPTO 2 77#define KVM_S390_VM_CRYPTO 2
72#define KVM_S390_VM_CPU_MODEL 3 78#define KVM_S390_VM_CPU_MODEL 3
79#define KVM_S390_VM_MIGRATION 4
73 80
74/* kvm attributes for mem_ctrl */ 81/* kvm attributes for mem_ctrl */
75#define KVM_S390_VM_MEM_ENABLE_CMMA 0 82#define KVM_S390_VM_MEM_ENABLE_CMMA 0
@@ -151,6 +158,11 @@ struct kvm_s390_vm_cpu_subfunc {
151#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 158#define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2
152#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 159#define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3
153 160
161/* kvm attributes for migration mode */
162#define KVM_S390_VM_MIGRATION_STOP 0
163#define KVM_S390_VM_MIGRATION_START 1
164#define KVM_S390_VM_MIGRATION_STATUS 2
165
154/* for KVM_GET_REGS and KVM_SET_REGS */ 166/* for KVM_GET_REGS and KVM_SET_REGS */
155struct kvm_regs { 167struct kvm_regs {
156 /* general purpose regs for s390 */ 168 /* general purpose regs for s390 */
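The KVM_S390_VM_MIGRATION group defined above gives userspace an on/off switch for migration mode plus a status query. Since KVM_S390_VM_* groups are driven through the generic VM device-attribute ioctls, a rough sketch (illustrative names, no error handling) could look like:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Sketch only: toggle and query migration mode on the VM fd. */
static void start_migration_mode(int vm_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_MIGRATION,
		.attr  = KVM_S390_VM_MIGRATION_START,
	};

	ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

static uint64_t migration_mode_active(int vm_fd)
{
	uint64_t mig = 0;
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_MIGRATION,
		.attr  = KVM_S390_VM_MIGRATION_STATUS,
		.addr  = (uint64_t)(uintptr_t)&mig,
	};

	ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
	return mig;	/* non-zero while migration mode is enabled */
}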
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 875f8bea8c67..653cae5e1ee1 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -89,7 +89,7 @@ struct region3_table_entry_fc1 {
89 unsigned long f : 1; /* Fetch-Protection Bit */ 89 unsigned long f : 1; /* Fetch-Protection Bit */
90 unsigned long fc : 1; /* Format-Control */ 90 unsigned long fc : 1; /* Format-Control */
91 unsigned long p : 1; /* DAT-Protection Bit */ 91 unsigned long p : 1; /* DAT-Protection Bit */
92 unsigned long co : 1; /* Change-Recording Override */ 92 unsigned long iep: 1; /* Instruction-Execution-Protection */
93 unsigned long : 2; 93 unsigned long : 2;
94 unsigned long i : 1; /* Region-Invalid Bit */ 94 unsigned long i : 1; /* Region-Invalid Bit */
95 unsigned long cr : 1; /* Common-Region Bit */ 95 unsigned long cr : 1; /* Common-Region Bit */
@@ -131,7 +131,7 @@ struct segment_entry_fc1 {
131 unsigned long f : 1; /* Fetch-Protection Bit */ 131 unsigned long f : 1; /* Fetch-Protection Bit */
132 unsigned long fc : 1; /* Format-Control */ 132 unsigned long fc : 1; /* Format-Control */
133 unsigned long p : 1; /* DAT-Protection Bit */ 133 unsigned long p : 1; /* DAT-Protection Bit */
134 unsigned long co : 1; /* Change-Recording Override */ 134 unsigned long iep: 1; /* Instruction-Execution-Protection */
135 unsigned long : 2; 135 unsigned long : 2;
136 unsigned long i : 1; /* Segment-Invalid Bit */ 136 unsigned long i : 1; /* Segment-Invalid Bit */
137 unsigned long cs : 1; /* Common-Segment Bit */ 137 unsigned long cs : 1; /* Common-Segment Bit */
@@ -168,7 +168,8 @@ union page_table_entry {
168 unsigned long z : 1; /* Zero Bit */ 168 unsigned long z : 1; /* Zero Bit */
169 unsigned long i : 1; /* Page-Invalid Bit */ 169 unsigned long i : 1; /* Page-Invalid Bit */
170 unsigned long p : 1; /* DAT-Protection Bit */ 170 unsigned long p : 1; /* DAT-Protection Bit */
171 unsigned long : 9; 171 unsigned long iep: 1; /* Instruction-Execution-Protection */
172 unsigned long : 8;
172 }; 173 };
173}; 174};
174 175
@@ -241,7 +242,7 @@ struct ale {
241 unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */ 242 unsigned long asteo : 25; /* ASN-Second-Table-Entry Origin */
242 unsigned long : 6; 243 unsigned long : 6;
243 unsigned long astesn : 32; /* ASTE Sequence Number */ 244 unsigned long astesn : 32; /* ASTE Sequence Number */
244} __packed; 245};
245 246
246struct aste { 247struct aste {
247 unsigned long i : 1; /* ASX-Invalid Bit */ 248 unsigned long i : 1; /* ASX-Invalid Bit */
@@ -257,7 +258,7 @@ struct aste {
257 unsigned long ald : 32; 258 unsigned long ald : 32;
258 unsigned long astesn : 32; 259 unsigned long astesn : 32;
259 /* .. more fields there */ 260 /* .. more fields there */
260} __packed; 261};
261 262
262int ipte_lock_held(struct kvm_vcpu *vcpu) 263int ipte_lock_held(struct kvm_vcpu *vcpu)
263{ 264{
@@ -485,6 +486,7 @@ enum prot_type {
485 PROT_TYPE_KEYC = 1, 486 PROT_TYPE_KEYC = 1,
486 PROT_TYPE_ALC = 2, 487 PROT_TYPE_ALC = 2,
487 PROT_TYPE_DAT = 3, 488 PROT_TYPE_DAT = 3,
489 PROT_TYPE_IEP = 4,
488}; 490};
489 491
490static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, 492static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
@@ -500,6 +502,9 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
500 switch (code) { 502 switch (code) {
501 case PGM_PROTECTION: 503 case PGM_PROTECTION:
502 switch (prot) { 504 switch (prot) {
505 case PROT_TYPE_IEP:
506 tec->b61 = 1;
507 /* FALL THROUGH */
503 case PROT_TYPE_LA: 508 case PROT_TYPE_LA:
504 tec->b56 = 1; 509 tec->b56 = 1;
505 break; 510 break;
@@ -591,6 +596,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
591 * @gpa: points to where guest physical (absolute) address should be stored 596 * @gpa: points to where guest physical (absolute) address should be stored
592 * @asce: effective asce 597 * @asce: effective asce
593 * @mode: indicates the access mode to be used 598 * @mode: indicates the access mode to be used
599 * @prot: returns the type for protection exceptions
594 * 600 *
595 * Translate a guest virtual address into a guest absolute address by means 601 * Translate a guest virtual address into a guest absolute address by means
596 * of dynamic address translation as specified by the architecture. 602 * of dynamic address translation as specified by the architecture.
@@ -606,19 +612,21 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
606 */ 612 */
607static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, 613static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
608 unsigned long *gpa, const union asce asce, 614 unsigned long *gpa, const union asce asce,
609 enum gacc_mode mode) 615 enum gacc_mode mode, enum prot_type *prot)
610{ 616{
611 union vaddress vaddr = {.addr = gva}; 617 union vaddress vaddr = {.addr = gva};
612 union raddress raddr = {.addr = gva}; 618 union raddress raddr = {.addr = gva};
613 union page_table_entry pte; 619 union page_table_entry pte;
614 int dat_protection = 0; 620 int dat_protection = 0;
621 int iep_protection = 0;
615 union ctlreg0 ctlreg0; 622 union ctlreg0 ctlreg0;
616 unsigned long ptr; 623 unsigned long ptr;
617 int edat1, edat2; 624 int edat1, edat2, iep;
618 625
619 ctlreg0.val = vcpu->arch.sie_block->gcr[0]; 626 ctlreg0.val = vcpu->arch.sie_block->gcr[0];
620 edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8); 627 edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
621 edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78); 628 edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
629 iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130);
622 if (asce.r) 630 if (asce.r)
623 goto real_address; 631 goto real_address;
624 ptr = asce.origin * 4096; 632 ptr = asce.origin * 4096;
@@ -702,6 +710,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
702 return PGM_TRANSLATION_SPEC; 710 return PGM_TRANSLATION_SPEC;
703 if (rtte.fc && edat2) { 711 if (rtte.fc && edat2) {
704 dat_protection |= rtte.fc1.p; 712 dat_protection |= rtte.fc1.p;
713 iep_protection = rtte.fc1.iep;
705 raddr.rfaa = rtte.fc1.rfaa; 714 raddr.rfaa = rtte.fc1.rfaa;
706 goto absolute_address; 715 goto absolute_address;
707 } 716 }
@@ -729,6 +738,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
729 return PGM_TRANSLATION_SPEC; 738 return PGM_TRANSLATION_SPEC;
730 if (ste.fc && edat1) { 739 if (ste.fc && edat1) {
731 dat_protection |= ste.fc1.p; 740 dat_protection |= ste.fc1.p;
741 iep_protection = ste.fc1.iep;
732 raddr.sfaa = ste.fc1.sfaa; 742 raddr.sfaa = ste.fc1.sfaa;
733 goto absolute_address; 743 goto absolute_address;
734 } 744 }
@@ -745,12 +755,19 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
745 if (pte.z) 755 if (pte.z)
746 return PGM_TRANSLATION_SPEC; 756 return PGM_TRANSLATION_SPEC;
747 dat_protection |= pte.p; 757 dat_protection |= pte.p;
758 iep_protection = pte.iep;
748 raddr.pfra = pte.pfra; 759 raddr.pfra = pte.pfra;
749real_address: 760real_address:
750 raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr); 761 raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
751absolute_address: 762absolute_address:
752 if (mode == GACC_STORE && dat_protection) 763 if (mode == GACC_STORE && dat_protection) {
764 *prot = PROT_TYPE_DAT;
753 return PGM_PROTECTION; 765 return PGM_PROTECTION;
766 }
767 if (mode == GACC_IFETCH && iep_protection && iep) {
768 *prot = PROT_TYPE_IEP;
769 return PGM_PROTECTION;
770 }
754 if (kvm_is_error_gpa(vcpu->kvm, raddr.addr)) 771 if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
755 return PGM_ADDRESSING; 772 return PGM_ADDRESSING;
756 *gpa = raddr.addr; 773 *gpa = raddr.addr;
@@ -782,6 +799,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
782{ 799{
783 psw_t *psw = &vcpu->arch.sie_block->gpsw; 800 psw_t *psw = &vcpu->arch.sie_block->gpsw;
784 int lap_enabled, rc = 0; 801 int lap_enabled, rc = 0;
802 enum prot_type prot;
785 803
786 lap_enabled = low_address_protection_enabled(vcpu, asce); 804 lap_enabled = low_address_protection_enabled(vcpu, asce);
787 while (nr_pages) { 805 while (nr_pages) {
@@ -791,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
791 PROT_TYPE_LA); 809 PROT_TYPE_LA);
792 ga &= PAGE_MASK; 810 ga &= PAGE_MASK;
793 if (psw_bits(*psw).dat) { 811 if (psw_bits(*psw).dat) {
794 rc = guest_translate(vcpu, ga, pages, asce, mode); 812 rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
795 if (rc < 0) 813 if (rc < 0)
796 return rc; 814 return rc;
797 } else { 815 } else {
@@ -800,7 +818,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
800 rc = PGM_ADDRESSING; 818 rc = PGM_ADDRESSING;
801 } 819 }
802 if (rc) 820 if (rc)
803 return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); 821 return trans_exc(vcpu, rc, ga, ar, mode, prot);
804 ga += PAGE_SIZE; 822 ga += PAGE_SIZE;
805 pages++; 823 pages++;
806 nr_pages--; 824 nr_pages--;
@@ -886,6 +904,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
886 unsigned long *gpa, enum gacc_mode mode) 904 unsigned long *gpa, enum gacc_mode mode)
887{ 905{
888 psw_t *psw = &vcpu->arch.sie_block->gpsw; 906 psw_t *psw = &vcpu->arch.sie_block->gpsw;
907 enum prot_type prot;
889 union asce asce; 908 union asce asce;
890 int rc; 909 int rc;
891 910
@@ -900,9 +919,9 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
900 } 919 }
901 920
902 if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ 921 if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */
903 rc = guest_translate(vcpu, gva, gpa, asce, mode); 922 rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
904 if (rc > 0) 923 if (rc > 0)
905 return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); 924 return trans_exc(vcpu, rc, gva, 0, mode, prot);
906 } else { 925 } else {
907 *gpa = kvm_s390_real_to_abs(vcpu, gva); 926 *gpa = kvm_s390_real_to_abs(vcpu, gva);
908 if (kvm_is_error_gpa(vcpu->kvm, *gpa)) 927 if (kvm_is_error_gpa(vcpu->kvm, *gpa))
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 2d120fef7d90..a619ddae610d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -251,8 +251,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
251 __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); 251 __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask);
252 if (psw_mchk_disabled(vcpu)) 252 if (psw_mchk_disabled(vcpu))
253 active_mask &= ~IRQ_PEND_MCHK_MASK; 253 active_mask &= ~IRQ_PEND_MCHK_MASK;
254 /*
255 * Check both floating and local interrupt's cr14 because
256 * bit IRQ_PEND_MCHK_REP could be set in both cases.
257 */
254 if (!(vcpu->arch.sie_block->gcr[14] & 258 if (!(vcpu->arch.sie_block->gcr[14] &
255 vcpu->kvm->arch.float_int.mchk.cr14)) 259 (vcpu->kvm->arch.float_int.mchk.cr14 |
260 vcpu->arch.local_int.irq.mchk.cr14)))
256 __clear_bit(IRQ_PEND_MCHK_REP, &active_mask); 261 __clear_bit(IRQ_PEND_MCHK_REP, &active_mask);
257 262
258 /* 263 /*
@@ -1876,6 +1881,28 @@ out:
1876 return ret < 0 ? ret : n; 1881 return ret < 0 ? ret : n;
1877} 1882}
1878 1883
1884static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr)
1885{
1886 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1887 struct kvm_s390_ais_all ais;
1888
1889 if (attr->attr < sizeof(ais))
1890 return -EINVAL;
1891
1892 if (!test_kvm_facility(kvm, 72))
1893 return -ENOTSUPP;
1894
1895 mutex_lock(&fi->ais_lock);
1896 ais.simm = fi->simm;
1897 ais.nimm = fi->nimm;
1898 mutex_unlock(&fi->ais_lock);
1899
1900 if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais)))
1901 return -EFAULT;
1902
1903 return 0;
1904}
1905
1879static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 1906static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1880{ 1907{
1881 int r; 1908 int r;
@@ -1885,6 +1912,9 @@ static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1885 r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr, 1912 r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr,
1886 attr->attr); 1913 attr->attr);
1887 break; 1914 break;
1915 case KVM_DEV_FLIC_AISM_ALL:
1916 r = flic_ais_mode_get_all(dev->kvm, attr);
1917 break;
1888 default: 1918 default:
1889 r = -EINVAL; 1919 r = -EINVAL;
1890 } 1920 }
@@ -2235,6 +2265,25 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
2235 return kvm_s390_inject_airq(kvm, adapter); 2265 return kvm_s390_inject_airq(kvm, adapter);
2236} 2266}
2237 2267
2268static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
2269{
2270 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
2271 struct kvm_s390_ais_all ais;
2272
2273 if (!test_kvm_facility(kvm, 72))
2274 return -ENOTSUPP;
2275
2276 if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais)))
2277 return -EFAULT;
2278
2279 mutex_lock(&fi->ais_lock);
2280 fi->simm = ais.simm;
2281 fi->nimm = ais.nimm;
2282 mutex_unlock(&fi->ais_lock);
2283
2284 return 0;
2285}
2286
2238static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) 2287static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2239{ 2288{
2240 int r = 0; 2289 int r = 0;
@@ -2277,6 +2326,9 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
2277 case KVM_DEV_FLIC_AIRQ_INJECT: 2326 case KVM_DEV_FLIC_AIRQ_INJECT:
2278 r = flic_inject_airq(dev->kvm, attr); 2327 r = flic_inject_airq(dev->kvm, attr);
2279 break; 2328 break;
2329 case KVM_DEV_FLIC_AISM_ALL:
2330 r = flic_ais_mode_set_all(dev->kvm, attr);
2331 break;
2280 default: 2332 default:
2281 r = -EINVAL; 2333 r = -EINVAL;
2282 } 2334 }
@@ -2298,6 +2350,7 @@ static int flic_has_attr(struct kvm_device *dev,
2298 case KVM_DEV_FLIC_CLEAR_IO_IRQ: 2350 case KVM_DEV_FLIC_CLEAR_IO_IRQ:
2299 case KVM_DEV_FLIC_AISM: 2351 case KVM_DEV_FLIC_AISM:
2300 case KVM_DEV_FLIC_AIRQ_INJECT: 2352 case KVM_DEV_FLIC_AIRQ_INJECT:
2353 case KVM_DEV_FLIC_AISM_ALL:
2301 return 0; 2354 return 0;
2302 } 2355 }
2303 return -ENXIO; 2356 return -ENXIO;
@@ -2415,6 +2468,42 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
2415 return ret; 2468 return ret;
2416} 2469}
2417 2470
2471/*
2472 * Inject the machine check to the guest.
2473 */
2474void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
2475 struct mcck_volatile_info *mcck_info)
2476{
2477 struct kvm_s390_interrupt_info inti;
2478 struct kvm_s390_irq irq;
2479 struct kvm_s390_mchk_info *mchk;
2480 union mci mci;
2481 __u64 cr14 = 0; /* upper bits are not used */
2482
2483 mci.val = mcck_info->mcic;
2484 if (mci.sr)
2485 cr14 |= MCCK_CR14_RECOVERY_SUB_MASK;
2486 if (mci.dg)
2487 cr14 |= MCCK_CR14_DEGRAD_SUB_MASK;
2488 if (mci.w)
2489 cr14 |= MCCK_CR14_WARN_SUB_MASK;
2490
2491 mchk = mci.ck ? &inti.mchk : &irq.u.mchk;
2492 mchk->cr14 = cr14;
2493 mchk->mcic = mcck_info->mcic;
2494 mchk->ext_damage_code = mcck_info->ext_damage_code;
2495 mchk->failing_storage_address = mcck_info->failing_storage_address;
2496 if (mci.ck) {
2497 /* Inject the floating machine check */
2498 inti.type = KVM_S390_MCHK;
2499 WARN_ON_ONCE(__inject_vm(vcpu->kvm, &inti));
2500 } else {
2501 /* Inject the machine check to specified vcpu */
2502 irq.type = KVM_S390_MCHK;
2503 WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &irq));
2504 }
2505}
2506
2418int kvm_set_routing_entry(struct kvm *kvm, 2507int kvm_set_routing_entry(struct kvm *kvm,
2419 struct kvm_kernel_irq_routing_entry *e, 2508 struct kvm_kernel_irq_routing_entry *e,
2420 const struct kvm_irq_routing_entry *ue) 2509 const struct kvm_irq_routing_entry *ue)
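KVM_DEV_FLIC_AISM_ALL above exposes the whole adapter-interruption-suppression state (the simm and nimm masks) as one blob, which is what a migration needs to save and restore. A hedged sketch against the FLIC device fd (flic_fd assumed to come from KVM_CREATE_DEVICE; error handling elided); note that on the get side attr.attr carries the buffer size, matching the check in flic_ais_mode_get_all():

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Sketch only: save/restore the AIS masks in one call each. */
static void save_ais_state(int flic_fd, struct kvm_s390_ais_all *ais)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_AISM_ALL,
		.attr  = sizeof(*ais),
		.addr  = (uint64_t)(uintptr_t)ais,
	};

	ioctl(flic_fd, KVM_GET_DEVICE_ATTR, &attr);
}

static void restore_ais_state(int flic_fd, struct kvm_s390_ais_all *ais)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_FLIC_AISM_ALL,
		.addr  = (uint64_t)(uintptr_t)ais,
	};

	ioctl(flic_fd, KVM_SET_DEVICE_ATTR, &attr);
}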
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b0d7de5a533d..3f2884e99ed4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/bitmap.h> 31#include <linux/bitmap.h>
32#include <linux/sched/signal.h> 32#include <linux/sched/signal.h>
33#include <linux/string.h>
33 34
34#include <asm/asm-offsets.h> 35#include <asm/asm-offsets.h>
35#include <asm/lowcore.h> 36#include <asm/lowcore.h>
@@ -386,6 +387,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
386 case KVM_CAP_S390_SKEYS: 387 case KVM_CAP_S390_SKEYS:
387 case KVM_CAP_S390_IRQ_STATE: 388 case KVM_CAP_S390_IRQ_STATE:
388 case KVM_CAP_S390_USER_INSTR0: 389 case KVM_CAP_S390_USER_INSTR0:
390 case KVM_CAP_S390_CMMA_MIGRATION:
389 case KVM_CAP_S390_AIS: 391 case KVM_CAP_S390_AIS:
390 r = 1; 392 r = 1;
391 break; 393 break;
@@ -749,6 +751,129 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr)
749 return 0; 751 return 0;
750} 752}
751 753
754static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
755{
756 int cx;
757 struct kvm_vcpu *vcpu;
758
759 kvm_for_each_vcpu(cx, vcpu, kvm)
760 kvm_s390_sync_request(req, vcpu);
761}
762
763/*
764 * Must be called with kvm->srcu held to avoid races on memslots, and with
765 * kvm->lock to avoid races with ourselves and kvm_s390_vm_stop_migration.
766 */
767static int kvm_s390_vm_start_migration(struct kvm *kvm)
768{
769 struct kvm_s390_migration_state *mgs;
770 struct kvm_memory_slot *ms;
771 /* should be the only one */
772 struct kvm_memslots *slots;
773 unsigned long ram_pages;
774 int slotnr;
775
776 /* migration mode already enabled */
777 if (kvm->arch.migration_state)
778 return 0;
779
780 slots = kvm_memslots(kvm);
781 if (!slots || !slots->used_slots)
782 return -EINVAL;
783
784 mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
785 if (!mgs)
786 return -ENOMEM;
787 kvm->arch.migration_state = mgs;
788
789 if (kvm->arch.use_cmma) {
790 /*
791 * Get the last slot. They should be sorted by base_gfn, so the
792 * last slot is also the one at the end of the address space.
793 * We have verified above that at least one slot is present.
794 */
795 ms = slots->memslots + slots->used_slots - 1;
796 /* round up so we only use full longs */
797 ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
798 /* allocate enough bytes to store all the bits */
799 mgs->pgste_bitmap = vmalloc(ram_pages / 8);
800 if (!mgs->pgste_bitmap) {
801 kfree(mgs);
802 kvm->arch.migration_state = NULL;
803 return -ENOMEM;
804 }
805
806 mgs->bitmap_size = ram_pages;
807 atomic64_set(&mgs->dirty_pages, ram_pages);
808 /* mark all the pages in active slots as dirty */
809 for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
810 ms = slots->memslots + slotnr;
811 bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
812 }
813
814 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
815 }
816 return 0;
817}
818
819/*
820 * Must be called with kvm->lock to avoid races with ourselves and
821 * kvm_s390_vm_start_migration.
822 */
823static int kvm_s390_vm_stop_migration(struct kvm *kvm)
824{
825 struct kvm_s390_migration_state *mgs;
826
827 /* migration mode already disabled */
828 if (!kvm->arch.migration_state)
829 return 0;
830 mgs = kvm->arch.migration_state;
831 kvm->arch.migration_state = NULL;
832
833 if (kvm->arch.use_cmma) {
834 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
835 vfree(mgs->pgste_bitmap);
836 }
837 kfree(mgs);
838 return 0;
839}
840
841static int kvm_s390_vm_set_migration(struct kvm *kvm,
842 struct kvm_device_attr *attr)
843{
844 int idx, res = -ENXIO;
845
846 mutex_lock(&kvm->lock);
847 switch (attr->attr) {
848 case KVM_S390_VM_MIGRATION_START:
849 idx = srcu_read_lock(&kvm->srcu);
850 res = kvm_s390_vm_start_migration(kvm);
851 srcu_read_unlock(&kvm->srcu, idx);
852 break;
853 case KVM_S390_VM_MIGRATION_STOP:
854 res = kvm_s390_vm_stop_migration(kvm);
855 break;
856 default:
857 break;
858 }
859 mutex_unlock(&kvm->lock);
860
861 return res;
862}
863
864static int kvm_s390_vm_get_migration(struct kvm *kvm,
865 struct kvm_device_attr *attr)
866{
867 u64 mig = (kvm->arch.migration_state != NULL);
868
869 if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
870 return -ENXIO;
871
872 if (copy_to_user((void __user *)attr->addr, &mig, sizeof(mig)))
873 return -EFAULT;
874 return 0;
875}
876
752static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) 877static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
753{ 878{
754 u8 gtod_high; 879 u8 gtod_high;
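For a sense of scale, the dirty bitmap allocated in kvm_s390_vm_start_migration() above costs one bit per 4 KiB guest page, rounded up to a whole number of longs. A quick worked example under those assumptions: a guest whose last memslot ends at 16 GiB has base_gfn + npages = 16 GiB / 4 KiB = 4194304 pages, already a multiple of BITS_PER_LONG, so ram_pages = 4194304 bits and the vmalloc(ram_pages / 8) allocation is 524288 bytes, i.e. 512 KiB.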
@@ -1089,6 +1214,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1089 case KVM_S390_VM_CRYPTO: 1214 case KVM_S390_VM_CRYPTO:
1090 ret = kvm_s390_vm_set_crypto(kvm, attr); 1215 ret = kvm_s390_vm_set_crypto(kvm, attr);
1091 break; 1216 break;
1217 case KVM_S390_VM_MIGRATION:
1218 ret = kvm_s390_vm_set_migration(kvm, attr);
1219 break;
1092 default: 1220 default:
1093 ret = -ENXIO; 1221 ret = -ENXIO;
1094 break; 1222 break;
@@ -1111,6 +1239,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1111 case KVM_S390_VM_CPU_MODEL: 1239 case KVM_S390_VM_CPU_MODEL:
1112 ret = kvm_s390_get_cpu_model(kvm, attr); 1240 ret = kvm_s390_get_cpu_model(kvm, attr);
1113 break; 1241 break;
1242 case KVM_S390_VM_MIGRATION:
1243 ret = kvm_s390_vm_get_migration(kvm, attr);
1244 break;
1114 default: 1245 default:
1115 ret = -ENXIO; 1246 ret = -ENXIO;
1116 break; 1247 break;
@@ -1178,6 +1309,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1178 break; 1309 break;
1179 } 1310 }
1180 break; 1311 break;
1312 case KVM_S390_VM_MIGRATION:
1313 ret = 0;
1314 break;
1181 default: 1315 default:
1182 ret = -ENXIO; 1316 ret = -ENXIO;
1183 break; 1317 break;
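
Taken together, the three attr hooks above expose migration mode through the KVM_S390_VM_MIGRATION attribute group on the VM file descriptor. A hedged sketch of how userspace might toggle it, assuming the usual kvm_device_attr layout and that <linux/kvm.h> pulls in the s390 attribute constants on the target system; error handling is kept minimal:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int s390_set_migration_mode(int vm_fd, int on)
	{
		struct kvm_device_attr attr = {
			.group = KVM_S390_VM_MIGRATION,
			.attr  = on ? KVM_S390_VM_MIGRATION_START
				    : KVM_S390_VM_MIGRATION_STOP,
		};

		/* probe first: kernels without the group report -ENXIO here */
		if (ioctl(vm_fd, KVM_HAS_DEVICE_ATTR, &attr))
			return -1;
		return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
	}
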
@@ -1285,6 +1419,182 @@ out:
1285 return r; 1419 return r;
1286} 1420}
1287 1421
1422/*
1423 * Base address and length must be sent at the start of each block, therefore
1424 * it's cheaper to send some clean data, as long as it's less than the size of
1425 * two longs.
1426 */
1427#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
1428/* for consistency */
1429#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
1430
1431/*
1432 * This function searches for the next page with dirty CMMA attributes, and
1433 * saves the attributes in the buffer up to either the end of the buffer or
1434 * until a block of at least KVM_S390_MAX_BIT_DISTANCE clean bits is found;
1435 * no trailing clean bytes are saved.
1436 * In case no dirty bits were found, or if CMMA was not enabled or used, the
1437 * output buffer will indicate 0 as length.
1438 */
1439static int kvm_s390_get_cmma_bits(struct kvm *kvm,
1440 struct kvm_s390_cmma_log *args)
1441{
1442 struct kvm_s390_migration_state *s = kvm->arch.migration_state;
1443 unsigned long bufsize, hva, pgstev, i, next, cur;
1444 int srcu_idx, peek, r = 0, rr;
1445 u8 *res;
1446
1447 cur = args->start_gfn;
1448 i = next = pgstev = 0;
1449
1450 if (unlikely(!kvm->arch.use_cmma))
1451 return -ENXIO;
1452 /* Invalid/unsupported flags were specified */
1453 if (args->flags & ~KVM_S390_CMMA_PEEK)
1454 return -EINVAL;
1455 /* Migration mode query, and we are not doing a migration */
1456 peek = !!(args->flags & KVM_S390_CMMA_PEEK);
1457 if (!peek && !s)
1458 return -EINVAL;
1459 /* CMMA is disabled or was not used, or the buffer has length zero */
1460 bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
1461 if (!bufsize || !kvm->mm->context.use_cmma) {
1462 memset(args, 0, sizeof(*args));
1463 return 0;
1464 }
1465
1466 if (!peek) {
1467 /* We are not peeking, and there are no dirty pages */
1468 if (!atomic64_read(&s->dirty_pages)) {
1469 memset(args, 0, sizeof(*args));
1470 return 0;
1471 }
1472 cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
1473 args->start_gfn);
1474 if (cur >= s->bitmap_size) /* nothing found, loop back */
1475 cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
1476 if (cur >= s->bitmap_size) { /* again! (very unlikely) */
1477 memset(args, 0, sizeof(*args));
1478 return 0;
1479 }
1480 next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
1481 }
1482
1483 res = vmalloc(bufsize);
1484 if (!res)
1485 return -ENOMEM;
1486
1487 args->start_gfn = cur;
1488
1489 down_read(&kvm->mm->mmap_sem);
1490 srcu_idx = srcu_read_lock(&kvm->srcu);
1491 while (i < bufsize) {
1492 hva = gfn_to_hva(kvm, cur);
1493 if (kvm_is_error_hva(hva)) {
1494 r = -EFAULT;
1495 break;
1496 }
1497 /* decrement only if we actually flipped the bit to 0 */
1498 if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
1499 atomic64_dec(&s->dirty_pages);
1500 r = get_pgste(kvm->mm, hva, &pgstev);
1501 if (r < 0)
1502 pgstev = 0;
1503 /* save the value */
1504 res[i++] = (pgstev >> 24) & 0x3;
1505 /*
1506 * if the next bit is too far away, stop.
1507 * if we reached the previous "next", find the next one
1508 */
1509 if (!peek) {
1510 if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
1511 break;
1512 if (cur == next)
1513 next = find_next_bit(s->pgste_bitmap,
1514 s->bitmap_size, cur + 1);
1515 /* reached the end of the bitmap or of the buffer, stop */
1516 if ((next >= s->bitmap_size) ||
1517 (next >= args->start_gfn + bufsize))
1518 break;
1519 }
1520 cur++;
1521 }
1522 srcu_read_unlock(&kvm->srcu, srcu_idx);
1523 up_read(&kvm->mm->mmap_sem);
1524 args->count = i;
1525 args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
1526
1527 rr = copy_to_user((void __user *)args->values, res, args->count);
1528 if (rr)
1529 r = -EFAULT;
1530
1531 vfree(res);
1532 return r;
1533}
1534
1535/*
1536 * This function sets the CMMA attributes for the given pages. If the input
1537 * buffer has zero length, no action is taken, otherwise the attributes are
1538 * set and the mm->context.use_cmma flag is set.
1539 */
1540static int kvm_s390_set_cmma_bits(struct kvm *kvm,
1541 const struct kvm_s390_cmma_log *args)
1542{
1543 unsigned long hva, mask, pgstev, i;
1544 uint8_t *bits;
1545 int srcu_idx, r = 0;
1546
1547 mask = args->mask;
1548
1549 if (!kvm->arch.use_cmma)
1550 return -ENXIO;
1551 /* invalid/unsupported flags */
1552 if (args->flags != 0)
1553 return -EINVAL;
1554 /* Enforce sane limit on memory allocation */
1555 if (args->count > KVM_S390_CMMA_SIZE_MAX)
1556 return -EINVAL;
1557 /* Nothing to do */
1558 if (args->count == 0)
1559 return 0;
1560
1561 bits = vmalloc(sizeof(*bits) * args->count);
1562 if (!bits)
1563 return -ENOMEM;
1564
1565 r = copy_from_user(bits, (void __user *)args->values, args->count);
1566 if (r) {
1567 r = -EFAULT;
1568 goto out;
1569 }
1570
1571 down_read(&kvm->mm->mmap_sem);
1572 srcu_idx = srcu_read_lock(&kvm->srcu);
1573 for (i = 0; i < args->count; i++) {
1574 hva = gfn_to_hva(kvm, args->start_gfn + i);
1575 if (kvm_is_error_hva(hva)) {
1576 r = -EFAULT;
1577 break;
1578 }
1579
1580 pgstev = bits[i];
1581 pgstev = pgstev << 24;
1582 mask &= _PGSTE_GPS_USAGE_MASK;
1583 set_pgste_bits(kvm->mm, hva, mask, pgstev);
1584 }
1585 srcu_read_unlock(&kvm->srcu, srcu_idx);
1586 up_read(&kvm->mm->mmap_sem);
1587
1588 if (!kvm->mm->context.use_cmma) {
1589 down_write(&kvm->mm->mmap_sem);
1590 kvm->mm->context.use_cmma = 1;
1591 up_write(&kvm->mm->mmap_sem);
1592 }
1593out:
1594 vfree(bits);
1595 return r;
1596}
1597
1288long kvm_arch_vm_ioctl(struct file *filp, 1598long kvm_arch_vm_ioctl(struct file *filp,
1289 unsigned int ioctl, unsigned long arg) 1599 unsigned int ioctl, unsigned long arg)
1290{ 1600{
@@ -1363,6 +1673,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
1363 r = kvm_s390_set_skeys(kvm, &args); 1673 r = kvm_s390_set_skeys(kvm, &args);
1364 break; 1674 break;
1365 } 1675 }
1676 case KVM_S390_GET_CMMA_BITS: {
1677 struct kvm_s390_cmma_log args;
1678
1679 r = -EFAULT;
1680 if (copy_from_user(&args, argp, sizeof(args)))
1681 break;
1682 r = kvm_s390_get_cmma_bits(kvm, &args);
1683 if (!r) {
1684 r = copy_to_user(argp, &args, sizeof(args));
1685 if (r)
1686 r = -EFAULT;
1687 }
1688 break;
1689 }
1690 case KVM_S390_SET_CMMA_BITS: {
1691 struct kvm_s390_cmma_log args;
1692
1693 r = -EFAULT;
1694 if (copy_from_user(&args, argp, sizeof(args)))
1695 break;
1696 r = kvm_s390_set_cmma_bits(kvm, &args);
1697 break;
1698 }
1366 default: 1699 default:
1367 r = -ENOTTY; 1700 r = -ENOTTY;
1368 } 1701 }
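
For reference, the two new VM ioctls take a struct kvm_s390_cmma_log from the s390 uapi header (start_gfn, count, flags, a remaining/mask union, and a values pointer). A hedged userspace sketch of reading one batch of CMMA values; the helper name, buffer handling, and the assumption that <linux/kvm.h> provides the s390 definitions are ours:

	#include <linux/kvm.h>
	#include <stdint.h>
	#include <sys/ioctl.h>

	static int read_cmma_batch(int vm_fd, uint64_t start_gfn,
				   uint8_t *buf, uint32_t buf_len)
	{
		struct kvm_s390_cmma_log log = {
			.start_gfn = start_gfn,
			.count     = buf_len,
			.flags     = 0,	/* or KVM_S390_CMMA_PEEK to read without clearing */
			.values    = (uint64_t)(unsigned long)buf,
		};

		if (ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log) < 0)
			return -1;
		/*
		 * log.count values were stored, describing pages starting at
		 * log.start_gfn; in migration mode, log.remaining reports how
		 * many dirty entries are still pending.
		 */
		return (int)log.count;
	}
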
@@ -1631,6 +1964,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
1631 kvm_s390_destroy_adapters(kvm); 1964 kvm_s390_destroy_adapters(kvm);
1632 kvm_s390_clear_float_irqs(kvm); 1965 kvm_s390_clear_float_irqs(kvm);
1633 kvm_s390_vsie_destroy(kvm); 1966 kvm_s390_vsie_destroy(kvm);
1967 if (kvm->arch.migration_state) {
1968 vfree(kvm->arch.migration_state->pgste_bitmap);
1969 kfree(kvm->arch.migration_state);
1970 }
1634 KVM_EVENT(3, "vm 0x%pK destroyed", kvm); 1971 KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
1635} 1972}
1636 1973
@@ -1975,7 +2312,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
1975 if (!vcpu->arch.sie_block->cbrlo) 2312 if (!vcpu->arch.sie_block->cbrlo)
1976 return -ENOMEM; 2313 return -ENOMEM;
1977 2314
1978 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
1979 vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; 2315 vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
1980 return 0; 2316 return 0;
1981} 2317}
@@ -2439,7 +2775,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
2439{ 2775{
2440retry: 2776retry:
2441 kvm_s390_vcpu_request_handled(vcpu); 2777 kvm_s390_vcpu_request_handled(vcpu);
2442 if (!vcpu->requests) 2778 if (!kvm_request_pending(vcpu))
2443 return 0; 2779 return 0;
2444 /* 2780 /*
2445 * We use MMU_RELOAD just to re-arm the ipte notifier for the 2781 * We use MMU_RELOAD just to re-arm the ipte notifier for the
@@ -2488,6 +2824,27 @@ retry:
2488 goto retry; 2824 goto retry;
2489 } 2825 }
2490 2826
2827 if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
2828 /*
2829 * Disable CMMA virtualization; we will emulate the ESSA
2830 * instruction manually, in order to provide additional
2831 * functionalities needed for live migration.
2832 */
2833 vcpu->arch.sie_block->ecb2 &= ~ECB2_CMMA;
2834 goto retry;
2835 }
2836
2837 if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
2838 /*
2839 * Re-enable CMMA virtualization if CMMA is available and
2840 * was used.
2841 */
2842 if ((vcpu->kvm->arch.use_cmma) &&
2843 (vcpu->kvm->mm->context.use_cmma))
2844 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
2845 goto retry;
2846 }
2847
2491 /* nothing to do, just clear the request */ 2848 /* nothing to do, just clear the request */
2492 kvm_clear_request(KVM_REQ_UNHALT, vcpu); 2849 kvm_clear_request(KVM_REQ_UNHALT, vcpu);
2493 2850
@@ -2682,6 +3039,9 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
2682 3039
2683static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) 3040static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
2684{ 3041{
3042 struct mcck_volatile_info *mcck_info;
3043 struct sie_page *sie_page;
3044
2685 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", 3045 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
2686 vcpu->arch.sie_block->icptcode); 3046 vcpu->arch.sie_block->icptcode);
2687 trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode); 3047 trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
@@ -2692,6 +3052,15 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
2692 vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14; 3052 vcpu->run->s.regs.gprs[14] = vcpu->arch.sie_block->gg14;
2693 vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15; 3053 vcpu->run->s.regs.gprs[15] = vcpu->arch.sie_block->gg15;
2694 3054
3055 if (exit_reason == -EINTR) {
3056 VCPU_EVENT(vcpu, 3, "%s", "machine check");
3057 sie_page = container_of(vcpu->arch.sie_block,
3058 struct sie_page, sie_block);
3059 mcck_info = &sie_page->mcck_info;
3060 kvm_s390_reinject_machine_check(vcpu, mcck_info);
3061 return 0;
3062 }
3063
2695 if (vcpu->arch.sie_block->icptcode > 0) { 3064 if (vcpu->arch.sie_block->icptcode > 0) {
2696 int rc = kvm_handle_sie_intercept(vcpu); 3065 int rc = kvm_handle_sie_intercept(vcpu);
2697 3066
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 55f5c8457d6d..6fedc8bc7a37 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -397,4 +397,6 @@ static inline int kvm_s390_use_sca_entries(void)
397 */ 397 */
398 return sclp.has_sigpif; 398 return sclp.has_sigpif;
399} 399}
400void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
401 struct mcck_volatile_info *mcck_info);
400#endif 402#endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index e53292a89257..8a1dac793d6b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -24,6 +24,7 @@
24#include <asm/ebcdic.h> 24#include <asm/ebcdic.h>
25#include <asm/sysinfo.h> 25#include <asm/sysinfo.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/page-states.h>
27#include <asm/pgalloc.h> 28#include <asm/pgalloc.h>
28#include <asm/gmap.h> 29#include <asm/gmap.h>
29#include <asm/io.h> 30#include <asm/io.h>
@@ -949,13 +950,72 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
949 return 0; 950 return 0;
950} 951}
951 952
953static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
954{
955 struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
956 int r1, r2, nappended, entries;
957 unsigned long gfn, hva, res, pgstev, ptev;
958 unsigned long *cbrlo;
959
960 /*
961 * We don't need to set SD.FPF.SK to 1 here, because if we have a
962 * machine check here we either handle it or crash
963 */
964
965 kvm_s390_get_regs_rre(vcpu, &r1, &r2);
966 gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
967 hva = gfn_to_hva(vcpu->kvm, gfn);
968 entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
969
970 if (kvm_is_error_hva(hva))
971 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
972
973 nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
974 if (nappended < 0) {
975 res = orc ? 0x10 : 0;
976 vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
977 return 0;
978 }
979 res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
980 /*
981 * Set the block-content state part of the result. 0 means resident, so
982 * nothing to do if the page is valid. 2 is for preserved pages
983 * (non-present and non-zero), and 3 for zero pages (non-present and
984 * zero).
985 */
986 if (ptev & _PAGE_INVALID) {
987 res |= 2;
988 if (pgstev & _PGSTE_GPS_ZERO)
989 res |= 1;
990 }
991 vcpu->run->s.regs.gprs[r1] = res;
992 /*
993 * It is possible that all the normal 511 slots were full, in which case
994 * we will now write in the 512th slot, which is reserved for host use.
995 * In both cases we let the normal essa handling code process all the
996 * slots, including the reserved one, if needed.
997 */
998 if (nappended > 0) {
999 cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo & PAGE_MASK);
1000 cbrlo[entries] = gfn << PAGE_SHIFT;
1001 }
1002
1003 if (orc) {
1004 /* increment only if we are really flipping the bit to 1 */
1005 if (!test_and_set_bit(gfn, ms->pgste_bitmap))
1006 atomic64_inc(&ms->dirty_pages);
1007 }
1008
1009 return nappended;
1010}
1011
952static int handle_essa(struct kvm_vcpu *vcpu) 1012static int handle_essa(struct kvm_vcpu *vcpu)
953{ 1013{
954 /* entries expected to be 1FF */ 1014 /* entries expected to be 1FF */
955 int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; 1015 int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
956 unsigned long *cbrlo; 1016 unsigned long *cbrlo;
957 struct gmap *gmap; 1017 struct gmap *gmap;
958 int i; 1018 int i, orc;
959 1019
960 VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); 1020 VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
961 gmap = vcpu->arch.gmap; 1021 gmap = vcpu->arch.gmap;
@@ -965,12 +1025,45 @@ static int handle_essa(struct kvm_vcpu *vcpu)
965 1025
966 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 1026 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
967 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 1027 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
968 1028 /* Check for invalid operation request code */
969 if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6) 1029 orc = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
1030 if (orc > ESSA_MAX)
970 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 1031 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
971 1032
972 /* Retry the ESSA instruction */ 1033 if (likely(!vcpu->kvm->arch.migration_state)) {
973 kvm_s390_retry_instr(vcpu); 1034 /*
1035 * CMMA is enabled in the KVM settings, but is disabled in
1036 * the SIE block and in the mm_context, and we are not doing
1037 * a migration. Enable CMMA in the mm_context.
1038 * Since we need to take a write lock to write to the context
1039 * to avoid races with storage keys handling, we check if the
1040 * value really needs to be written to; if the value is
1041 * already correct, we do nothing and avoid the lock.
1042 */
1043 if (vcpu->kvm->mm->context.use_cmma == 0) {
1044 down_write(&vcpu->kvm->mm->mmap_sem);
1045 vcpu->kvm->mm->context.use_cmma = 1;
1046 up_write(&vcpu->kvm->mm->mmap_sem);
1047 }
1048 /*
1049 * If we are here, we are supposed to have CMMA enabled in
1050 * the SIE block. Enabling CMMA works on a per-CPU basis,
1051 * while the context use_cmma flag is per process.
1052 * It's possible that the context flag is enabled and the
1053 * SIE flag is not, so we set the flag always; if it was
1054 * already set, nothing changes, otherwise we enable it
1055 * on this CPU too.
1056 */
1057 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
1058 /* Retry the ESSA instruction */
1059 kvm_s390_retry_instr(vcpu);
1060 } else {
1061 /* Account for the possible extra cbrl entry */
1062 i = do_essa(vcpu, orc);
1063 if (i < 0)
1064 return i;
1065 entries += i;
1066 }
974 vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ 1067 vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
975 cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); 1068 cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
976 down_read(&gmap->mm->mmap_sem); 1069 down_read(&gmap->mm->mmap_sem);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4719ecb9ab42..715c19c45d9a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -26,16 +26,21 @@
26 26
27struct vsie_page { 27struct vsie_page {
28 struct kvm_s390_sie_block scb_s; /* 0x0000 */ 28 struct kvm_s390_sie_block scb_s; /* 0x0000 */
29 /*
30 * the backup info for machine check. ensure it's at
31 * the same offset as that in struct sie_page!
32 */
33 struct mcck_volatile_info mcck_info; /* 0x0200 */
29 /* the pinned original scb */ 34 /* the pinned original scb */
30 struct kvm_s390_sie_block *scb_o; /* 0x0200 */ 35 struct kvm_s390_sie_block *scb_o; /* 0x0218 */
31 /* the shadow gmap in use by the vsie_page */ 36 /* the shadow gmap in use by the vsie_page */
32 struct gmap *gmap; /* 0x0208 */ 37 struct gmap *gmap; /* 0x0220 */
33 /* address of the last reported fault to guest2 */ 38 /* address of the last reported fault to guest2 */
34 unsigned long fault_addr; /* 0x0210 */ 39 unsigned long fault_addr; /* 0x0228 */
35 __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ 40 __u8 reserved[0x0700 - 0x0230]; /* 0x0230 */
36 struct kvm_s390_crypto_cb crycb; /* 0x0700 */ 41 struct kvm_s390_crypto_cb crycb; /* 0x0700 */
37 __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ 42 __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
38} __packed; 43};
39 44
40/* trigger a validity icpt for the given scb */ 45/* trigger a validity icpt for the given scb */
41static int set_validity_icpt(struct kvm_s390_sie_block *scb, 46static int set_validity_icpt(struct kvm_s390_sie_block *scb,
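
The layout comment above is load-bearing: do_vsie_run() (a few hunks below) recovers mcck_info by container_of()-ing the shadow scb into a struct sie_page, so vsie_page and sie_page must keep the field at the same offset. A compile-time check along these lines could guard that assumption; this is a hedged suggestion, not something the patch adds:

	BUILD_BUG_ON(offsetof(struct vsie_page, mcck_info) !=
		     offsetof(struct sie_page, mcck_info));
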
@@ -801,6 +806,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
801{ 806{
802 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; 807 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
803 struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; 808 struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
809 struct mcck_volatile_info *mcck_info;
810 struct sie_page *sie_page;
804 int rc; 811 int rc;
805 812
806 handle_last_fault(vcpu, vsie_page); 813 handle_last_fault(vcpu, vsie_page);
@@ -822,6 +829,14 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
822 local_irq_enable(); 829 local_irq_enable();
823 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 830 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
824 831
832 if (rc == -EINTR) {
833 VCPU_EVENT(vcpu, 3, "%s", "machine check");
834 sie_page = container_of(scb_s, struct sie_page, sie_block);
835 mcck_info = &sie_page->mcck_info;
836 kvm_s390_reinject_machine_check(vcpu, mcck_info);
837 return 0;
838 }
839
825 if (rc > 0) 840 if (rc > 0)
826 rc = 0; /* we could still have an icpt */ 841 rc = 0; /* we could still have an icpt */
827 else if (rc == -EFAULT) 842 else if (rc == -EFAULT)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605eb1dfb..1588e9e3dc01 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,28 +48,31 @@
48#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS 48#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
49 49
50/* x86-specific vcpu->requests bit members */ 50/* x86-specific vcpu->requests bit members */
51#define KVM_REQ_MIGRATE_TIMER 8 51#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
52#define KVM_REQ_REPORT_TPR_ACCESS 9 52#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
53#define KVM_REQ_TRIPLE_FAULT 10 53#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2)
54#define KVM_REQ_MMU_SYNC 11 54#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3)
55#define KVM_REQ_CLOCK_UPDATE 12 55#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4)
56#define KVM_REQ_EVENT 14 56#define KVM_REQ_EVENT KVM_ARCH_REQ(6)
57#define KVM_REQ_APF_HALT 15 57#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7)
58#define KVM_REQ_STEAL_UPDATE 16 58#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8)
59#define KVM_REQ_NMI 17 59#define KVM_REQ_NMI KVM_ARCH_REQ(9)
60#define KVM_REQ_PMU 18 60#define KVM_REQ_PMU KVM_ARCH_REQ(10)
61#define KVM_REQ_PMI 19 61#define KVM_REQ_PMI KVM_ARCH_REQ(11)
62#define KVM_REQ_SMI 20 62#define KVM_REQ_SMI KVM_ARCH_REQ(12)
63#define KVM_REQ_MASTERCLOCK_UPDATE 21 63#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
64#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 64#define KVM_REQ_MCLOCK_INPROGRESS \
65#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 65 KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
66#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 66#define KVM_REQ_SCAN_IOAPIC \
67#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 67 KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
68#define KVM_REQ_HV_CRASH 26 68#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16)
69#define KVM_REQ_IOAPIC_EOI_EXIT 27 69#define KVM_REQ_APIC_PAGE_RELOAD \
70#define KVM_REQ_HV_RESET 28 70 KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
71#define KVM_REQ_HV_EXIT 29 71#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18)
72#define KVM_REQ_HV_STIMER 30 72#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19)
73#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
74#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
75#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
73 76
74#define CR0_RESERVED_BITS \ 77#define CR0_RESERVED_BITS \
75 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 78 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
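
The renumbering above leans on the generic helpers introduced by the VCPU-request overhaul. Paraphrased (the real macros in include/linux/kvm_host.h also BUILD_BUG_ON out-of-range numbers), they boil down to:

	#define KVM_REQUEST_ARCH_BASE		8
	#define KVM_ARCH_REQ_FLAGS(nr, flags)	(((nr) + KVM_REQUEST_ARCH_BASE) | (flags))
	#define KVM_ARCH_REQ(nr)		KVM_ARCH_REQ_FLAGS(nr, 0)

	/* so KVM_REQ_MIGRATE_TIMER == KVM_ARCH_REQ(0) still evaluates to bit 8,
	 * and KVM_REQ_EVENT == KVM_ARCH_REQ(6) to bit 14, matching the literal
	 * values on the old side of the hunk. */
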
@@ -254,7 +257,8 @@ union kvm_mmu_page_role {
254 unsigned cr0_wp:1; 257 unsigned cr0_wp:1;
255 unsigned smep_andnot_wp:1; 258 unsigned smep_andnot_wp:1;
256 unsigned smap_andnot_wp:1; 259 unsigned smap_andnot_wp:1;
257 unsigned :8; 260 unsigned ad_disabled:1;
261 unsigned :7;
258 262
259 /* 263 /*
260 * This is left at the top of the word so that 264 * This is left at the top of the word so that
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d406894cd9a2..5573c75f8e4c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -426,6 +426,8 @@
426#define MSR_IA32_TSC_ADJUST 0x0000003b 426#define MSR_IA32_TSC_ADJUST 0x0000003b
427#define MSR_IA32_BNDCFGS 0x00000d90 427#define MSR_IA32_BNDCFGS 0x00000d90
428 428
429#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
430
429#define MSR_IA32_XSS 0x00000da0 431#define MSR_IA32_XSS 0x00000da0
430 432
431#define FEATURE_CONTROL_LOCKED (1<<0) 433#define FEATURE_CONTROL_LOCKED (1<<0)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a6fd40aade7c..da6728383052 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -144,6 +144,14 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
144 return best && (best->ebx & bit(X86_FEATURE_RTM)); 144 return best && (best->ebx & bit(X86_FEATURE_RTM));
145} 145}
146 146
147static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
148{
149 struct kvm_cpuid_entry2 *best;
150
151 best = kvm_find_cpuid_entry(vcpu, 7, 0);
152 return best && (best->ebx & bit(X86_FEATURE_MPX));
153}
154
147static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) 155static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
148{ 156{
149 struct kvm_cpuid_entry2 *best; 157 struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 80890dee66ce..fb0055953fbc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -900,7 +900,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
900 if (rc != X86EMUL_CONTINUE) \ 900 if (rc != X86EMUL_CONTINUE) \
901 goto done; \ 901 goto done; \
902 ctxt->_eip += sizeof(_type); \ 902 ctxt->_eip += sizeof(_type); \
903 _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ 903 memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
904 ctxt->fetch.ptr += sizeof(_type); \ 904 ctxt->fetch.ptr += sizeof(_type); \
905 _x; \ 905 _x; \
906}) 906})
@@ -3942,6 +3942,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
3942} 3942}
3943 3943
3944/* 3944/*
3945 * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
3946 * and restore MXCSR.
3947 */
3948static size_t __fxstate_size(int nregs)
3949{
3950 return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
3951}
3952
3953static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
3954{
3955 bool cr4_osfxsr;
3956 if (ctxt->mode == X86EMUL_MODE_PROT64)
3957 return __fxstate_size(16);
3958
3959 cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
3960 return __fxstate_size(cr4_osfxsr ? 8 : 0);
3961}
3962
3963/*
3945 * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, 3964 * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
3946 * 1) 16 bit mode 3965 * 1) 16 bit mode
3947 * 2) 32 bit mode 3966 * 2) 32 bit mode
@@ -3962,7 +3981,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt)
3962static int em_fxsave(struct x86_emulate_ctxt *ctxt) 3981static int em_fxsave(struct x86_emulate_ctxt *ctxt)
3963{ 3982{
3964 struct fxregs_state fx_state; 3983 struct fxregs_state fx_state;
3965 size_t size;
3966 int rc; 3984 int rc;
3967 3985
3968 rc = check_fxsr(ctxt); 3986 rc = check_fxsr(ctxt);
@@ -3978,68 +3996,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
3978 if (rc != X86EMUL_CONTINUE) 3996 if (rc != X86EMUL_CONTINUE)
3979 return rc; 3997 return rc;
3980 3998
3981 if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) 3999 return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
3982 size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); 4000 fxstate_size(ctxt));
3983 else
3984 size = offsetof(struct fxregs_state, xmm_space[0]);
3985
3986 return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
3987}
3988
3989static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
3990 struct fxregs_state *new)
3991{
3992 int rc = X86EMUL_CONTINUE;
3993 struct fxregs_state old;
3994
3995 rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
3996 if (rc != X86EMUL_CONTINUE)
3997 return rc;
3998
3999 /*
4000 * 64 bit host will restore XMM 8-15, which is not correct on non-64
4001 * bit guests. Load the current values in order to preserve 64 bit
4002 * XMMs after fxrstor.
4003 */
4004#ifdef CONFIG_X86_64
4005 /* XXX: accessing XMM 8-15 very awkwardly */
4006 memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
4007#endif
4008
4009 /*
4010 * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
4011 * does save and restore MXCSR.
4012 */
4013 if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
4014 memcpy(new->xmm_space, old.xmm_space, 8 * 16);
4015
4016 return rc;
4017} 4001}
4018 4002
4019static int em_fxrstor(struct x86_emulate_ctxt *ctxt) 4003static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
4020{ 4004{
4021 struct fxregs_state fx_state; 4005 struct fxregs_state fx_state;
4022 int rc; 4006 int rc;
4007 size_t size;
4023 4008
4024 rc = check_fxsr(ctxt); 4009 rc = check_fxsr(ctxt);
4025 if (rc != X86EMUL_CONTINUE) 4010 if (rc != X86EMUL_CONTINUE)
4026 return rc; 4011 return rc;
4027 4012
4028 rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); 4013 ctxt->ops->get_fpu(ctxt);
4029 if (rc != X86EMUL_CONTINUE)
4030 return rc;
4031 4014
4032 if (fx_state.mxcsr >> 16) 4015 size = fxstate_size(ctxt);
4033 return emulate_gp(ctxt, 0); 4016 if (size < __fxstate_size(16)) {
4017 rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
4018 if (rc != X86EMUL_CONTINUE)
4019 goto out;
4020 }
4034 4021
4035 ctxt->ops->get_fpu(ctxt); 4022 rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
4023 if (rc != X86EMUL_CONTINUE)
4024 goto out;
4036 4025
4037 if (ctxt->mode < X86EMUL_MODE_PROT64) 4026 if (fx_state.mxcsr >> 16) {
4038 rc = fxrstor_fixup(ctxt, &fx_state); 4027 rc = emulate_gp(ctxt, 0);
4028 goto out;
4029 }
4039 4030
4040 if (rc == X86EMUL_CONTINUE) 4031 if (rc == X86EMUL_CONTINUE)
4041 rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); 4032 rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
4042 4033
4034out:
4043 ctxt->ops->put_fpu(ctxt); 4035 ctxt->ops->put_fpu(ctxt);
4044 4036
4045 return rc; 4037 return rc;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d24c8742d9b0..2819d4c123eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1495,6 +1495,7 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
1495 1495
1496static void cancel_hv_timer(struct kvm_lapic *apic) 1496static void cancel_hv_timer(struct kvm_lapic *apic)
1497{ 1497{
1498 WARN_ON(!apic->lapic_timer.hv_timer_in_use);
1498 preempt_disable(); 1499 preempt_disable();
1499 kvm_x86_ops->cancel_hv_timer(apic->vcpu); 1500 kvm_x86_ops->cancel_hv_timer(apic->vcpu);
1500 apic->lapic_timer.hv_timer_in_use = false; 1501 apic->lapic_timer.hv_timer_in_use = false;
@@ -1503,25 +1504,56 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
1503 1504
1504static bool start_hv_timer(struct kvm_lapic *apic) 1505static bool start_hv_timer(struct kvm_lapic *apic)
1505{ 1506{
1506 u64 tscdeadline = apic->lapic_timer.tscdeadline; 1507 struct kvm_timer *ktimer = &apic->lapic_timer;
1508 int r;
1507 1509
1508 if ((atomic_read(&apic->lapic_timer.pending) && 1510 if (!kvm_x86_ops->set_hv_timer)
1509 !apic_lvtt_period(apic)) || 1511 return false;
1510 kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { 1512
1511 if (apic->lapic_timer.hv_timer_in_use) 1513 if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
1512 cancel_hv_timer(apic); 1514 return false;
1513 } else {
1514 apic->lapic_timer.hv_timer_in_use = true;
1515 hrtimer_cancel(&apic->lapic_timer.timer);
1516 1515
1517 /* In case the sw timer triggered in the window */ 1516 r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
1518 if (atomic_read(&apic->lapic_timer.pending) && 1517 if (r < 0)
1519 !apic_lvtt_period(apic)) 1518 return false;
1520 cancel_hv_timer(apic); 1519
1520 ktimer->hv_timer_in_use = true;
1521 hrtimer_cancel(&ktimer->timer);
1522
1523 /*
1524 * Also recheck ktimer->pending, in case the sw timer triggered in
1525 * the window. For periodic timer, leave the hv timer running for
1526 * simplicity, and the deadline will be recomputed on the next vmexit.
1527 */
1528 if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
1529 if (r)
1530 apic_timer_expired(apic);
1531 return false;
1521 } 1532 }
1522 trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, 1533
1523 apic->lapic_timer.hv_timer_in_use); 1534 trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
1524 return apic->lapic_timer.hv_timer_in_use; 1535 return true;
1536}
1537
1538static void start_sw_timer(struct kvm_lapic *apic)
1539{
1540 struct kvm_timer *ktimer = &apic->lapic_timer;
1541 if (apic->lapic_timer.hv_timer_in_use)
1542 cancel_hv_timer(apic);
1543 if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
1544 return;
1545
1546 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
1547 start_sw_period(apic);
1548 else if (apic_lvtt_tscdeadline(apic))
1549 start_sw_tscdeadline(apic);
1550 trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
1551}
1552
1553static void restart_apic_timer(struct kvm_lapic *apic)
1554{
1555 if (!start_hv_timer(apic))
1556 start_sw_timer(apic);
1525} 1557}
1526 1558
1527void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) 1559void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
@@ -1535,19 +1567,14 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
1535 1567
1536 if (apic_lvtt_period(apic) && apic->lapic_timer.period) { 1568 if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
1537 advance_periodic_target_expiration(apic); 1569 advance_periodic_target_expiration(apic);
1538 if (!start_hv_timer(apic)) 1570 restart_apic_timer(apic);
1539 start_sw_period(apic);
1540 } 1571 }
1541} 1572}
1542EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); 1573EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
1543 1574
1544void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) 1575void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
1545{ 1576{
1546 struct kvm_lapic *apic = vcpu->arch.apic; 1577 restart_apic_timer(vcpu->arch.apic);
1547
1548 WARN_ON(apic->lapic_timer.hv_timer_in_use);
1549
1550 start_hv_timer(apic);
1551} 1578}
1552EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); 1579EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
1553 1580
@@ -1556,33 +1583,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
1556 struct kvm_lapic *apic = vcpu->arch.apic; 1583 struct kvm_lapic *apic = vcpu->arch.apic;
1557 1584
1558 /* Possibly the TSC deadline timer is not enabled yet */ 1585 /* Possibly the TSC deadline timer is not enabled yet */
1559 if (!apic->lapic_timer.hv_timer_in_use) 1586 if (apic->lapic_timer.hv_timer_in_use)
1560 return; 1587 start_sw_timer(apic);
1561 1588}
1562 cancel_hv_timer(apic); 1589EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
1563 1590
1564 if (atomic_read(&apic->lapic_timer.pending)) 1591void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
1565 return; 1592{
1593 struct kvm_lapic *apic = vcpu->arch.apic;
1566 1594
1567 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) 1595 WARN_ON(!apic->lapic_timer.hv_timer_in_use);
1568 start_sw_period(apic); 1596 restart_apic_timer(apic);
1569 else if (apic_lvtt_tscdeadline(apic))
1570 start_sw_tscdeadline(apic);
1571} 1597}
1572EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
1573 1598
1574static void start_apic_timer(struct kvm_lapic *apic) 1599static void start_apic_timer(struct kvm_lapic *apic)
1575{ 1600{
1576 atomic_set(&apic->lapic_timer.pending, 0); 1601 atomic_set(&apic->lapic_timer.pending, 0);
1577 1602
1578 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { 1603 if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
1579 if (set_target_expiration(apic) && 1604 && !set_target_expiration(apic))
1580 !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) 1605 return;
1581 start_sw_period(apic); 1606
1582 } else if (apic_lvtt_tscdeadline(apic)) { 1607 restart_apic_timer(apic);
1583 if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
1584 start_sw_tscdeadline(apic);
1585 }
1586} 1608}
1587 1609
1588static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 1610static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -1813,16 +1835,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
1813 * LAPIC interface 1835 * LAPIC interface
1814 *---------------------------------------------------------------------- 1836 *----------------------------------------------------------------------
1815 */ 1837 */
1816u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
1817{
1818 struct kvm_lapic *apic = vcpu->arch.apic;
1819
1820 if (!lapic_in_kernel(vcpu))
1821 return 0;
1822
1823 return apic->lapic_timer.tscdeadline;
1824}
1825
1826u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) 1838u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1827{ 1839{
1828 struct kvm_lapic *apic = vcpu->arch.apic; 1840 struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index bcbe811f3b97..29caa2c3dff9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -87,7 +87,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
87int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); 87int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
88int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 88int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
89 89
90u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
91u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 90u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
92void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); 91void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
93 92
@@ -216,4 +215,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
216void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); 215void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
217void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); 216void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
218bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); 217bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
218void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
219#endif 219#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb8225969255..aafd399cf8c6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -183,13 +183,13 @@ static u64 __read_mostly shadow_user_mask;
183static u64 __read_mostly shadow_accessed_mask; 183static u64 __read_mostly shadow_accessed_mask;
184static u64 __read_mostly shadow_dirty_mask; 184static u64 __read_mostly shadow_dirty_mask;
185static u64 __read_mostly shadow_mmio_mask; 185static u64 __read_mostly shadow_mmio_mask;
186static u64 __read_mostly shadow_mmio_value;
186static u64 __read_mostly shadow_present_mask; 187static u64 __read_mostly shadow_present_mask;
187 188
188/* 189/*
189 * The mask/value to distinguish a PTE that has been marked not-present for 190 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
190 * access tracking purposes. 191 * Non-present SPTEs with shadow_acc_track_value set are in place for access
191 * The mask would be either 0 if access tracking is disabled, or 192 * tracking.
192 * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
193 */ 193 */
194static u64 __read_mostly shadow_acc_track_mask; 194static u64 __read_mostly shadow_acc_track_mask;
195static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; 195static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
@@ -207,16 +207,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF
207static void mmu_spte_set(u64 *sptep, u64 spte); 207static void mmu_spte_set(u64 *sptep, u64 spte);
208static void mmu_free_roots(struct kvm_vcpu *vcpu); 208static void mmu_free_roots(struct kvm_vcpu *vcpu);
209 209
210void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 210void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
211{ 211{
212 BUG_ON((mmio_mask & mmio_value) != mmio_value);
213 shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
212 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; 214 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
213} 215}
214EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); 216EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
215 217
218static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
219{
220 return sp->role.ad_disabled;
221}
222
223static inline bool spte_ad_enabled(u64 spte)
224{
225 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
226 return !(spte & shadow_acc_track_value);
227}
228
229static inline u64 spte_shadow_accessed_mask(u64 spte)
230{
231 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
232 return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
233}
234
235static inline u64 spte_shadow_dirty_mask(u64 spte)
236{
237 MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
238 return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
239}
240
216static inline bool is_access_track_spte(u64 spte) 241static inline bool is_access_track_spte(u64 spte)
217{ 242{
218 /* Always false if shadow_acc_track_mask is zero. */ 243 return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
219 return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
220} 244}
221 245
222/* 246/*
@@ -270,7 +294,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
270 u64 mask = generation_mmio_spte_mask(gen); 294 u64 mask = generation_mmio_spte_mask(gen);
271 295
272 access &= ACC_WRITE_MASK | ACC_USER_MASK; 296 access &= ACC_WRITE_MASK | ACC_USER_MASK;
273 mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; 297 mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
274 298
275 trace_mark_mmio_spte(sptep, gfn, access, gen); 299 trace_mark_mmio_spte(sptep, gfn, access, gen);
276 mmu_spte_set(sptep, mask); 300 mmu_spte_set(sptep, mask);
@@ -278,7 +302,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
278 302
279static bool is_mmio_spte(u64 spte) 303static bool is_mmio_spte(u64 spte)
280{ 304{
281 return (spte & shadow_mmio_mask) == shadow_mmio_mask; 305 return (spte & shadow_mmio_mask) == shadow_mmio_value;
282} 306}
283 307
284static gfn_t get_mmio_spte_gfn(u64 spte) 308static gfn_t get_mmio_spte_gfn(u64 spte)
@@ -315,12 +339,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
315 return likely(kvm_gen == spte_gen); 339 return likely(kvm_gen == spte_gen);
316} 340}
317 341
342/*
343 * Sets the shadow PTE masks used by the MMU.
344 *
345 * Assumptions:
346 * - Setting either @accessed_mask or @dirty_mask requires setting both
347 * - At least one of @accessed_mask or @acc_track_mask must be set
348 */
318void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 349void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
319 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 350 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
320 u64 acc_track_mask) 351 u64 acc_track_mask)
321{ 352{
322 if (acc_track_mask != 0) 353 BUG_ON(!dirty_mask != !accessed_mask);
323 acc_track_mask |= SPTE_SPECIAL_MASK; 354 BUG_ON(!accessed_mask && !acc_track_mask);
355 BUG_ON(acc_track_mask & shadow_acc_track_value);
324 356
325 shadow_user_mask = user_mask; 357 shadow_user_mask = user_mask;
326 shadow_accessed_mask = accessed_mask; 358 shadow_accessed_mask = accessed_mask;
@@ -329,7 +361,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
329 shadow_x_mask = x_mask; 361 shadow_x_mask = x_mask;
330 shadow_present_mask = p_mask; 362 shadow_present_mask = p_mask;
331 shadow_acc_track_mask = acc_track_mask; 363 shadow_acc_track_mask = acc_track_mask;
332 WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
333} 364}
334EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 365EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
335 366
@@ -549,7 +580,7 @@ static bool spte_has_volatile_bits(u64 spte)
549 is_access_track_spte(spte)) 580 is_access_track_spte(spte))
550 return true; 581 return true;
551 582
552 if (shadow_accessed_mask) { 583 if (spte_ad_enabled(spte)) {
553 if ((spte & shadow_accessed_mask) == 0 || 584 if ((spte & shadow_accessed_mask) == 0 ||
554 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) 585 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
555 return true; 586 return true;
@@ -560,14 +591,17 @@ static bool spte_has_volatile_bits(u64 spte)
560 591
561static bool is_accessed_spte(u64 spte) 592static bool is_accessed_spte(u64 spte)
562{ 593{
563 return shadow_accessed_mask ? spte & shadow_accessed_mask 594 u64 accessed_mask = spte_shadow_accessed_mask(spte);
564 : !is_access_track_spte(spte); 595
596 return accessed_mask ? spte & accessed_mask
597 : !is_access_track_spte(spte);
565} 598}
566 599
567static bool is_dirty_spte(u64 spte) 600static bool is_dirty_spte(u64 spte)
568{ 601{
569 return shadow_dirty_mask ? spte & shadow_dirty_mask 602 u64 dirty_mask = spte_shadow_dirty_mask(spte);
570 : spte & PT_WRITABLE_MASK; 603
604 return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
571} 605}
572 606
573/* Rules for using mmu_spte_set: 607/* Rules for using mmu_spte_set:
@@ -707,10 +741,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
707 741
708static u64 mark_spte_for_access_track(u64 spte) 742static u64 mark_spte_for_access_track(u64 spte)
709{ 743{
710 if (shadow_accessed_mask != 0) 744 if (spte_ad_enabled(spte))
711 return spte & ~shadow_accessed_mask; 745 return spte & ~shadow_accessed_mask;
712 746
713 if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) 747 if (is_access_track_spte(spte))
714 return spte; 748 return spte;
715 749
716 /* 750 /*
@@ -729,7 +763,6 @@ static u64 mark_spte_for_access_track(u64 spte)
729 spte |= (spte & shadow_acc_track_saved_bits_mask) << 763 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
730 shadow_acc_track_saved_bits_shift; 764 shadow_acc_track_saved_bits_shift;
731 spte &= ~shadow_acc_track_mask; 765 spte &= ~shadow_acc_track_mask;
732 spte |= shadow_acc_track_value;
733 766
734 return spte; 767 return spte;
735} 768}
@@ -741,6 +774,7 @@ static u64 restore_acc_track_spte(u64 spte)
741 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) 774 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
742 & shadow_acc_track_saved_bits_mask; 775 & shadow_acc_track_saved_bits_mask;
743 776
777 WARN_ON_ONCE(spte_ad_enabled(spte));
744 WARN_ON_ONCE(!is_access_track_spte(spte)); 778 WARN_ON_ONCE(!is_access_track_spte(spte));
745 779
746 new_spte &= ~shadow_acc_track_mask; 780 new_spte &= ~shadow_acc_track_mask;
@@ -759,7 +793,7 @@ static bool mmu_spte_age(u64 *sptep)
759 if (!is_accessed_spte(spte)) 793 if (!is_accessed_spte(spte))
760 return false; 794 return false;
761 795
762 if (shadow_accessed_mask) { 796 if (spte_ad_enabled(spte)) {
763 clear_bit((ffs(shadow_accessed_mask) - 1), 797 clear_bit((ffs(shadow_accessed_mask) - 1),
764 (unsigned long *)sptep); 798 (unsigned long *)sptep);
765 } else { 799 } else {
@@ -1390,6 +1424,22 @@ static bool spte_clear_dirty(u64 *sptep)
1390 return mmu_spte_update(sptep, spte); 1424 return mmu_spte_update(sptep, spte);
1391} 1425}
1392 1426
1427static bool wrprot_ad_disabled_spte(u64 *sptep)
1428{
1429 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1430 (unsigned long *)sptep);
1431 if (was_writable)
1432 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1433
1434 return was_writable;
1435}
1436
1437/*
1438 * Gets the GFN ready for another round of dirty logging by clearing the
1439 * - D bit on ad-enabled SPTEs, and
1440 * - W bit on ad-disabled SPTEs.
1441 * Returns true iff any D or W bits were cleared.
1442 */
1393static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) 1443static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1394{ 1444{
1395 u64 *sptep; 1445 u64 *sptep;
@@ -1397,7 +1447,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1397 bool flush = false; 1447 bool flush = false;
1398 1448
1399 for_each_rmap_spte(rmap_head, &iter, sptep) 1449 for_each_rmap_spte(rmap_head, &iter, sptep)
1400 flush |= spte_clear_dirty(sptep); 1450 if (spte_ad_enabled(*sptep))
1451 flush |= spte_clear_dirty(sptep);
1452 else
1453 flush |= wrprot_ad_disabled_spte(sptep);
1401 1454
1402 return flush; 1455 return flush;
1403} 1456}
@@ -1420,7 +1473,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1420 bool flush = false; 1473 bool flush = false;
1421 1474
1422 for_each_rmap_spte(rmap_head, &iter, sptep) 1475 for_each_rmap_spte(rmap_head, &iter, sptep)
1423 flush |= spte_set_dirty(sptep); 1476 if (spte_ad_enabled(*sptep))
1477 flush |= spte_set_dirty(sptep);
1424 1478
1425 return flush; 1479 return flush;
1426} 1480}
@@ -1452,7 +1506,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1452} 1506}
1453 1507
1454/** 1508/**
1455 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages 1509 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1510 * protect the page if the D-bit isn't supported.
1456 * @kvm: kvm instance 1511 * @kvm: kvm instance
1457 * @slot: slot to clear D-bit 1512 * @slot: slot to clear D-bit
1458 * @gfn_offset: start of the BITS_PER_LONG pages we care about 1513 * @gfn_offset: start of the BITS_PER_LONG pages we care about
@@ -1766,18 +1821,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1766 u64 *sptep; 1821 u64 *sptep;
1767 struct rmap_iterator iter; 1822 struct rmap_iterator iter;
1768 1823
1769 /*
1770 * If there's no access bit in the secondary pte set by the hardware and
1771 * fast access tracking is also not enabled, it's up to gup-fast/gup to
1772 * set the access bit in the primary pte or in the page structure.
1773 */
1774 if (!shadow_accessed_mask && !shadow_acc_track_mask)
1775 goto out;
1776
1777 for_each_rmap_spte(rmap_head, &iter, sptep) 1824 for_each_rmap_spte(rmap_head, &iter, sptep)
1778 if (is_accessed_spte(*sptep)) 1825 if (is_accessed_spte(*sptep))
1779 return 1; 1826 return 1;
1780out:
1781 return 0; 1827 return 0;
1782} 1828}
1783 1829
@@ -1798,18 +1844,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1798 1844
1799int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 1845int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1800{ 1846{
1801 /*
1802 * In case of absence of EPT Access and Dirty Bits supports,
1803 * emulate the accessed bit for EPT, by checking if this page has
1804 * an EPT mapping, and clearing it if it does. On the next access,
1805 * a new EPT mapping will be established.
1806 * This has some overhead, but not as much as the cost of swapping
1807 * out actively used pages or breaking up actively used hugepages.
1808 */
1809 if (!shadow_accessed_mask && !shadow_acc_track_mask)
1810 return kvm_handle_hva_range(kvm, start, end, 0,
1811 kvm_unmap_rmapp);
1812
1813 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); 1847 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
1814} 1848}
1815 1849
@@ -2398,7 +2432,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2398 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2432 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2399 2433
2400 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | 2434 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2401 shadow_user_mask | shadow_x_mask | shadow_accessed_mask; 2435 shadow_user_mask | shadow_x_mask;
2436
2437 if (sp_ad_disabled(sp))
2438 spte |= shadow_acc_track_value;
2439 else
2440 spte |= shadow_accessed_mask;
2402 2441
2403 mmu_spte_set(sptep, spte); 2442 mmu_spte_set(sptep, spte);
2404 2443
@@ -2666,10 +2705,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2666{ 2705{
2667 u64 spte = 0; 2706 u64 spte = 0;
2668 int ret = 0; 2707 int ret = 0;
2708 struct kvm_mmu_page *sp;
2669 2709
2670 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) 2710 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2671 return 0; 2711 return 0;
2672 2712
2713 sp = page_header(__pa(sptep));
2714 if (sp_ad_disabled(sp))
2715 spte |= shadow_acc_track_value;
2716
2673 /* 2717 /*
2674 * For the EPT case, shadow_present_mask is 0 if hardware 2718 * For the EPT case, shadow_present_mask is 0 if hardware
2675 * supports exec-only page table entries. In that case, 2719 * supports exec-only page table entries. In that case,
@@ -2678,7 +2722,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2678 */ 2722 */
2679 spte |= shadow_present_mask; 2723 spte |= shadow_present_mask;
2680 if (!speculative) 2724 if (!speculative)
2681 spte |= shadow_accessed_mask; 2725 spte |= spte_shadow_accessed_mask(spte);
2682 2726
2683 if (pte_access & ACC_EXEC_MASK) 2727 if (pte_access & ACC_EXEC_MASK)
2684 spte |= shadow_x_mask; 2728 spte |= shadow_x_mask;
@@ -2735,7 +2779,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2735 2779
2736 if (pte_access & ACC_WRITE_MASK) { 2780 if (pte_access & ACC_WRITE_MASK) {
2737 kvm_vcpu_mark_page_dirty(vcpu, gfn); 2781 kvm_vcpu_mark_page_dirty(vcpu, gfn);
2738 spte |= shadow_dirty_mask; 2782 spte |= spte_shadow_dirty_mask(spte);
2739 } 2783 }
2740 2784
2741 if (speculative) 2785 if (speculative)
@@ -2877,16 +2921,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2877{ 2921{
2878 struct kvm_mmu_page *sp; 2922 struct kvm_mmu_page *sp;
2879 2923
2924 sp = page_header(__pa(sptep));
2925
2880 /* 2926 /*
2881 * Since it's no accessed bit on EPT, it's no way to 2927 * Without accessed bits, there's no way to distinguish between
2882 * distinguish between actually accessed translations 2928 * actually accessed translations and prefetched, so disable pte
2883 * and prefetched, so disable pte prefetch if EPT is 2929 * prefetch if accessed bits aren't available.
2884 * enabled.
2885 */ 2930 */
2886 if (!shadow_accessed_mask) 2931 if (sp_ad_disabled(sp))
2887 return; 2932 return;
2888 2933
2889 sp = page_header(__pa(sptep));
2890 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2934 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2891 return; 2935 return;
2892 2936
@@ -4290,6 +4334,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4290 4334
4291 context->base_role.word = 0; 4335 context->base_role.word = 0;
4292 context->base_role.smm = is_smm(vcpu); 4336 context->base_role.smm = is_smm(vcpu);
4337 context->base_role.ad_disabled = (shadow_accessed_mask == 0);
4293 context->page_fault = tdp_page_fault; 4338 context->page_fault = tdp_page_fault;
4294 context->sync_page = nonpaging_sync_page; 4339 context->sync_page = nonpaging_sync_page;
4295 context->invlpg = nonpaging_invlpg; 4340 context->invlpg = nonpaging_invlpg;
@@ -4377,6 +4422,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4377 context->root_level = context->shadow_root_level; 4422 context->root_level = context->shadow_root_level;
4378 context->root_hpa = INVALID_PAGE; 4423 context->root_hpa = INVALID_PAGE;
4379 context->direct_map = false; 4424 context->direct_map = false;
4425 context->base_role.ad_disabled = !accessed_dirty;
4380 4426
4381 update_permission_bitmask(vcpu, context, true); 4427 update_permission_bitmask(vcpu, context, true);
4382 update_pkru_bitmask(vcpu, context, true); 4428 update_pkru_bitmask(vcpu, context, true);
@@ -4636,6 +4682,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
4636 mask.smep_andnot_wp = 1; 4682 mask.smep_andnot_wp = 1;
4637 mask.smap_andnot_wp = 1; 4683 mask.smap_andnot_wp = 1;
4638 mask.smm = 1; 4684 mask.smm = 1;
4685 mask.ad_disabled = 1;
4639 4686
4640 /* 4687 /*
4641 * If we don't have indirect shadow pages, it means no page is 4688 * If we don't have indirect shadow pages, it means no page is
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 330bf3a811fb..a276834950c1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e)
51 return ((1ULL << (e - s + 1)) - 1) << s; 51 return ((1ULL << (e - s + 1)) - 1) << s;
52} 52}
53 53
54void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); 54void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
55 55
56void 56void
57reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 57reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 5a24b846a1cb..8b97a6cba8d1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -30,8 +30,9 @@
30 \ 30 \
31 role.word = __entry->role; \ 31 role.word = __entry->role; \
32 \ 32 \
33 trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ 33 trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", __entry->mmu_valid_gen, \ 34 " %snxe %sad root %u %s%c", \
35 __entry->mmu_valid_gen, \
35 __entry->gfn, role.level, \ 36 __entry->gfn, role.level, \
36 role.cr4_pae ? " pae" : "", \ 37 role.cr4_pae ? " pae" : "", \
37 role.quadrant, \ 38 role.quadrant, \
@@ -39,6 +40,7 @@
39 access_str[role.access], \ 40 access_str[role.access], \
40 role.invalid ? " invalid" : "", \ 41 role.invalid ? " invalid" : "", \
41 role.nxe ? "" : "!", \ 42 role.nxe ? "" : "!", \
43 role.ad_disabled ? "!" : "", \
42 __entry->root_count, \ 44 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 45 __entry->unsync ? "unsync" : "sync", 0); \
44 saved_ptr; \ 46 saved_ptr; \
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 33460fcdeef9..905ea6052517 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -190,6 +190,7 @@ struct vcpu_svm {
190 struct nested_state nested; 190 struct nested_state nested;
191 191
192 bool nmi_singlestep; 192 bool nmi_singlestep;
193 u64 nmi_singlestep_guest_rflags;
193 194
194 unsigned int3_injected; 195 unsigned int3_injected;
195 unsigned long int3_rip; 196 unsigned long int3_rip;
@@ -964,6 +965,18 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
964 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); 965 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
965} 966}
966 967
968static void disable_nmi_singlestep(struct vcpu_svm *svm)
969{
970 svm->nmi_singlestep = false;
971 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
972 /* Clear our flags if they were not set by the guest */
973 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
974 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
975 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
976 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
977 }
978}
979
967/* Note: 980/* Note:
968 * This hash table is used to map VM_ID to a struct kvm_arch, 981 * This hash table is used to map VM_ID to a struct kvm_arch,
969 * when handling AMD IOMMU GALOG notification to schedule in 982 * when handling AMD IOMMU GALOG notification to schedule in
@@ -1713,11 +1726,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
1713 1726
1714static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1727static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1715{ 1728{
1716 return to_svm(vcpu)->vmcb->save.rflags; 1729 struct vcpu_svm *svm = to_svm(vcpu);
1730 unsigned long rflags = svm->vmcb->save.rflags;
1731
1732 if (svm->nmi_singlestep) {
1733 /* Hide our flags if they were not set by the guest */
1734 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1735 rflags &= ~X86_EFLAGS_TF;
1736 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1737 rflags &= ~X86_EFLAGS_RF;
1738 }
1739 return rflags;
1717} 1740}
1718 1741
1719static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1742static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1720{ 1743{
1744 if (to_svm(vcpu)->nmi_singlestep)
1745 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1746
1721 /* 1747 /*
1722 * Any change of EFLAGS.VM is accompanied by a reload of SS 1748 * Any change of EFLAGS.VM is accompanied by a reload of SS
1723 * (caused by either a task switch or an inter-privilege IRET), 1749 * (caused by either a task switch or an inter-privilege IRET),
@@ -2112,10 +2138,7 @@ static int db_interception(struct vcpu_svm *svm)
2112 } 2138 }
2113 2139
2114 if (svm->nmi_singlestep) { 2140 if (svm->nmi_singlestep) {
2115 svm->nmi_singlestep = false; 2141 disable_nmi_singlestep(svm);
2116 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
2117 svm->vmcb->save.rflags &=
2118 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2119 } 2142 }
2120 2143
2121 if (svm->vcpu.guest_debug & 2144 if (svm->vcpu.guest_debug &
@@ -2370,8 +2393,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2370 2393
2371static int nested_svm_check_permissions(struct vcpu_svm *svm) 2394static int nested_svm_check_permissions(struct vcpu_svm *svm)
2372{ 2395{
2373 if (!(svm->vcpu.arch.efer & EFER_SVME) 2396 if (!(svm->vcpu.arch.efer & EFER_SVME) ||
2374 || !is_paging(&svm->vcpu)) { 2397 !is_paging(&svm->vcpu)) {
2375 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2398 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2376 return 1; 2399 return 1;
2377 } 2400 }
@@ -2381,7 +2404,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
2381 return 1; 2404 return 1;
2382 } 2405 }
2383 2406
2384 return 0; 2407 return 0;
2385} 2408}
2386 2409
2387static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 2410static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
@@ -2534,6 +2557,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2534 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; 2557 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2535} 2558}
2536 2559
2560/* DB exceptions for our internal use must not cause vmexit */
2561static int nested_svm_intercept_db(struct vcpu_svm *svm)
2562{
2563 unsigned long dr6;
2564
2565 /* if we're not singlestepping, it's not ours */
2566 if (!svm->nmi_singlestep)
2567 return NESTED_EXIT_DONE;
2568
2569 /* if it's not a singlestep exception, it's not ours */
2570 if (kvm_get_dr(&svm->vcpu, 6, &dr6))
2571 return NESTED_EXIT_DONE;
2572 if (!(dr6 & DR6_BS))
2573 return NESTED_EXIT_DONE;
2574
2575 /* if the guest is singlestepping, it should get the vmexit */
2576 if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
2577 disable_nmi_singlestep(svm);
2578 return NESTED_EXIT_DONE;
2579 }
2580
2581 /* it's ours, the nested hypervisor must not see this one */
2582 return NESTED_EXIT_HOST;
2583}
2584
2537static int nested_svm_exit_special(struct vcpu_svm *svm) 2585static int nested_svm_exit_special(struct vcpu_svm *svm)
2538{ 2586{
2539 u32 exit_code = svm->vmcb->control.exit_code; 2587 u32 exit_code = svm->vmcb->control.exit_code;
@@ -2589,8 +2637,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
2589 } 2637 }
2590 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { 2638 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2591 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2639 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2592 if (svm->nested.intercept_exceptions & excp_bits) 2640 if (svm->nested.intercept_exceptions & excp_bits) {
2593 vmexit = NESTED_EXIT_DONE; 2641 if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
2642 vmexit = nested_svm_intercept_db(svm);
2643 else
2644 vmexit = NESTED_EXIT_DONE;
2645 }
2594 /* async page fault always cause vmexit */ 2646 /* async page fault always cause vmexit */
2595 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && 2647 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2596 svm->apf_reason != 0) 2648 svm->apf_reason != 0)
@@ -4627,10 +4679,17 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
4627 == HF_NMI_MASK) 4679 == HF_NMI_MASK)
4628 return; /* IRET will cause a vm exit */ 4680 return; /* IRET will cause a vm exit */
4629 4681
4682 if ((svm->vcpu.arch.hflags & HF_GIF_MASK) == 0)
4683 return; /* STGI will cause a vm exit */
4684
4685 if (svm->nested.exit_required)
4686 return; /* we're not going to run the guest yet */
4687
4630 /* 4688 /*
4631 * Something prevents NMI from being injected. Single step over possible 4689 * Something prevents NMI from being injected. Single step over possible
4632 * problem (IRET or exception injection or interrupt shadow) 4690 * problem (IRET or exception injection or interrupt shadow)
4633 */ 4691 */
4692 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
4634 svm->nmi_singlestep = true; 4693 svm->nmi_singlestep = true;
4635 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 4694 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4636} 4695}
@@ -4771,6 +4830,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
4771 if (unlikely(svm->nested.exit_required)) 4830 if (unlikely(svm->nested.exit_required))
4772 return; 4831 return;
4773 4832
4833 /*
4834 * Disable singlestep if we're injecting an interrupt/exception.
4835 * We don't want our modified rflags to be pushed on the stack where
4836 * we might not be able to easily reset them if we disabled NMI
4837 * singlestep later.
4838 */
4839 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4840 /*
4841 * Event injection happens before external interrupts cause a
4842 * vmexit and interrupts are disabled here, so smp_send_reschedule
4843 * is enough to force an immediate vmexit.
4844 */
4845 disable_nmi_singlestep(svm);
4846 smp_send_reschedule(vcpu->cpu);
4847 }
4848
4774 pre_svm_run(svm); 4849 pre_svm_run(svm);
4775 4850
4776 sync_lapic_to_cr8(vcpu); 4851 sync_lapic_to_cr8(vcpu);
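The svm.c changes above all protect one invariant: TF and RF that KVM sets in order to single-step over a blocked NMI window must never become visible to the guest (or to a nested hypervisor), while TF/RF the guest had set itself must survive. A compact model of the "hide on read" half, built around the RFLAGS snapshot taken when single-stepping is armed; the bit positions are the architectural ones:

    #include <stdbool.h>
    #include <stdint.h>

    #define X86_EFLAGS_TF (1ull << 8)   /* trap flag */
    #define X86_EFLAGS_RF (1ull << 16)  /* resume flag */

    /* Return the RFLAGS value the guest should see: strip TF/RF that KVM
     * added for NMI single-stepping, keep them if the guest owned them at
     * the time single-stepping was armed. */
    static uint64_t guest_visible_rflags(uint64_t hw_rflags,
                                         uint64_t rflags_at_arm,
                                         bool nmi_singlestep)
    {
        if (!nmi_singlestep)
            return hw_rflags;
        if (!(rflags_at_arm & X86_EFLAGS_TF))
            hw_rflags &= ~X86_EFLAGS_TF;
        if (!(rflags_at_arm & X86_EFLAGS_RF))
            hw_rflags &= ~X86_EFLAGS_RF;
        return hw_rflags;
    }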
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dcc4873e435..f76efad248ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -913,8 +913,9 @@ static void nested_release_page_clean(struct page *page)
913 kvm_release_page_clean(page); 913 kvm_release_page_clean(page);
914} 914}
915 915
916static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
916static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); 917static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
917static u64 construct_eptp(unsigned long root_hpa); 918static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
918static bool vmx_xsaves_supported(void); 919static bool vmx_xsaves_supported(void);
919static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); 920static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
920static void vmx_set_segment(struct kvm_vcpu *vcpu, 921static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -2772,7 +2773,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2772 if (enable_ept_ad_bits) { 2773 if (enable_ept_ad_bits) {
2773 vmx->nested.nested_vmx_secondary_ctls_high |= 2774 vmx->nested.nested_vmx_secondary_ctls_high |=
2774 SECONDARY_EXEC_ENABLE_PML; 2775 SECONDARY_EXEC_ENABLE_PML;
2775 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; 2776 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
2776 } 2777 }
2777 } else 2778 } else
2778 vmx->nested.nested_vmx_ept_caps = 0; 2779 vmx->nested.nested_vmx_ept_caps = 0;
@@ -3198,7 +3199,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3198 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 3199 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
3199 break; 3200 break;
3200 case MSR_IA32_BNDCFGS: 3201 case MSR_IA32_BNDCFGS:
3201 if (!kvm_mpx_supported()) 3202 if (!kvm_mpx_supported() ||
3203 (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
3202 return 1; 3204 return 1;
3203 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 3205 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
3204 break; 3206 break;
@@ -3280,7 +3282,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3280 vmcs_writel(GUEST_SYSENTER_ESP, data); 3282 vmcs_writel(GUEST_SYSENTER_ESP, data);
3281 break; 3283 break;
3282 case MSR_IA32_BNDCFGS: 3284 case MSR_IA32_BNDCFGS:
3283 if (!kvm_mpx_supported()) 3285 if (!kvm_mpx_supported() ||
3286 (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
3287 return 1;
3288 if (is_noncanonical_address(data & PAGE_MASK) ||
3289 (data & MSR_IA32_BNDCFGS_RSVD))
3284 return 1; 3290 return 1;
3285 vmcs_write64(GUEST_BNDCFGS, data); 3291 vmcs_write64(GUEST_BNDCFGS, data);
3286 break; 3292 break;
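The new BNDCFGS write checks reject values the CPU itself would refuse: the bound-directory base must be a canonical linear address, and the reserved bits between the enable flags and the 4 KiB-aligned base must be zero. A sketch of a canonicality test, assuming 48-bit linear addresses (the in-kernel is_noncanonical_address() helper is the authoritative version):

    #include <stdbool.h>
    #include <stdint.h>

    /* Canonical for 48-bit linear addresses: bits 63:47 must all equal
     * bit 47, i.e. sign-extending from bit 47 reproduces the value. */
    static bool is_canonical_48(uint64_t va)
    {
        return (uint64_t)((int64_t)(va << 16) >> 16) == va;
    }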
@@ -4013,7 +4019,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
4013 if (enable_ept) { 4019 if (enable_ept) {
4014 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 4020 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
4015 return; 4021 return;
4016 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 4022 ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
4017 } else { 4023 } else {
4018 vpid_sync_context(vpid); 4024 vpid_sync_context(vpid);
4019 } 4025 }
@@ -4188,14 +4194,15 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4188 vmx->emulation_required = emulation_required(vcpu); 4194 vmx->emulation_required = emulation_required(vcpu);
4189} 4195}
4190 4196
4191static u64 construct_eptp(unsigned long root_hpa) 4197static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
4192{ 4198{
4193 u64 eptp; 4199 u64 eptp;
4194 4200
4195 /* TODO write the value reading from MSR */ 4201 /* TODO write the value reading from MSR */
4196 eptp = VMX_EPT_DEFAULT_MT | 4202 eptp = VMX_EPT_DEFAULT_MT |
4197 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 4203 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
4198 if (enable_ept_ad_bits) 4204 if (enable_ept_ad_bits &&
4205 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
4199 eptp |= VMX_EPT_AD_ENABLE_BIT; 4206 eptp |= VMX_EPT_AD_ENABLE_BIT;
4200 eptp |= (root_hpa & PAGE_MASK); 4207 eptp |= (root_hpa & PAGE_MASK);
4201 4208
@@ -4209,7 +4216,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
4209 4216
4210 guest_cr3 = cr3; 4217 guest_cr3 = cr3;
4211 if (enable_ept) { 4218 if (enable_ept) {
4212 eptp = construct_eptp(cr3); 4219 eptp = construct_eptp(vcpu, cr3);
4213 vmcs_write64(EPT_POINTER, eptp); 4220 vmcs_write64(EPT_POINTER, eptp);
4214 if (is_paging(vcpu) || is_guest_mode(vcpu)) 4221 if (is_paging(vcpu) || is_guest_mode(vcpu))
4215 guest_cr3 = kvm_read_cr3(vcpu); 4222 guest_cr3 = kvm_read_cr3(vcpu);
@@ -5170,7 +5177,8 @@ static void ept_set_mmio_spte_mask(void)
5170 * EPT Misconfigurations can be generated if the value of bits 2:0 5177 * EPT Misconfigurations can be generated if the value of bits 2:0
5171 * of an EPT paging-structure entry is 110b (write/execute). 5178 * of an EPT paging-structure entry is 110b (write/execute).
5172 */ 5179 */
5173 kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); 5180 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
5181 VMX_EPT_MISCONFIG_WX_VALUE);
5174} 5182}
5175 5183
5176#define VMX_XSS_EXIT_BITMAP 0 5184#define VMX_XSS_EXIT_BITMAP 0
@@ -6220,17 +6228,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6220 6228
6221 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6229 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6222 6230
6223 if (is_guest_mode(vcpu)
6224 && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
6225 /*
6226 * Fix up exit_qualification according to whether guest
6227 * page table accesses are reads or writes.
6228 */
6229 u64 eptp = nested_ept_get_cr3(vcpu);
6230 if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
6231 exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
6232 }
6233
6234 /* 6231 /*
6235 * EPT violation happened while executing iret from NMI, 6232 * EPT violation happened while executing iret from NMI,
6236 * "blocked by NMI" bit has to be set before next VM entry. 6233 * "blocked by NMI" bit has to be set before next VM entry.
@@ -6453,7 +6450,7 @@ void vmx_enable_tdp(void)
6453 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 6450 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6454 0ull, VMX_EPT_EXECUTABLE_MASK, 6451 0ull, VMX_EPT_EXECUTABLE_MASK,
6455 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 6452 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6456 enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); 6453 VMX_EPT_RWX_MASK);
6457 6454
6458 ept_set_mmio_spte_mask(); 6455 ept_set_mmio_spte_mask();
6459 kvm_enable_tdp(); 6456 kvm_enable_tdp();
@@ -6557,7 +6554,6 @@ static __init int hardware_setup(void)
6557 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 6554 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
6558 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 6555 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
6559 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 6556 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
6560 vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
6561 6557
6562 memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, 6558 memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
6563 vmx_msr_bitmap_legacy, PAGE_SIZE); 6559 vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -7661,7 +7657,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
7661 unsigned long type, types; 7657 unsigned long type, types;
7662 gva_t gva; 7658 gva_t gva;
7663 struct x86_exception e; 7659 struct x86_exception e;
7664 int vpid; 7660 struct {
7661 u64 vpid;
7662 u64 gla;
7663 } operand;
7665 7664
7666 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 7665 if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7667 SECONDARY_EXEC_ENABLE_VPID) || 7666 SECONDARY_EXEC_ENABLE_VPID) ||
@@ -7691,17 +7690,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
7691 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 7690 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7692 vmx_instruction_info, false, &gva)) 7691 vmx_instruction_info, false, &gva))
7693 return 1; 7692 return 1;
7694 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, 7693 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
7695 sizeof(u32), &e)) { 7694 sizeof(operand), &e)) {
7696 kvm_inject_page_fault(vcpu, &e); 7695 kvm_inject_page_fault(vcpu, &e);
7697 return 1; 7696 return 1;
7698 } 7697 }
7698 if (operand.vpid >> 16) {
7699 nested_vmx_failValid(vcpu,
7700 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7701 return kvm_skip_emulated_instruction(vcpu);
7702 }
7699 7703
7700 switch (type) { 7704 switch (type) {
7701 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 7705 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
7706 if (is_noncanonical_address(operand.gla)) {
7707 nested_vmx_failValid(vcpu,
7708 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7709 return kvm_skip_emulated_instruction(vcpu);
7710 }
7711 /* fall through */
7702 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 7712 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
7703 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 7713 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
7704 if (!vpid) { 7714 if (!operand.vpid) {
7705 nested_vmx_failValid(vcpu, 7715 nested_vmx_failValid(vcpu,
7706 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7716 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7707 return kvm_skip_emulated_instruction(vcpu); 7717 return kvm_skip_emulated_instruction(vcpu);
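The INVVPID rework reads the full 128-bit descriptor instead of a lone 32-bit VPID, which is what makes the new reserved-bit and canonical-address checks possible. The layout being decoded (per the SDM) and the reserved-bit test, as a sketch:

    #include <stdbool.h>
    #include <stdint.h>

    /* 128-bit INVVPID descriptor: VPID in bits 15:0, bits 63:16 reserved
     * and must be zero, linear address (individual-address type only) in
     * bits 127:64. */
    struct invvpid_desc {
        uint64_t vpid;  /* only bits 15:0 carry the VPID */
        uint64_t gla;
    };

    static bool invvpid_reserved_bits_clear(const struct invvpid_desc *d)
    {
        return (d->vpid >> 16) == 0;
    }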
@@ -9394,6 +9404,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
9394 vmcs12->guest_physical_address = fault->address; 9404 vmcs12->guest_physical_address = fault->address;
9395} 9405}
9396 9406
9407static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
9408{
9409 return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
9410}
9411
9397/* Callbacks for nested_ept_init_mmu_context: */ 9412/* Callbacks for nested_ept_init_mmu_context: */
9398 9413
9399static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) 9414static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
@@ -9404,18 +9419,18 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
9404 9419
9405static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 9420static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
9406{ 9421{
9407 u64 eptp; 9422 bool wants_ad;
9408 9423
9409 WARN_ON(mmu_is_nested(vcpu)); 9424 WARN_ON(mmu_is_nested(vcpu));
9410 eptp = nested_ept_get_cr3(vcpu); 9425 wants_ad = nested_ept_ad_enabled(vcpu);
9411 if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) 9426 if (wants_ad && !enable_ept_ad_bits)
9412 return 1; 9427 return 1;
9413 9428
9414 kvm_mmu_unload(vcpu); 9429 kvm_mmu_unload(vcpu);
9415 kvm_init_shadow_ept_mmu(vcpu, 9430 kvm_init_shadow_ept_mmu(vcpu,
9416 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 9431 to_vmx(vcpu)->nested.nested_vmx_ept_caps &
9417 VMX_EPT_EXECUTE_ONLY_BIT, 9432 VMX_EPT_EXECUTE_ONLY_BIT,
9418 eptp & VMX_EPT_AD_ENABLE_BIT); 9433 wants_ad);
9419 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 9434 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
9420 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 9435 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
9421 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 9436 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -10728,8 +10743,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10728 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 10743 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
10729 } 10744 }
10730 10745
10731 if (nested_cpu_has_ept(vmcs12)) 10746 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
10732 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
10733 10747
10734 if (nested_cpu_has_vid(vmcs12)) 10748 if (nested_cpu_has_vid(vmcs12))
10735 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 10749 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10754,8 +10768,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10754 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 10768 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
10755 if (kvm_mpx_supported()) 10769 if (kvm_mpx_supported())
10756 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 10770 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
10757 if (nested_cpu_has_xsaves(vmcs12))
10758 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
10759} 10771}
10760 10772
10761/* 10773/*
@@ -11152,7 +11164,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
11152 vmx->hv_deadline_tsc = tscl + delta_tsc; 11164 vmx->hv_deadline_tsc = tscl + delta_tsc;
11153 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 11165 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
11154 PIN_BASED_VMX_PREEMPTION_TIMER); 11166 PIN_BASED_VMX_PREEMPTION_TIMER);
11155 return 0; 11167
11168 return delta_tsc == 0;
11156} 11169}
11157 11170
11158static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 11171static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0cb83b..6c7266f7766d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2841,10 +2841,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2841 kvm_vcpu_write_tsc_offset(vcpu, offset); 2841 kvm_vcpu_write_tsc_offset(vcpu, offset);
2842 vcpu->arch.tsc_catchup = 1; 2842 vcpu->arch.tsc_catchup = 1;
2843 } 2843 }
2844 if (kvm_lapic_hv_timer_in_use(vcpu) && 2844
2845 kvm_x86_ops->set_hv_timer(vcpu, 2845 if (kvm_lapic_hv_timer_in_use(vcpu))
2846 kvm_get_lapic_target_expiration_tsc(vcpu))) 2846 kvm_lapic_restart_hv_timer(vcpu);
2847 kvm_lapic_switch_to_sw_timer(vcpu); 2847
2848 /* 2848 /*
2849 * On a host with synchronized TSC, there is no need to update 2849 * On a host with synchronized TSC, there is no need to update
2850 * kvmclock on vcpu->cpu migration 2850 * kvmclock on vcpu->cpu migration
@@ -6011,7 +6011,7 @@ static void kvm_set_mmio_spte_mask(void)
6011 mask &= ~1ull; 6011 mask &= ~1ull;
6012#endif 6012#endif
6013 6013
6014 kvm_mmu_set_mmio_spte_mask(mask); 6014 kvm_mmu_set_mmio_spte_mask(mask, mask);
6015} 6015}
6016 6016
6017#ifdef CONFIG_X86_64 6017#ifdef CONFIG_X86_64
@@ -6733,7 +6733,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6733 6733
6734 bool req_immediate_exit = false; 6734 bool req_immediate_exit = false;
6735 6735
6736 if (vcpu->requests) { 6736 if (kvm_request_pending(vcpu)) {
6737 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 6737 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
6738 kvm_mmu_unload(vcpu); 6738 kvm_mmu_unload(vcpu);
6739 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 6739 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -6897,7 +6897,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6897 kvm_x86_ops->sync_pir_to_irr(vcpu); 6897 kvm_x86_ops->sync_pir_to_irr(vcpu);
6898 } 6898 }
6899 6899
6900 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests 6900 if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
6901 || need_resched() || signal_pending(current)) { 6901 || need_resched() || signal_pending(current)) {
6902 vcpu->mode = OUTSIDE_GUEST_MODE; 6902 vcpu->mode = OUTSIDE_GUEST_MODE;
6903 smp_wmb(); 6903 smp_wmb();
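Both vcpu_enter_guest() call sites now go through kvm_request_pending(), i.e. one plain load of the whole request word, before any per-request test-and-clear runs. A self-contained model of that two-step pattern; struct vcpu and the request number here are stand-ins, not kernel definitions:

    #include <stdbool.h>
    #include <stdint.h>

    struct vcpu { uint64_t requests; };

    /* Cheap fast path: is any request bit set at all? */
    static bool request_pending(const struct vcpu *v)
    {
        return v->requests != 0;    /* the kernel wraps this in READ_ONCE() */
    }

    /* Slow path, run only when something is pending: test and clear one bit. */
    static bool check_request(struct vcpu *v, unsigned int req)
    {
        uint64_t bit = 1ull << req;

        if (!(v->requests & bit))
            return false;
        v->requests &= ~bit;        /* cleared atomically in the kernel */
        return true;
    }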
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 295584f31a4e..f0053f884b4a 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -57,9 +57,7 @@ struct arch_timer_cpu {
57 57
58int kvm_timer_hyp_init(void); 58int kvm_timer_hyp_init(void);
59int kvm_timer_enable(struct kvm_vcpu *vcpu); 59int kvm_timer_enable(struct kvm_vcpu *vcpu);
60int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, 60int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu);
61 const struct kvm_irq_level *virt_irq,
62 const struct kvm_irq_level *phys_irq);
63void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); 61void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
64void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); 62void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
65void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); 63void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -70,6 +68,10 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
70u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); 68u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
71int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); 69int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
72 70
71int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
72int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
73int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
74
73bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); 75bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
74void kvm_timer_schedule(struct kvm_vcpu *vcpu); 76void kvm_timer_schedule(struct kvm_vcpu *vcpu);
75void kvm_timer_unschedule(struct kvm_vcpu *vcpu); 77void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 1ab4633adf4f..f6e030617467 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -35,6 +35,7 @@ struct kvm_pmu {
35 int irq_num; 35 int irq_num;
36 struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS]; 36 struct kvm_pmc pmc[ARMV8_PMU_MAX_COUNTERS];
37 bool ready; 37 bool ready;
38 bool created;
38 bool irq_level; 39 bool irq_level;
39}; 40};
40 41
@@ -63,6 +64,7 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu,
63 struct kvm_device_attr *attr); 64 struct kvm_device_attr *attr);
64int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, 65int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
65 struct kvm_device_attr *attr); 66 struct kvm_device_attr *attr);
67int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
66#else 68#else
67struct kvm_pmu { 69struct kvm_pmu {
68}; 70};
@@ -112,6 +114,10 @@ static inline int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu,
112{ 114{
113 return -ENXIO; 115 return -ENXIO;
114} 116}
117static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
118{
119 return 0;
120}
115#endif 121#endif
116 122
117#endif 123#endif
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ef718586321c..34dba516ef24 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -38,6 +38,10 @@
38#define VGIC_MIN_LPI 8192 38#define VGIC_MIN_LPI 8192
39#define KVM_IRQCHIP_NUM_PINS (1020 - 32) 39#define KVM_IRQCHIP_NUM_PINS (1020 - 32)
40 40
41#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
42#define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
43 (irq) <= VGIC_MAX_SPI)
44
41enum vgic_type { 45enum vgic_type {
42 VGIC_V2, /* Good ol' GICv2 */ 46 VGIC_V2, /* Good ol' GICv2 */
43 VGIC_V3, /* New fancy GICv3 */ 47 VGIC_V3, /* New fancy GICv3 */
@@ -119,6 +123,9 @@ struct vgic_irq {
119 u8 source; /* GICv2 SGIs only */ 123 u8 source; /* GICv2 SGIs only */
120 u8 priority; 124 u8 priority;
121 enum vgic_irq_config config; /* Level or edge */ 125 enum vgic_irq_config config; /* Level or edge */
126
127 void *owner; /* Opaque pointer to reserve an interrupt
128 for in-kernel devices. */
122}; 129};
123 130
124struct vgic_register_region; 131struct vgic_register_region;
@@ -285,6 +292,7 @@ struct vgic_cpu {
285}; 292};
286 293
287extern struct static_key_false vgic_v2_cpuif_trap; 294extern struct static_key_false vgic_v2_cpuif_trap;
295extern struct static_key_false vgic_v3_cpuif_trap;
288 296
289int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); 297int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
290void kvm_vgic_early_init(struct kvm *kvm); 298void kvm_vgic_early_init(struct kvm *kvm);
@@ -298,9 +306,7 @@ int kvm_vgic_hyp_init(void);
298void kvm_vgic_init_cpu_hardware(void); 306void kvm_vgic_init_cpu_hardware(void);
299 307
300int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, 308int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
301 bool level); 309 bool level, void *owner);
302int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
303 bool level);
304int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); 310int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
305int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); 311int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
306bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); 312bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
@@ -341,4 +347,6 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
341 */ 347 */
342int kvm_vgic_setup_default_irq_routing(struct kvm *kvm); 348int kvm_vgic_setup_default_irq_routing(struct kvm *kvm);
343 349
350int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner);
351
344#endif /* __KVM_ARM_VGIC_H */ 352#endif /* __KVM_ARM_VGIC_H */
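kvm_vgic_inject_irq() growing an owner argument pairs with the new kvm_vgic_set_owner(): an in-kernel user claims an interrupt once with an opaque cookie and must present the same cookie on every injection, so the vgic can refuse other users of that line. A kernel-context sketch of that usage from a hypothetical in-kernel device; the intid 27 and the ctx cookie are made up for illustration:

    #include <linux/kvm_host.h>
    #include <kvm/arm_vgic.h>

    /* Claim a private interrupt for an in-kernel device, then raise it while
     * proving ownership with the same opaque cookie. */
    static int claim_and_raise_ppi(struct kvm_vcpu *vcpu, void *ctx)
    {
        int ret = kvm_vgic_set_owner(vcpu, 27, ctx);

        if (ret)
            return ret;    /* interrupt already owned by someone else */
        return kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 27, true, ctx);
    }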
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 1fa293a37f4a..6a1f87ff94e2 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -405,6 +405,7 @@
405#define ICH_LR_PHYS_ID_SHIFT 32 405#define ICH_LR_PHYS_ID_SHIFT 32
406#define ICH_LR_PHYS_ID_MASK (0x3ffULL << ICH_LR_PHYS_ID_SHIFT) 406#define ICH_LR_PHYS_ID_MASK (0x3ffULL << ICH_LR_PHYS_ID_SHIFT)
407#define ICH_LR_PRIORITY_SHIFT 48 407#define ICH_LR_PRIORITY_SHIFT 48
408#define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT)
408 409
409/* These are for GICv2 emulation only */ 410/* These are for GICv2 emulation only */
410#define GICH_LR_VIRTUALID (0x3ffUL << 0) 411#define GICH_LR_VIRTUALID (0x3ffUL << 0)
@@ -416,6 +417,11 @@
416 417
417#define ICH_HCR_EN (1 << 0) 418#define ICH_HCR_EN (1 << 0)
418#define ICH_HCR_UIE (1 << 1) 419#define ICH_HCR_UIE (1 << 1)
420#define ICH_HCR_TC (1 << 10)
421#define ICH_HCR_TALL0 (1 << 11)
422#define ICH_HCR_TALL1 (1 << 12)
423#define ICH_HCR_EOIcount_SHIFT 27
424#define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT)
419 425
420#define ICH_VMCR_ACK_CTL_SHIFT 2 426#define ICH_VMCR_ACK_CTL_SHIFT 2
421#define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT) 427#define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c0664309815..0b50e7b35ed4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -126,6 +126,13 @@ static inline bool is_error_page(struct page *page)
126#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 126#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
127#define KVM_REQ_PENDING_TIMER 2 127#define KVM_REQ_PENDING_TIMER 2
128#define KVM_REQ_UNHALT 3 128#define KVM_REQ_UNHALT 3
129#define KVM_REQUEST_ARCH_BASE 8
130
131#define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
132 BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \
133 (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
134})
135#define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0)
129 136
130#define KVM_USERSPACE_IRQ_SOURCE_ID 0 137#define KVM_USERSPACE_IRQ_SOURCE_ID 0
131#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 138#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -1098,6 +1105,11 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
1098 set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); 1105 set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
1099} 1106}
1100 1107
1108static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
1109{
1110 return READ_ONCE(vcpu->requests);
1111}
1112
1101static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) 1113static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
1102{ 1114{
1103 return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); 1115 return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
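KVM_ARCH_REQ_FLAGS() gives each architecture its own request number space starting at KVM_REQUEST_ARCH_BASE, with the wait/no-wakeup behaviour chosen per request, and the BUILD_BUG_ON keeps arch request numbers below bit 32. How an architecture would mint requests with it -- the request names here are hypothetical:

    #include <linux/kvm_host.h>

    /* Hypothetical arch-private requests layered on the generic helpers. */
    #define KVM_REQ_EXAMPLE_SLEEP   KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT)
    #define KVM_REQ_EXAMPLE_RELOAD  KVM_ARCH_REQ(1)

    /* Typical producer side, using kvm_make_request() from this header. */
    static inline void example_kick(struct kvm_vcpu *vcpu)
    {
        kvm_make_request(KVM_REQ_EXAMPLE_RELOAD, vcpu);
    }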
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 577429a95ad8..c0b6dfec5f87 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -155,6 +155,35 @@ struct kvm_s390_skeys {
155 __u32 reserved[9]; 155 __u32 reserved[9];
156}; 156};
157 157
158#define KVM_S390_CMMA_PEEK (1 << 0)
159
160/**
161 * kvm_s390_cmma_log - Used for CMMA migration.
162 *
163 * Used both for input and output.
164 *
165 * @start_gfn: Guest page number to start from.
166 * @count: Size of the result buffer.
167 * @flags: Control operation mode via KVM_S390_CMMA_* flags
168 * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
169 * pages are still remaining.
170 * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
171 * in the PGSTE.
172 * @values: Pointer to the values buffer.
173 *
174 * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
175 */
176struct kvm_s390_cmma_log {
177 __u64 start_gfn;
178 __u32 count;
179 __u32 flags;
180 union {
181 __u64 remaining;
182 __u64 mask;
183 };
184 __u64 values;
185};
186
158struct kvm_hyperv_exit { 187struct kvm_hyperv_exit {
159#define KVM_EXIT_HYPERV_SYNIC 1 188#define KVM_EXIT_HYPERV_SYNIC 1
160#define KVM_EXIT_HYPERV_HCALL 2 189#define KVM_EXIT_HYPERV_HCALL 2
@@ -895,6 +924,9 @@ struct kvm_ppc_resize_hpt {
895#define KVM_CAP_SPAPR_TCE_VFIO 142 924#define KVM_CAP_SPAPR_TCE_VFIO 142
896#define KVM_CAP_X86_GUEST_MWAIT 143 925#define KVM_CAP_X86_GUEST_MWAIT 143
897#define KVM_CAP_ARM_USER_IRQ 144 926#define KVM_CAP_ARM_USER_IRQ 144
927#define KVM_CAP_S390_CMMA_MIGRATION 145
928#define KVM_CAP_PPC_FWNMI 146
929#define KVM_CAP_PPC_SMT_POSSIBLE 147
898 930
899#ifdef KVM_CAP_IRQ_ROUTING 931#ifdef KVM_CAP_IRQ_ROUTING
900 932
@@ -1318,6 +1350,9 @@ struct kvm_s390_ucas_mapping {
1318#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state) 1350#define KVM_S390_GET_IRQ_STATE _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
1319/* Available with KVM_CAP_X86_SMM */ 1351/* Available with KVM_CAP_X86_SMM */
1320#define KVM_SMI _IO(KVMIO, 0xb7) 1352#define KVM_SMI _IO(KVMIO, 0xb7)
1353/* Available with KVM_CAP_S390_CMMA_MIGRATION */
1354#define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log)
1355#define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
1321 1356
1322#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 1357#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
1323#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) 1358#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
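The two new ioctls round out the CMMA migration interface declared above. A userspace sketch of pulling one chunk of page-hinting values from a VM file descriptor; error handling is minimal, the installed headers are assumed to carry the new definitions, and KVM_CAP_S390_CMMA_MIGRATION is assumed to have been enabled on the VM beforehand:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Fetch up to 'count' CMMA values starting at start_gfn into buf.
     * On success, *remaining reports how many dirty pages are still left. */
    static int get_cmma_chunk(int vm_fd, uint64_t start_gfn, uint8_t *buf,
                              uint32_t count, uint64_t *remaining)
    {
        struct kvm_s390_cmma_log log;

        memset(&log, 0, sizeof(log));
        log.start_gfn = start_gfn;
        log.count = count;
        log.flags = 0;    /* see KVM_S390_CMMA_PEEK above for peek mode */
        log.values = (uint64_t)(uintptr_t)buf;

        if (ioctl(vm_fd, KVM_S390_GET_CMMA_BITS, &log) < 0)
            return -1;

        *remaining = log.remaining;
        return 0;
    }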
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 8f74ed8e7237..dd8f00cfb8b4 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -295,114 +295,6 @@ class ArchS390(Arch):
295ARCH = Arch.get_arch() 295ARCH = Arch.get_arch()
296 296
297 297
298def walkdir(path):
299 """Returns os.walk() data for specified directory.
300
301 As it is only a wrapper it returns the same 3-tuple of (dirpath,
302 dirnames, filenames).
303 """
304 return next(os.walk(path))
305
306
307def parse_int_list(list_string):
308 """Returns an int list from a string of comma separated integers and
309 integer ranges."""
310 integers = []
311 members = list_string.split(',')
312
313 for member in members:
314 if '-' not in member:
315 integers.append(int(member))
316 else:
317 int_range = member.split('-')
318 integers.extend(range(int(int_range[0]),
319 int(int_range[1]) + 1))
320
321 return integers
322
323
324def get_pid_from_gname(gname):
325 """Fuzzy function to convert guest name to QEMU process pid.
326
327 Returns a list of potential pids, can be empty if no match found.
328 Throws an exception on processing errors.
329
330 """
331 pids = []
332 try:
333 child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'],
334 stdout=subprocess.PIPE)
335 except:
336 raise Exception
337 for line in child.stdout:
338 line = line.lstrip().split(' ', 1)
339 # perform a sanity check before calling the more expensive
340 # function to possibly extract the guest name
341 if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]):
342 pids.append(int(line[0]))
343 child.stdout.close()
344
345 return pids
346
347
348def get_gname_from_pid(pid):
349 """Returns the guest name for a QEMU process pid.
350
351 Extracts the guest name from the QEMU comma line by processing the '-name'
352 option. Will also handle names specified out of sequence.
353
354 """
355 name = ''
356 try:
357 line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0')
358 parms = line[line.index('-name') + 1].split(',')
359 while '' in parms:
360 # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results in
361 # ['foo', '', 'bar'], which we revert here
362 idx = parms.index('')
363 parms[idx - 1] += ',' + parms[idx + 1]
364 del parms[idx:idx+2]
365 # the '-name' switch allows for two ways to specify the guest name,
366 # where the plain name overrides the name specified via 'guest='
367 for arg in parms:
368 if '=' not in arg:
369 name = arg
370 break
371 if arg[:6] == 'guest=':
372 name = arg[6:]
373 except (ValueError, IOError, IndexError):
374 pass
375
376 return name
377
378
379def get_online_cpus():
380 """Returns a list of cpu id integers."""
381 with open('/sys/devices/system/cpu/online') as cpu_list:
382 cpu_string = cpu_list.readline()
383 return parse_int_list(cpu_string)
384
385
386def get_filters():
387 """Returns a dict of trace events, their filter ids and
388 the values that can be filtered.
389
390 Trace events can be filtered for special values by setting a
391 filter string via an ioctl. The string normally has the format
392 identifier==value. For each filter a new event will be created, to
393 be able to distinguish the events.
394
395 """
396 filters = {}
397 filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
398 if ARCH.exit_reasons:
399 filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
400 return filters
401
402libc = ctypes.CDLL('libc.so.6', use_errno=True)
403syscall = libc.syscall
404
405
406class perf_event_attr(ctypes.Structure): 298class perf_event_attr(ctypes.Structure):
407 """Struct that holds the necessary data to set up a trace event. 299 """Struct that holds the necessary data to set up a trace event.
408 300
@@ -432,25 +324,6 @@ class perf_event_attr(ctypes.Structure):
432 self.read_format = PERF_FORMAT_GROUP 324 self.read_format = PERF_FORMAT_GROUP
433 325
434 326
435def perf_event_open(attr, pid, cpu, group_fd, flags):
436 """Wrapper for the sys_perf_evt_open() syscall.
437
438 Used to set up performance events, returns a file descriptor or -1
439 on error.
440
441 Attributes are:
442 - syscall number
443 - struct perf_event_attr *
444 - pid or -1 to monitor all pids
445 - cpu number or -1 to monitor all cpus
446 - The file descriptor of the group leader or -1 to create a group.
447 - flags
448
449 """
450 return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
451 ctypes.c_int(pid), ctypes.c_int(cpu),
452 ctypes.c_int(group_fd), ctypes.c_long(flags))
453
454PERF_TYPE_TRACEPOINT = 2 327PERF_TYPE_TRACEPOINT = 2
455PERF_FORMAT_GROUP = 1 << 3 328PERF_FORMAT_GROUP = 1 << 3
456 329
@@ -495,6 +368,8 @@ class Event(object):
495 """Represents a performance event and manages its life cycle.""" 368 """Represents a performance event and manages its life cycle."""
496 def __init__(self, name, group, trace_cpu, trace_pid, trace_point, 369 def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
497 trace_filter, trace_set='kvm'): 370 trace_filter, trace_set='kvm'):
371 self.libc = ctypes.CDLL('libc.so.6', use_errno=True)
372 self.syscall = self.libc.syscall
498 self.name = name 373 self.name = name
499 self.fd = None 374 self.fd = None
500 self.setup_event(group, trace_cpu, trace_pid, trace_point, 375 self.setup_event(group, trace_cpu, trace_pid, trace_point,
@@ -511,6 +386,25 @@ class Event(object):
511 if self.fd: 386 if self.fd:
512 os.close(self.fd) 387 os.close(self.fd)
513 388
389 def perf_event_open(self, attr, pid, cpu, group_fd, flags):
390 """Wrapper for the sys_perf_evt_open() syscall.
391
392 Used to set up performance events, returns a file descriptor or -1
393 on error.
394
395 Attributes are:
396 - syscall number
397 - struct perf_event_attr *
398 - pid or -1 to monitor all pids
399 - cpu number or -1 to monitor all cpus
400 - The file descriptor of the group leader or -1 to create a group.
401 - flags
402
403 """
404 return self.syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr),
405 ctypes.c_int(pid), ctypes.c_int(cpu),
406 ctypes.c_int(group_fd), ctypes.c_long(flags))
407
514 def setup_event_attribute(self, trace_set, trace_point): 408 def setup_event_attribute(self, trace_set, trace_point):
515 """Returns an initialized ctype perf_event_attr struct.""" 409 """Returns an initialized ctype perf_event_attr struct."""
516 410
@@ -539,8 +433,8 @@ class Event(object):
539 if group.events: 433 if group.events:
540 group_leader = group.events[0].fd 434 group_leader = group.events[0].fd
541 435
542 fd = perf_event_open(event_attr, trace_pid, 436 fd = self.perf_event_open(event_attr, trace_pid,
543 trace_cpu, group_leader, 0) 437 trace_cpu, group_leader, 0)
544 if fd == -1: 438 if fd == -1:
545 err = ctypes.get_errno() 439 err = ctypes.get_errno()
546 raise OSError(err, os.strerror(err), 440 raise OSError(err, os.strerror(err),
@@ -575,17 +469,53 @@ class Event(object):
575 fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) 469 fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
576 470
577 471
578class TracepointProvider(object): 472class Provider(object):
473 """Encapsulates functionalities used by all providers."""
474 @staticmethod
475 def is_field_wanted(fields_filter, field):
476 """Indicate whether field is valid according to fields_filter."""
477 if not fields_filter:
478 return True
479 return re.match(fields_filter, field) is not None
480
481 @staticmethod
482 def walkdir(path):
483 """Returns os.walk() data for specified directory.
484
485 As it is only a wrapper it returns the same 3-tuple of (dirpath,
486 dirnames, filenames).
487 """
488 return next(os.walk(path))
489
490
491class TracepointProvider(Provider):
579 """Data provider for the stats class. 492 """Data provider for the stats class.
580 493
581 Manages the events/groups from which it acquires its data. 494 Manages the events/groups from which it acquires its data.
582 495
583 """ 496 """
584 def __init__(self): 497 def __init__(self, pid, fields_filter):
585 self.group_leaders = [] 498 self.group_leaders = []
586 self.filters = get_filters() 499 self.filters = self.get_filters()
587 self._fields = self.get_available_fields() 500 self.update_fields(fields_filter)
588 self._pid = 0 501 self.pid = pid
502
503 @staticmethod
504 def get_filters():
505 """Returns a dict of trace events, their filter ids and
506 the values that can be filtered.
507
508 Trace events can be filtered for special values by setting a
509 filter string via an ioctl. The string normally has the format
510 identifier==value. For each filter a new event will be created, to
511 be able to distinguish the events.
512
513 """
514 filters = {}
515 filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
516 if ARCH.exit_reasons:
517 filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
518 return filters
589 519
590 def get_available_fields(self): 520 def get_available_fields(self):
591 """Returns a list of available event's of format 'event name(filter 521 """Returns a list of available event's of format 'event name(filter
@@ -603,7 +533,7 @@ class TracepointProvider(object):
603 533
604 """ 534 """
605 path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm') 535 path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm')
606 fields = walkdir(path)[1] 536 fields = self.walkdir(path)[1]
607 extra = [] 537 extra = []
608 for field in fields: 538 for field in fields:
609 if field in self.filters: 539 if field in self.filters:
@@ -613,6 +543,34 @@ class TracepointProvider(object):
613 fields += extra 543 fields += extra
614 return fields 544 return fields
615 545
546 def update_fields(self, fields_filter):
547 """Refresh fields, applying fields_filter"""
548 self._fields = [field for field in self.get_available_fields()
549 if self.is_field_wanted(fields_filter, field)]
550
551 @staticmethod
552 def get_online_cpus():
553 """Returns a list of cpu id integers."""
554 def parse_int_list(list_string):
555 """Returns an int list from a string of comma separated integers and
556 integer ranges."""
557 integers = []
558 members = list_string.split(',')
559
560 for member in members:
561 if '-' not in member:
562 integers.append(int(member))
563 else:
564 int_range = member.split('-')
565 integers.extend(range(int(int_range[0]),
566 int(int_range[1]) + 1))
567
568 return integers
569
570 with open('/sys/devices/system/cpu/online') as cpu_list:
571 cpu_string = cpu_list.readline()
572 return parse_int_list(cpu_string)
573
616 def setup_traces(self): 574 def setup_traces(self):
617 """Creates all event and group objects needed to be able to retrieve 575 """Creates all event and group objects needed to be able to retrieve
618 data.""" 576 data."""
@@ -621,9 +579,9 @@ class TracepointProvider(object):
621 # Fetch list of all threads of the monitored pid, as qemu 579 # Fetch list of all threads of the monitored pid, as qemu
622 # starts a thread for each vcpu. 580 # starts a thread for each vcpu.
623 path = os.path.join('/proc', str(self._pid), 'task') 581 path = os.path.join('/proc', str(self._pid), 'task')
624 groupids = walkdir(path)[1] 582 groupids = self.walkdir(path)[1]
625 else: 583 else:
626 groupids = get_online_cpus() 584 groupids = self.get_online_cpus()
627 585
628 # The constant is needed as a buffer for python libs, std 586 # The constant is needed as a buffer for python libs, std
629 # streams and other files that the script opens. 587 # streams and other files that the script opens.
@@ -671,9 +629,6 @@ class TracepointProvider(object):
671 629
672 self.group_leaders.append(group) 630 self.group_leaders.append(group)
673 631
674 def available_fields(self):
675 return self.get_available_fields()
676
677 @property 632 @property
678 def fields(self): 633 def fields(self):
679 return self._fields 634 return self._fields
@@ -707,7 +662,7 @@ class TracepointProvider(object):
707 self.setup_traces() 662 self.setup_traces()
708 self.fields = self._fields 663 self.fields = self._fields
709 664
710 def read(self): 665 def read(self, by_guest=0):
711 """Returns 'event name: current value' for all enabled events.""" 666 """Returns 'event name: current value' for all enabled events."""
712 ret = defaultdict(int) 667 ret = defaultdict(int)
713 for group in self.group_leaders: 668 for group in self.group_leaders:
@@ -723,16 +678,17 @@ class TracepointProvider(object):
723 event.reset() 678 event.reset()
724 679
725 680
726class DebugfsProvider(object): 681class DebugfsProvider(Provider):
727 """Provides data from the files that KVM creates in the kvm debugfs 682 """Provides data from the files that KVM creates in the kvm debugfs
728 folder.""" 683 folder."""
729 def __init__(self): 684 def __init__(self, pid, fields_filter, include_past):
730 self._fields = self.get_available_fields() 685 self.update_fields(fields_filter)
731 self._baseline = {} 686 self._baseline = {}
732 self._pid = 0
733 self.do_read = True 687 self.do_read = True
734 self.paths = [] 688 self.paths = []
735 self.reset() 689 self.pid = pid
690 if include_past:
691 self.restore()
736 692
737 def get_available_fields(self): 693 def get_available_fields(self):
738 """"Returns a list of available fields. 694 """"Returns a list of available fields.
@@ -740,7 +696,12 @@ class DebugfsProvider(object):
740 The fields are all available KVM debugfs files 696 The fields are all available KVM debugfs files
741 697
742 """ 698 """
743 return walkdir(PATH_DEBUGFS_KVM)[2] 699 return self.walkdir(PATH_DEBUGFS_KVM)[2]
700
701 def update_fields(self, fields_filter):
702 """Refresh fields, applying fields_filter"""
703 self._fields = [field for field in self.get_available_fields()
704 if self.is_field_wanted(fields_filter, field)]
744 705
745 @property 706 @property
746 def fields(self): 707 def fields(self):
@@ -757,10 +718,9 @@ class DebugfsProvider(object):
757 718
758 @pid.setter 719 @pid.setter
759 def pid(self, pid): 720 def pid(self, pid):
721 self._pid = pid
760 if pid != 0: 722 if pid != 0:
761 self._pid = pid 723 vms = self.walkdir(PATH_DEBUGFS_KVM)[1]
762
763 vms = walkdir(PATH_DEBUGFS_KVM)[1]
764 if len(vms) == 0: 724 if len(vms) == 0:
765 self.do_read = False 725 self.do_read = False
766 726
@@ -771,8 +731,15 @@ class DebugfsProvider(object):
771 self.do_read = True 731 self.do_read = True
772 self.reset() 732 self.reset()
773 733
774 def read(self, reset=0): 734 def read(self, reset=0, by_guest=0):
775 """Returns a dict with format:'file name / field -> current value'.""" 735 """Returns a dict with format:'file name / field -> current value'.
736
737 Parameter 'reset':
738 0 plain read
739 1 reset field counts to 0
740 2 restore the original field counts
741
742 """
776 results = {} 743 results = {}
777 744
778 # If no debugfs filtering support is available, then don't read. 745 # If no debugfs filtering support is available, then don't read.
@@ -789,12 +756,22 @@ class DebugfsProvider(object):
789 for field in self._fields: 756 for field in self._fields:
790 value = self.read_field(field, path) 757 value = self.read_field(field, path)
791 key = path + field 758 key = path + field
792 if reset: 759 if reset == 1:
793 self._baseline[key] = value 760 self._baseline[key] = value
761 if reset == 2:
762 self._baseline[key] = 0
794 if self._baseline.get(key, -1) == -1: 763 if self._baseline.get(key, -1) == -1:
795 self._baseline[key] = value 764 self._baseline[key] = value
796 results[field] = (results.get(field, 0) + value - 765 increment = (results.get(field, 0) + value -
797 self._baseline.get(key, 0)) 766 self._baseline.get(key, 0))
767 if by_guest:
768 pid = key.split('-')[0]
769 if pid in results:
770 results[pid] += increment
771 else:
772 results[pid] = increment
773 else:
774 results[field] = increment
798 775
799 return results 776 return results
800 777
@@ -813,6 +790,11 @@ class DebugfsProvider(object):
813 self._baseline = {} 790 self._baseline = {}
814 self.read(1) 791 self.read(1)
815 792
793 def restore(self):
794 """Reset field counters"""
795 self._baseline = {}
796 self.read(2)
797
816 798
817class Stats(object): 799class Stats(object):
818 """Manages the data providers and the data they provide. 800 """Manages the data providers and the data they provide.
@@ -821,33 +803,32 @@ class Stats(object):
821 provider data. 803 provider data.
822 804
823 """ 805 """
824 def __init__(self, providers, pid, fields=None): 806 def __init__(self, options):
825 self.providers = providers 807 self.providers = self.get_providers(options)
826 self._pid_filter = pid 808 self._pid_filter = options.pid
827 self._fields_filter = fields 809 self._fields_filter = options.fields
828 self.values = {} 810 self.values = {}
829 self.update_provider_pid() 811
830 self.update_provider_filters() 812 @staticmethod
813 def get_providers(options):
814 """Returns a list of data providers depending on the passed options."""
815 providers = []
816
817 if options.debugfs:
818 providers.append(DebugfsProvider(options.pid, options.fields,
819 options.dbgfs_include_past))
820 if options.tracepoints or not providers:
821 providers.append(TracepointProvider(options.pid, options.fields))
822
823 return providers
831 824
832 def update_provider_filters(self): 825 def update_provider_filters(self):
833 """Propagates fields filters to providers.""" 826 """Propagates fields filters to providers."""
834 def wanted(key):
835 if not self._fields_filter:
836 return True
837 return re.match(self._fields_filter, key) is not None
838
839 # As we reset the counters when updating the fields we can 827 # As we reset the counters when updating the fields we can
840 # also clear the cache of old values. 828 # also clear the cache of old values.
841 self.values = {} 829 self.values = {}
842 for provider in self.providers: 830 for provider in self.providers:
843 provider_fields = [key for key in provider.get_available_fields() 831 provider.update_fields(self._fields_filter)
844 if wanted(key)]
845 provider.fields = provider_fields
846
847 def update_provider_pid(self):
848 """Propagates pid filters to providers."""
849 for provider in self.providers:
850 provider.pid = self._pid_filter
851 832
852 def reset(self): 833 def reset(self):
853 self.values = {} 834 self.values = {}
@@ -873,27 +854,52 @@ class Stats(object):
873 if pid != self._pid_filter: 854 if pid != self._pid_filter:
874 self._pid_filter = pid 855 self._pid_filter = pid
875 self.values = {} 856 self.values = {}
876 self.update_provider_pid() 857 for provider in self.providers:
858 provider.pid = self._pid_filter
877 859
878 def get(self): 860 def get(self, by_guest=0):
879 """Returns a dict with field -> (value, delta to last value) of all 861 """Returns a dict with field -> (value, delta to last value) of all
880 provider data.""" 862 provider data."""
881 for provider in self.providers: 863 for provider in self.providers:
882 new = provider.read() 864 new = provider.read(by_guest=by_guest)
883 for key in provider.fields: 865 for key in new if by_guest else provider.fields:
884 oldval = self.values.get(key, (0, 0))[0] 866 oldval = self.values.get(key, (0, 0))[0]
885 newval = new.get(key, 0) 867 newval = new.get(key, 0)
886 newdelta = newval - oldval 868 newdelta = newval - oldval
887 self.values[key] = (newval, newdelta) 869 self.values[key] = (newval, newdelta)
888 return self.values 870 return self.values
889 871
890LABEL_WIDTH = 40 872 def toggle_display_guests(self, to_pid):
891NUMBER_WIDTH = 10 873 """Toggle between collection of stats by individual event and by
892DELAY_INITIAL = 0.25 874 guest pid
893DELAY_REGULAR = 3.0 875
876 Events reported by DebugfsProvider change when switching to/from
877 reading by guest values. Hence we have to remove the excess event
878 names from self.values.
879
880 """
881 if any(isinstance(ins, TracepointProvider) for ins in self.providers):
882 return 1
883 if to_pid:
884 for provider in self.providers:
885 if isinstance(provider, DebugfsProvider):
886 for key in provider.fields:
887 if key in self.values.keys():
888 del self.values[key]
889 else:
890 oldvals = self.values.copy()
891 for key in oldvals:
892 if key.isdigit():
893 del self.values[key]
894 # Update oldval (see get())
895 self.get(to_pid)
896 return 0
897
898DELAY_DEFAULT = 3.0
894MAX_GUEST_NAME_LEN = 48 899MAX_GUEST_NAME_LEN = 48
895MAX_REGEX_LEN = 44 900MAX_REGEX_LEN = 44
896DEFAULT_REGEX = r'^[^\(]*$' 901DEFAULT_REGEX = r'^[^\(]*$'
902SORT_DEFAULT = 0
897 903
898 904
899class Tui(object): 905class Tui(object):
@@ -901,7 +907,10 @@ class Tui(object):
901 def __init__(self, stats): 907 def __init__(self, stats):
902 self.stats = stats 908 self.stats = stats
903 self.screen = None 909 self.screen = None
904 self.update_drilldown() 910 self._delay_initial = 0.25
911 self._delay_regular = DELAY_DEFAULT
912 self._sorting = SORT_DEFAULT
913 self._display_guests = 0
905 914
906 def __enter__(self): 915 def __enter__(self):
907 """Initialises curses for later use. Based on curses.wrapper 916 """Initialises curses for later use. Based on curses.wrapper
@@ -929,7 +938,7 @@ class Tui(object):
929 return self 938 return self
930 939
931 def __exit__(self, *exception): 940 def __exit__(self, *exception):
932 """Resets the terminal to its normal state. Based on curses.wrappre 941 """Resets the terminal to its normal state. Based on curses.wrapper
933 implementation from the Python standard library.""" 942 implementation from the Python standard library."""
934 if self.screen: 943 if self.screen:
935 self.screen.keypad(0) 944 self.screen.keypad(0)
@@ -937,6 +946,86 @@ class Tui(object):
937 curses.nocbreak() 946 curses.nocbreak()
938 curses.endwin() 947 curses.endwin()
939 948
949 def get_all_gnames(self):
950 """Returns a list of (pid, gname) tuples of all running guests"""
951 res = []
952 try:
953 child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'],
954 stdout=subprocess.PIPE)
955 except:
956 raise Exception
957 for line in child.stdout:
958 line = line.lstrip().split(' ', 1)
959 # perform a sanity check before calling the more expensive
960 # function to possibly extract the guest name
961 if ' -name ' in line[1]:
962 res.append((line[0], self.get_gname_from_pid(line[0])))
963 child.stdout.close()
964
965 return res
966
967 def print_all_gnames(self, row):
968 """Print a list of all running guests along with their pids."""
969 self.screen.addstr(row, 2, '%8s %-60s' %
970 ('Pid', 'Guest Name (fuzzy list, might be '
971 'inaccurate!)'),
972 curses.A_UNDERLINE)
973 row += 1
974 try:
975 for line in self.get_all_gnames():
976 self.screen.addstr(row, 2, '%8s %-60s' % (line[0], line[1]))
977 row += 1
978 if row >= self.screen.getmaxyx()[0]:
979 break
980 except Exception:
981 self.screen.addstr(row + 1, 2, 'Not available')
982
983 def get_pid_from_gname(self, gname):
984 """Fuzzy function to convert guest name to QEMU process pid.
985
986 Returns a list of potential pids, can be empty if no match found.
987 Throws an exception on processing errors.
988
989 """
990 pids = []
991 for line in self.get_all_gnames():
992 if gname == line[1]:
993 pids.append(int(line[0]))
994
995 return pids
996
997 @staticmethod
998 def get_gname_from_pid(pid):
999 """Returns the guest name for a QEMU process pid.
1000
 1001 Extracts the guest name from the QEMU command line by processing the
1002 '-name' option. Will also handle names specified out of sequence.
1003
1004 """
1005 name = ''
1006 try:
1007 line = open('/proc/{}/cmdline'
1008 .format(pid), 'rb').read().split('\0')
1009 parms = line[line.index('-name') + 1].split(',')
1010 while '' in parms:
 1011 # a literal comma in the name is escaped as ',,', hence e.g. 'foo,bar'
 1012 # results in ['foo', '', 'bar'] after the split, which we revert here
1013 idx = parms.index('')
1014 parms[idx - 1] += ',' + parms[idx + 1]
1015 del parms[idx:idx+2]
1016 # the '-name' switch allows for two ways to specify the guest name,
1017 # where the plain name overrides the name specified via 'guest='
1018 for arg in parms:
1019 if '=' not in arg:
1020 name = arg
1021 break
1022 if arg[:6] == 'guest=':
1023 name = arg[6:]
1024 except (ValueError, IOError, IndexError):
1025 pass
1026
1027 return name
1028
940 def update_drilldown(self): 1029 def update_drilldown(self):
941 """Sets or removes a filter that only allows fields without braces.""" 1030 """Sets or removes a filter that only allows fields without braces."""
942 if not self.stats.fields_filter: 1031 if not self.stats.fields_filter:
@@ -954,7 +1043,7 @@ class Tui(object):
954 if pid is None: 1043 if pid is None:
955 pid = self.stats.pid_filter 1044 pid = self.stats.pid_filter
956 self.screen.erase() 1045 self.screen.erase()
957 gname = get_gname_from_pid(pid) 1046 gname = self.get_gname_from_pid(pid)
958 if gname: 1047 if gname:
959 gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' 1048 gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...'
960 if len(gname) > MAX_GUEST_NAME_LEN 1049 if len(gname) > MAX_GUEST_NAME_LEN
@@ -970,13 +1059,13 @@ class Tui(object):
970 if len(regex) > MAX_REGEX_LEN: 1059 if len(regex) > MAX_REGEX_LEN:
971 regex = regex[:MAX_REGEX_LEN] + '...' 1060 regex = regex[:MAX_REGEX_LEN] + '...'
972 self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex)) 1061 self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex))
973 self.screen.addstr(2, 1, 'Event') 1062 if self._display_guests:
974 self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - 1063 col_name = 'Guest Name'
975 len('Total'), 'Total') 1064 else:
976 self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 - 1065 col_name = 'Event'
977 len('%Total'), '%Total') 1066 self.screen.addstr(2, 1, '%-40s %10s%7s %8s' %
978 self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 - 1067 (col_name, 'Total', '%Total', 'CurAvg/s'),
979 len('Current'), 'Current') 1068 curses.A_STANDOUT)
980 self.screen.addstr(4, 1, 'Collecting data...') 1069 self.screen.addstr(4, 1, 'Collecting data...')
981 self.screen.refresh() 1070 self.screen.refresh()
982 1071
@@ -984,16 +1073,25 @@ class Tui(object):
984 row = 3 1073 row = 3
985 self.screen.move(row, 0) 1074 self.screen.move(row, 0)
986 self.screen.clrtobot() 1075 self.screen.clrtobot()
987 stats = self.stats.get() 1076 stats = self.stats.get(self._display_guests)
988 1077
989 def sortkey(x): 1078 def sortCurAvg(x):
1079 # sort by current events if available
990 if stats[x][1]: 1080 if stats[x][1]:
991 return (-stats[x][1], -stats[x][0]) 1081 return (-stats[x][1], -stats[x][0])
992 else: 1082 else:
993 return (0, -stats[x][0]) 1083 return (0, -stats[x][0])
1084
1085 def sortTotal(x):
1086 # sort by totals
1087 return (0, -stats[x][0])
994 total = 0. 1088 total = 0.
995 for val in stats.values(): 1089 for val in stats.values():
996 total += val[0] 1090 total += val[0]
1091 if self._sorting == SORT_DEFAULT:
1092 sortkey = sortCurAvg
1093 else:
1094 sortkey = sortTotal
997 for key in sorted(stats.keys(), key=sortkey): 1095 for key in sorted(stats.keys(), key=sortkey):
998 1096
999 if row >= self.screen.getmaxyx()[0]: 1097 if row >= self.screen.getmaxyx()[0]:
@@ -1001,18 +1099,61 @@ class Tui(object):
1001 values = stats[key] 1099 values = stats[key]
1002 if not values[0] and not values[1]: 1100 if not values[0] and not values[1]:
1003 break 1101 break
1004 col = 1 1102 if values[0] is not None:
1005 self.screen.addstr(row, col, key) 1103 cur = int(round(values[1] / sleeptime)) if values[1] else ''
1006 col += LABEL_WIDTH 1104 if self._display_guests:
1007 self.screen.addstr(row, col, '%10d' % (values[0],)) 1105 key = self.get_gname_from_pid(key)
1008 col += NUMBER_WIDTH 1106 self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' %
1009 self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,)) 1107 (key, values[0], values[0] * 100 / total,
1010 col += 7 1108 cur))
1011 if values[1] is not None:
1012 self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
1013 row += 1 1109 row += 1
1110 if row == 3:
1111 self.screen.addstr(4, 1, 'No matching events reported yet')
1014 self.screen.refresh() 1112 self.screen.refresh()
1015 1113
1114 def show_msg(self, text):
 1115 """Display a centered message and wait for a key press."""
1116 hint = 'Press any key to continue'
1117 curses.cbreak()
1118 self.screen.erase()
1119 (x, term_width) = self.screen.getmaxyx()
1120 row = 2
1121 for line in text:
1122 start = (term_width - len(line)) / 2
1123 self.screen.addstr(row, start, line)
1124 row += 1
1125 self.screen.addstr(row + 1, (term_width - len(hint)) / 2, hint,
1126 curses.A_STANDOUT)
1127 self.screen.getkey()
1128
1129 def show_help_interactive(self):
1130 """Display help with list of interactive commands"""
1131 msg = (' b toggle events by guests (debugfs only, honors'
1132 ' filters)',
1133 ' c clear filter',
1134 ' f filter by regular expression',
1135 ' g filter by guest name',
1136 ' h display interactive commands reference',
1137 ' o toggle sorting order (Total vs CurAvg/s)',
1138 ' p filter by PID',
1139 ' q quit',
1140 ' r reset stats',
1141 ' s set update interval',
1142 ' x toggle reporting of stats for individual child trace'
1143 ' events',
1144 'Any other key refreshes statistics immediately')
1145 curses.cbreak()
1146 self.screen.erase()
1147 self.screen.addstr(0, 0, "Interactive commands reference",
1148 curses.A_BOLD)
1149 self.screen.addstr(2, 0, "Press any key to exit", curses.A_STANDOUT)
1150 row = 4
1151 for line in msg:
1152 self.screen.addstr(row, 0, line)
1153 row += 1
1154 self.screen.getkey()
1155 self.refresh_header()
1156
1016 def show_filter_selection(self): 1157 def show_filter_selection(self):
1017 """Draws filter selection mask. 1158 """Draws filter selection mask.
1018 1159
@@ -1059,6 +1200,7 @@ class Tui(object):
1059 'This might limit the shown data to the trace ' 1200 'This might limit the shown data to the trace '
1060 'statistics.') 1201 'statistics.')
1061 self.screen.addstr(5, 0, msg) 1202 self.screen.addstr(5, 0, msg)
1203 self.print_all_gnames(7)
1062 1204
1063 curses.echo() 1205 curses.echo()
1064 self.screen.addstr(3, 0, "Pid [0 or pid]: ") 1206 self.screen.addstr(3, 0, "Pid [0 or pid]: ")
@@ -1077,10 +1219,40 @@ class Tui(object):
1077 self.refresh_header(pid) 1219 self.refresh_header(pid)
1078 self.update_pid(pid) 1220 self.update_pid(pid)
1079 break 1221 break
1080
1081 except ValueError: 1222 except ValueError:
1082 msg = '"' + str(pid) + '": Not a valid pid' 1223 msg = '"' + str(pid) + '": Not a valid pid'
1083 continue 1224
1225 def show_set_update_interval(self):
1226 """Draws update interval selection mask."""
1227 msg = ''
1228 while True:
1229 self.screen.erase()
1230 self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' %
1231 DELAY_DEFAULT, curses.A_BOLD)
1232 self.screen.addstr(4, 0, msg)
1233 self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
1234 self._delay_regular)
1235 curses.echo()
1236 val = self.screen.getstr()
1237 curses.noecho()
1238
1239 try:
1240 if len(val) > 0:
1241 delay = float(val)
1242 if delay < 0.1:
1243 msg = '"' + str(val) + '": Value must be >=0.1'
1244 continue
1245 if delay > 25.5:
1246 msg = '"' + str(val) + '": Value must be <=25.5'
1247 continue
1248 else:
1249 delay = DELAY_DEFAULT
1250 self._delay_regular = delay
1251 break
1252
1253 except ValueError:
1254 msg = '"' + str(val) + '": Invalid value'
1255 self.refresh_header()
1084 1256
1085 def show_vm_selection_by_guest_name(self): 1257 def show_vm_selection_by_guest_name(self):
1086 """Draws guest selection mask. 1258 """Draws guest selection mask.
@@ -1098,6 +1270,7 @@ class Tui(object):
1098 'This might limit the shown data to the trace ' 1270 'This might limit the shown data to the trace '
1099 'statistics.') 1271 'statistics.')
1100 self.screen.addstr(5, 0, msg) 1272 self.screen.addstr(5, 0, msg)
1273 self.print_all_gnames(7)
1101 curses.echo() 1274 curses.echo()
1102 self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") 1275 self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
1103 gname = self.screen.getstr() 1276 gname = self.screen.getstr()
@@ -1110,7 +1283,7 @@ class Tui(object):
1110 else: 1283 else:
1111 pids = [] 1284 pids = []
1112 try: 1285 try:
1113 pids = get_pid_from_gname(gname) 1286 pids = self.get_pid_from_gname(gname)
1114 except: 1287 except:
1115 msg = '"' + gname + '": Internal error while searching, ' \ 1288 msg = '"' + gname + '": Internal error while searching, ' \
1116 'use pid filter instead' 1289 'use pid filter instead'
@@ -1128,38 +1301,60 @@ class Tui(object):
1128 1301
1129 def show_stats(self): 1302 def show_stats(self):
1130 """Refreshes the screen and processes user input.""" 1303 """Refreshes the screen and processes user input."""
1131 sleeptime = DELAY_INITIAL 1304 sleeptime = self._delay_initial
1132 self.refresh_header() 1305 self.refresh_header()
1306 start = 0.0 # result based on init value never appears on screen
1133 while True: 1307 while True:
1134 self.refresh_body(sleeptime) 1308 self.refresh_body(time.time() - start)
1135 curses.halfdelay(int(sleeptime * 10)) 1309 curses.halfdelay(int(sleeptime * 10))
1136 sleeptime = DELAY_REGULAR 1310 start = time.time()
1311 sleeptime = self._delay_regular
1137 try: 1312 try:
1138 char = self.screen.getkey() 1313 char = self.screen.getkey()
1139 if char == 'x': 1314 if char == 'b':
1315 self._display_guests = not self._display_guests
1316 if self.stats.toggle_display_guests(self._display_guests):
1317 self.show_msg(['Command not available with tracepoints'
1318 ' enabled', 'Restart with debugfs only '
1319 '(see option \'-d\') and try again!'])
1320 self._display_guests = not self._display_guests
1140 self.refresh_header() 1321 self.refresh_header()
1141 self.update_drilldown()
1142 sleeptime = DELAY_INITIAL
1143 if char == 'q':
1144 break
1145 if char == 'c': 1322 if char == 'c':
1146 self.stats.fields_filter = DEFAULT_REGEX 1323 self.stats.fields_filter = DEFAULT_REGEX
1147 self.refresh_header(0) 1324 self.refresh_header(0)
1148 self.update_pid(0) 1325 self.update_pid(0)
1149 sleeptime = DELAY_INITIAL
1150 if char == 'f': 1326 if char == 'f':
1327 curses.curs_set(1)
1151 self.show_filter_selection() 1328 self.show_filter_selection()
1152 sleeptime = DELAY_INITIAL 1329 curses.curs_set(0)
1330 sleeptime = self._delay_initial
1153 if char == 'g': 1331 if char == 'g':
1332 curses.curs_set(1)
1154 self.show_vm_selection_by_guest_name() 1333 self.show_vm_selection_by_guest_name()
1155 sleeptime = DELAY_INITIAL 1334 curses.curs_set(0)
1335 sleeptime = self._delay_initial
1336 if char == 'h':
1337 self.show_help_interactive()
1338 if char == 'o':
1339 self._sorting = not self._sorting
1156 if char == 'p': 1340 if char == 'p':
1341 curses.curs_set(1)
1157 self.show_vm_selection_by_pid() 1342 self.show_vm_selection_by_pid()
1158 sleeptime = DELAY_INITIAL 1343 curses.curs_set(0)
1344 sleeptime = self._delay_initial
1345 if char == 'q':
1346 break
1159 if char == 'r': 1347 if char == 'r':
1160 self.refresh_header()
1161 self.stats.reset() 1348 self.stats.reset()
1162 sleeptime = DELAY_INITIAL 1349 if char == 's':
1350 curses.curs_set(1)
1351 self.show_set_update_interval()
1352 curses.curs_set(0)
1353 sleeptime = self._delay_initial
1354 if char == 'x':
1355 self.update_drilldown()
1356 # prevents display of current values on next refresh
1357 self.stats.get()
1163 except KeyboardInterrupt: 1358 except KeyboardInterrupt:
1164 break 1359 break
1165 except curses.error: 1360 except curses.error:
@@ -1227,13 +1422,17 @@ Requirements:
1227 the large number of files that are possibly opened. 1422 the large number of files that are possibly opened.
1228 1423
1229Interactive Commands: 1424Interactive Commands:
1425 b toggle events by guests (debugfs only, honors filters)
1230 c clear filter 1426 c clear filter
1231 f filter by regular expression 1427 f filter by regular expression
1232 g filter by guest name 1428 g filter by guest name
1429 h display interactive commands reference
1430 o toggle sorting order (Total vs CurAvg/s)
1233 p filter by PID 1431 p filter by PID
1234 q quit 1432 q quit
1235 x toggle reporting of stats for individual child trace events
1236 r reset stats 1433 r reset stats
1434 s set update interval
1435 x toggle reporting of stats for individual child trace events
1237Press any other key to refresh statistics immediately. 1436Press any other key to refresh statistics immediately.
1238""" 1437"""
1239 1438
@@ -1246,7 +1445,7 @@ Press any other key to refresh statistics immediately.
1246 1445
1247 def cb_guest_to_pid(option, opt, val, parser): 1446 def cb_guest_to_pid(option, opt, val, parser):
1248 try: 1447 try:
1249 pids = get_pid_from_gname(val) 1448 pids = Tui.get_pid_from_gname(val)
1250 except: 1449 except:
1251 raise optparse.OptionValueError('Error while searching for guest ' 1450 raise optparse.OptionValueError('Error while searching for guest '
1252 '"{}", use "-p" to specify a pid ' 1451 '"{}", use "-p" to specify a pid '
@@ -1268,6 +1467,13 @@ Press any other key to refresh statistics immediately.
1268 dest='once', 1467 dest='once',
1269 help='run in batch mode for one second', 1468 help='run in batch mode for one second',
1270 ) 1469 )
1470 optparser.add_option('-i', '--debugfs-include-past',
1471 action='store_true',
1472 default=False,
1473 dest='dbgfs_include_past',
1474 help='include all available data on past events for '
1475 'debugfs',
1476 )
1271 optparser.add_option('-l', '--log', 1477 optparser.add_option('-l', '--log',
1272 action='store_true', 1478 action='store_true',
1273 default=False, 1479 default=False,
@@ -1288,7 +1494,7 @@ Press any other key to refresh statistics immediately.
1288 ) 1494 )
1289 optparser.add_option('-f', '--fields', 1495 optparser.add_option('-f', '--fields',
1290 action='store', 1496 action='store',
1291 default=None, 1497 default=DEFAULT_REGEX,
1292 dest='fields', 1498 dest='fields',
1293 help='fields to display (regex)', 1499 help='fields to display (regex)',
1294 ) 1500 )
@@ -1311,20 +1517,6 @@ Press any other key to refresh statistics immediately.
1311 return options 1517 return options
1312 1518
1313 1519
1314def get_providers(options):
1315 """Returns a list of data providers depending on the passed options."""
1316 providers = []
1317
1318 if options.tracepoints:
1319 providers.append(TracepointProvider())
1320 if options.debugfs:
1321 providers.append(DebugfsProvider())
1322 if len(providers) == 0:
1323 providers.append(TracepointProvider())
1324
1325 return providers
1326
1327
1328def check_access(options): 1520def check_access(options):
1329 """Exits if the current user can't access all needed directories.""" 1521 """Exits if the current user can't access all needed directories."""
1330 if not os.path.exists('/sys/kernel/debug'): 1522 if not os.path.exists('/sys/kernel/debug'):
@@ -1365,8 +1557,7 @@ def main():
1365 sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n') 1557 sys.stderr.write('Did you use a (unsupported) tid instead of a pid?\n')
1366 sys.exit('Specified pid does not exist.') 1558 sys.exit('Specified pid does not exist.')
1367 1559
1368 providers = get_providers(options) 1560 stats = Stats(options)
1369 stats = Stats(providers, options.pid, fields=options.fields)
1370 1561
1371 if options.log: 1562 if options.log:
1372 log(stats) 1563 log(stats)
diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt
index 109431bdc63c..e5cf836be8a1 100644
--- a/tools/kvm/kvm_stat/kvm_stat.txt
+++ b/tools/kvm/kvm_stat/kvm_stat.txt
@@ -29,18 +29,26 @@ meaning of events.
29INTERACTIVE COMMANDS 29INTERACTIVE COMMANDS
30-------------------- 30--------------------
31[horizontal] 31[horizontal]
32*b*:: toggle events by guests (debugfs only, honors filters)
33
32*c*:: clear filter 34*c*:: clear filter
33 35
34*f*:: filter by regular expression 36*f*:: filter by regular expression
35 37
36*g*:: filter by guest name 38*g*:: filter by guest name
37 39
40*h*:: display interactive commands reference
41
42*o*:: toggle sorting order (Total vs CurAvg/s)
43
38*p*:: filter by PID 44*p*:: filter by PID
39 45
40*q*:: quit 46*q*:: quit
41 47
42*r*:: reset stats 48*r*:: reset stats
43 49
50*s*:: set update interval
51
44*x*:: toggle reporting of stats for child trace events 52*x*:: toggle reporting of stats for child trace events
45 53
46Press any other key to refresh statistics immediately. 54Press any other key to refresh statistics immediately.
@@ -64,6 +72,10 @@ OPTIONS
64--debugfs:: 72--debugfs::
65 retrieve statistics from debugfs 73 retrieve statistics from debugfs
66 74
75-i::
76--debugfs-include-past::
77 include all available data on past events for debugfs
78
67-p<pid>:: 79-p<pid>::
68--pid=<pid>:: 80--pid=<pid>::
69 limit statistics to one virtual machine (pid) 81 limit statistics to one virtual machine (pid)
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index 528af4b2d09e..79c7c357804b 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -60,7 +60,7 @@ static const unsigned short cc_map[16] = {
60/* 60/*
61 * Check if a trapped instruction should have been executed or not. 61 * Check if a trapped instruction should have been executed or not.
62 */ 62 */
63bool kvm_condition_valid32(const struct kvm_vcpu *vcpu) 63bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
64{ 64{
65 unsigned long cpsr; 65 unsigned long cpsr;
66 u32 cpsr_cond; 66 u32 cpsr_cond;
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 5976609ef27c..8e89d63005c7 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -21,6 +21,7 @@
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/irq.h> 23#include <linux/irq.h>
24#include <linux/uaccess.h>
24 25
25#include <clocksource/arm_arch_timer.h> 26#include <clocksource/arm_arch_timer.h>
26#include <asm/arch_timer.h> 27#include <asm/arch_timer.h>
@@ -35,6 +36,16 @@ static struct timecounter *timecounter;
35static unsigned int host_vtimer_irq; 36static unsigned int host_vtimer_irq;
36static u32 host_vtimer_irq_flags; 37static u32 host_vtimer_irq_flags;
37 38
39static const struct kvm_irq_level default_ptimer_irq = {
40 .irq = 30,
41 .level = 1,
42};
43
44static const struct kvm_irq_level default_vtimer_irq = {
45 .irq = 27,
46 .level = 1,
47};
48
38void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 49void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
39{ 50{
40 vcpu_vtimer(vcpu)->active_cleared_last = false; 51 vcpu_vtimer(vcpu)->active_cleared_last = false;
@@ -95,7 +106,7 @@ static void kvm_timer_inject_irq_work(struct work_struct *work)
95 * If the vcpu is blocked we want to wake it up so that it will see 106 * If the vcpu is blocked we want to wake it up so that it will see
96 * the timer has expired when entering the guest. 107 * the timer has expired when entering the guest.
97 */ 108 */
98 kvm_vcpu_kick(vcpu); 109 kvm_vcpu_wake_up(vcpu);
99} 110}
100 111
101static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) 112static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
@@ -215,7 +226,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
215 if (likely(irqchip_in_kernel(vcpu->kvm))) { 226 if (likely(irqchip_in_kernel(vcpu->kvm))) {
216 ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 227 ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
217 timer_ctx->irq.irq, 228 timer_ctx->irq.irq,
218 timer_ctx->irq.level); 229 timer_ctx->irq.level,
230 timer_ctx);
219 WARN_ON(ret); 231 WARN_ON(ret);
220 } 232 }
221} 233}
@@ -445,23 +457,12 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
445 kvm_timer_update_state(vcpu); 457 kvm_timer_update_state(vcpu);
446} 458}
447 459
448int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, 460int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
449 const struct kvm_irq_level *virt_irq,
450 const struct kvm_irq_level *phys_irq)
451{ 461{
452 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 462 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
453 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 463 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
454 464
455 /* 465 /*
456 * The vcpu timer irq number cannot be determined in
457 * kvm_timer_vcpu_init() because it is called much before
458 * kvm_vcpu_set_target(). To handle this, we determine
459 * vcpu timer irq number when the vcpu is reset.
460 */
461 vtimer->irq.irq = virt_irq->irq;
462 ptimer->irq.irq = phys_irq->irq;
463
464 /*
465 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 466 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
466 * and to 0 for ARMv7. We provide an implementation that always 467 * and to 0 for ARMv7. We provide an implementation that always
467 * resets the timer to be disabled and unmasked and is compliant with 468 * resets the timer to be disabled and unmasked and is compliant with
@@ -496,6 +497,8 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
496void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) 497void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
497{ 498{
498 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 499 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
500 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
501 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
499 502
500 /* Synchronize cntvoff across all vtimers of a VM. */ 503 /* Synchronize cntvoff across all vtimers of a VM. */
501 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); 504 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
@@ -504,6 +507,9 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
504 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); 507 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
505 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 508 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
506 timer->timer.function = kvm_timer_expire; 509 timer->timer.function = kvm_timer_expire;
510
511 vtimer->irq.irq = default_vtimer_irq.irq;
512 ptimer->irq.irq = default_ptimer_irq.irq;
507} 513}
508 514
509static void kvm_timer_init_interrupt(void *info) 515static void kvm_timer_init_interrupt(void *info)
@@ -613,6 +619,30 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
613 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); 619 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
614} 620}
615 621
622static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
623{
624 int vtimer_irq, ptimer_irq;
625 int i, ret;
626
627 vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
628 ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
629 if (ret)
630 return false;
631
632 ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
633 ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
634 if (ret)
635 return false;
636
637 kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
638 if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
639 vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
640 return false;
641 }
642
643 return true;
644}
645
616int kvm_timer_enable(struct kvm_vcpu *vcpu) 646int kvm_timer_enable(struct kvm_vcpu *vcpu)
617{ 647{
618 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 648 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -632,6 +662,11 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
632 if (!vgic_initialized(vcpu->kvm)) 662 if (!vgic_initialized(vcpu->kvm))
633 return -ENODEV; 663 return -ENODEV;
634 664
665 if (!timer_irqs_are_valid(vcpu)) {
666 kvm_debug("incorrectly configured timer irqs\n");
667 return -EINVAL;
668 }
669
635 /* 670 /*
636 * Find the physical IRQ number corresponding to the host_vtimer_irq 671 * Find the physical IRQ number corresponding to the host_vtimer_irq
637 */ 672 */
@@ -681,3 +716,79 @@ void kvm_timer_init_vhe(void)
681 val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); 716 val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
682 write_sysreg(val, cnthctl_el2); 717 write_sysreg(val, cnthctl_el2);
683} 718}
719
720static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
721{
722 struct kvm_vcpu *vcpu;
723 int i;
724
725 kvm_for_each_vcpu(i, vcpu, kvm) {
726 vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
727 vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
728 }
729}
730
731int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
732{
733 int __user *uaddr = (int __user *)(long)attr->addr;
734 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
735 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
736 int irq;
737
738 if (!irqchip_in_kernel(vcpu->kvm))
739 return -EINVAL;
740
741 if (get_user(irq, uaddr))
742 return -EFAULT;
743
744 if (!(irq_is_ppi(irq)))
745 return -EINVAL;
746
747 if (vcpu->arch.timer_cpu.enabled)
748 return -EBUSY;
749
750 switch (attr->attr) {
751 case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
752 set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
753 break;
754 case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
755 set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
756 break;
757 default:
758 return -ENXIO;
759 }
760
761 return 0;
762}
763
764int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
765{
766 int __user *uaddr = (int __user *)(long)attr->addr;
767 struct arch_timer_context *timer;
768 int irq;
769
770 switch (attr->attr) {
771 case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
772 timer = vcpu_vtimer(vcpu);
773 break;
774 case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
775 timer = vcpu_ptimer(vcpu);
776 break;
777 default:
778 return -ENXIO;
779 }
780
781 irq = timer->irq.irq;
782 return put_user(irq, uaddr);
783}
784
785int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
786{
787 switch (attr->attr) {
788 case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
789 case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
790 return 0;
791 }
792
793 return -ENXIO;
794}
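
The three kvm_arm_timer_*_attr() handlers above expose the per-VCPU timer interrupt numbers to userspace. As a rough sketch (not part of this series) of how a VMM might drive them: the attribute group name KVM_ARM_VCPU_TIMER_CTRL and issuing KVM_SET_DEVICE_ATTR on the vcpu fd are assumptions here, see Documentation/virtual/kvm/devices/vcpu.txt for the authoritative interface.

/*
 * Hypothetical userspace sketch: select the guest's virtual or physical
 * timer PPI via the per-VCPU device attributes backed by
 * kvm_arm_timer_set_attr(). KVM_ARM_VCPU_TIMER_CTRL is an assumed group
 * name; the attr values are the ones added by this patch.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_timer_irq(int vcpu_fd, uint64_t which, uint32_t ppi)
{
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_TIMER_CTRL,	/* assumed group name */
		.attr  = which,		/* KVM_ARM_VCPU_TIMER_IRQ_VTIMER or ..._PTIMER */
		.addr  = (uint64_t)(unsigned long)&ppi,	/* pointer to a PPI number (16..31) */
	};

	/* Must happen before the first KVM_RUN; kvm_arm_timer_set_attr()
	 * returns -EBUSY once the timer has been enabled. */
	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}

Because set_timer_irqs() propagates the chosen PPI to every VCPU of the VM, configuring a single VCPU before any of them has run is sufficient; kvm_timer_enable() later rejects mismatched or non-PPI values via timer_irqs_are_valid().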
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 3417e184c8e1..a39a1e161e63 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -368,6 +368,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
368 kvm_timer_vcpu_put(vcpu); 368 kvm_timer_vcpu_put(vcpu);
369} 369}
370 370
371static void vcpu_power_off(struct kvm_vcpu *vcpu)
372{
373 vcpu->arch.power_off = true;
374 kvm_make_request(KVM_REQ_SLEEP, vcpu);
375 kvm_vcpu_kick(vcpu);
376}
377
371int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 378int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
372 struct kvm_mp_state *mp_state) 379 struct kvm_mp_state *mp_state)
373{ 380{
@@ -387,7 +394,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
387 vcpu->arch.power_off = false; 394 vcpu->arch.power_off = false;
388 break; 395 break;
389 case KVM_MP_STATE_STOPPED: 396 case KVM_MP_STATE_STOPPED:
390 vcpu->arch.power_off = true; 397 vcpu_power_off(vcpu);
391 break; 398 break;
392 default: 399 default:
393 return -EINVAL; 400 return -EINVAL;
@@ -520,6 +527,10 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
520 } 527 }
521 528
522 ret = kvm_timer_enable(vcpu); 529 ret = kvm_timer_enable(vcpu);
530 if (ret)
531 return ret;
532
533 ret = kvm_arm_pmu_v3_enable(vcpu);
523 534
524 return ret; 535 return ret;
525} 536}
@@ -536,21 +547,7 @@ void kvm_arm_halt_guest(struct kvm *kvm)
536 547
537 kvm_for_each_vcpu(i, vcpu, kvm) 548 kvm_for_each_vcpu(i, vcpu, kvm)
538 vcpu->arch.pause = true; 549 vcpu->arch.pause = true;
539 kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT); 550 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
540}
541
542void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu)
543{
544 vcpu->arch.pause = true;
545 kvm_vcpu_kick(vcpu);
546}
547
548void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu)
549{
550 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
551
552 vcpu->arch.pause = false;
553 swake_up(wq);
554} 551}
555 552
556void kvm_arm_resume_guest(struct kvm *kvm) 553void kvm_arm_resume_guest(struct kvm *kvm)
@@ -558,16 +555,23 @@ void kvm_arm_resume_guest(struct kvm *kvm)
558 int i; 555 int i;
559 struct kvm_vcpu *vcpu; 556 struct kvm_vcpu *vcpu;
560 557
561 kvm_for_each_vcpu(i, vcpu, kvm) 558 kvm_for_each_vcpu(i, vcpu, kvm) {
562 kvm_arm_resume_vcpu(vcpu); 559 vcpu->arch.pause = false;
560 swake_up(kvm_arch_vcpu_wq(vcpu));
561 }
563} 562}
564 563
565static void vcpu_sleep(struct kvm_vcpu *vcpu) 564static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
566{ 565{
567 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); 566 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
568 567
569 swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 568 swait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
570 (!vcpu->arch.pause))); 569 (!vcpu->arch.pause)));
570
571 if (vcpu->arch.power_off || vcpu->arch.pause) {
572 /* Awaken to handle a signal, request we sleep again later. */
573 kvm_make_request(KVM_REQ_SLEEP, vcpu);
574 }
571} 575}
572 576
573static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) 577static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -575,6 +579,20 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
575 return vcpu->arch.target >= 0; 579 return vcpu->arch.target >= 0;
576} 580}
577 581
582static void check_vcpu_requests(struct kvm_vcpu *vcpu)
583{
584 if (kvm_request_pending(vcpu)) {
585 if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
586 vcpu_req_sleep(vcpu);
587
588 /*
589 * Clear IRQ_PENDING requests that were made to guarantee
590 * that a VCPU sees new virtual interrupts.
591 */
592 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
593 }
594}
595
578/** 596/**
579 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code 597 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
580 * @vcpu: The VCPU pointer 598 * @vcpu: The VCPU pointer
@@ -620,8 +638,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
620 638
621 update_vttbr(vcpu->kvm); 639 update_vttbr(vcpu->kvm);
622 640
623 if (vcpu->arch.power_off || vcpu->arch.pause) 641 check_vcpu_requests(vcpu);
624 vcpu_sleep(vcpu);
625 642
626 /* 643 /*
627 * Preparing the interrupts to be injected also 644 * Preparing the interrupts to be injected also
@@ -650,8 +667,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
650 run->exit_reason = KVM_EXIT_INTR; 667 run->exit_reason = KVM_EXIT_INTR;
651 } 668 }
652 669
670 /*
671 * Ensure we set mode to IN_GUEST_MODE after we disable
672 * interrupts and before the final VCPU requests check.
673 * See the comment in kvm_vcpu_exiting_guest_mode() and
674 * Documentation/virtual/kvm/vcpu-requests.rst
675 */
676 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
677
653 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || 678 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
654 vcpu->arch.power_off || vcpu->arch.pause) { 679 kvm_request_pending(vcpu)) {
680 vcpu->mode = OUTSIDE_GUEST_MODE;
655 local_irq_enable(); 681 local_irq_enable();
656 kvm_pmu_sync_hwstate(vcpu); 682 kvm_pmu_sync_hwstate(vcpu);
657 kvm_timer_sync_hwstate(vcpu); 683 kvm_timer_sync_hwstate(vcpu);
@@ -667,7 +693,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
667 */ 693 */
668 trace_kvm_entry(*vcpu_pc(vcpu)); 694 trace_kvm_entry(*vcpu_pc(vcpu));
669 guest_enter_irqoff(); 695 guest_enter_irqoff();
670 vcpu->mode = IN_GUEST_MODE;
671 696
672 ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); 697 ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
673 698
@@ -756,6 +781,7 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
756 * trigger a world-switch round on the running physical CPU to set the 781 * trigger a world-switch round on the running physical CPU to set the
757 * virtual IRQ/FIQ fields in the HCR appropriately. 782 * virtual IRQ/FIQ fields in the HCR appropriately.
758 */ 783 */
784 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
759 kvm_vcpu_kick(vcpu); 785 kvm_vcpu_kick(vcpu);
760 786
761 return 0; 787 return 0;
@@ -806,7 +832,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
806 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) 832 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
807 return -EINVAL; 833 return -EINVAL;
808 834
809 return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level); 835 return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
810 case KVM_ARM_IRQ_TYPE_SPI: 836 case KVM_ARM_IRQ_TYPE_SPI:
811 if (!irqchip_in_kernel(kvm)) 837 if (!irqchip_in_kernel(kvm))
812 return -ENXIO; 838 return -ENXIO;
@@ -814,7 +840,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
814 if (irq_num < VGIC_NR_PRIVATE_IRQS) 840 if (irq_num < VGIC_NR_PRIVATE_IRQS)
815 return -EINVAL; 841 return -EINVAL;
816 842
817 return kvm_vgic_inject_irq(kvm, 0, irq_num, level); 843 return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
818 } 844 }
819 845
820 return -EINVAL; 846 return -EINVAL;
@@ -884,7 +910,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
884 * Handle the "start in power-off" case. 910 * Handle the "start in power-off" case.
885 */ 911 */
886 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) 912 if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
887 vcpu->arch.power_off = true; 913 vcpu_power_off(vcpu);
888 else 914 else
889 vcpu->arch.power_off = false; 915 vcpu->arch.power_off = false;
890 916
@@ -1115,9 +1141,6 @@ static void cpu_init_hyp_mode(void *dummy)
1115 __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); 1141 __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
1116 __cpu_init_stage2(); 1142 __cpu_init_stage2();
1117 1143
1118 if (is_kernel_in_hyp_mode())
1119 kvm_timer_init_vhe();
1120
1121 kvm_arm_init_debug(); 1144 kvm_arm_init_debug();
1122} 1145}
1123 1146
@@ -1137,6 +1160,7 @@ static void cpu_hyp_reinit(void)
1137 * event was cancelled before the CPU was reset. 1160 * event was cancelled before the CPU was reset.
1138 */ 1161 */
1139 __cpu_init_stage2(); 1162 __cpu_init_stage2();
1163 kvm_timer_init_vhe();
1140 } else { 1164 } else {
1141 cpu_init_hyp_mode(NULL); 1165 cpu_init_hyp_mode(NULL);
1142 } 1166 }
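
The arm.c hunks above replace the open-coded power_off/pause checks with the generic VCPU request machinery: a requester calls kvm_make_request() and kvm_vcpu_kick(), while the VCPU thread publishes IN_GUEST_MODE with smp_store_mb() and only then performs the final kvm_request_pending() check. A minimal user-space analogue of that ordering, using C11 seq_cst atomics and purely illustrative names (this is a model, not kernel code), looks like:

/* User-space model of the request/kick pairing; the atomics stand in for
 * the kernel's barriers and the counter for the wake-up IPI. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE };

static atomic_int mode = OUTSIDE_GUEST_MODE;
static atomic_int requests;	/* plays the role of vcpu->requests */
static atomic_int kicks;	/* plays the role of kvm_vcpu_kick() */

static void *vcpu_thread(void *unused)
{
	(void)unused;
	/* smp_store_mb(vcpu->mode, IN_GUEST_MODE): publish the mode, then
	 * do the final request check before "entering the guest". */
	atomic_store(&mode, IN_GUEST_MODE);
	if (atomic_load(&requests))			/* kvm_request_pending() */
		atomic_store(&mode, OUTSIDE_GUEST_MODE);	/* abort guest entry */
	return NULL;
}

static void *requester_thread(void *unused)
{
	(void)unused;
	atomic_fetch_or(&requests, 1);			/* kvm_make_request() */
	if (atomic_load(&mode) == IN_GUEST_MODE)
		atomic_fetch_add(&kicks, 1);		/* send the kick/IPI */
	return NULL;
}

int main(void)
{
	pthread_t vcpu, req;

	pthread_create(&vcpu, NULL, vcpu_thread, NULL);
	pthread_create(&req, NULL, requester_thread, NULL);
	pthread_join(vcpu, NULL);
	pthread_join(req, NULL);
	/* Invariant: either the VCPU thread saw the request and aborted
	 * entry, or the requester saw IN_GUEST_MODE and sent a kick. */
	printf("requests=%d kicks=%d mode=%d\n", atomic_load(&requests),
	       atomic_load(&kicks), atomic_load(&mode));
	return 0;
}

Either the VCPU-side load observes the request and bails out before entering the guest, or the requester observes IN_GUEST_MODE and issues the kick; with the mode store placed after the checks, as in the pre-patch code, both sides could miss each other, which is what moving the store before the final check and turning it into smp_store_mb() prevents.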
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 87940364570b..91728faa13fd 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -19,10 +19,12 @@
19#include <linux/irqchip/arm-gic-v3.h> 19#include <linux/irqchip/arm-gic-v3.h>
20#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
21 21
22#include <asm/kvm_emulate.h>
22#include <asm/kvm_hyp.h> 23#include <asm/kvm_hyp.h>
23 24
24#define vtr_to_max_lr_idx(v) ((v) & 0xf) 25#define vtr_to_max_lr_idx(v) ((v) & 0xf)
25#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) 26#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
27#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
26 28
27static u64 __hyp_text __gic_v3_get_lr(unsigned int lr) 29static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
28{ 30{
@@ -118,6 +120,90 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
118 } 120 }
119} 121}
120 122
123static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
124{
125 switch (n) {
126 case 0:
127 write_gicreg(val, ICH_AP0R0_EL2);
128 break;
129 case 1:
130 write_gicreg(val, ICH_AP0R1_EL2);
131 break;
132 case 2:
133 write_gicreg(val, ICH_AP0R2_EL2);
134 break;
135 case 3:
136 write_gicreg(val, ICH_AP0R3_EL2);
137 break;
138 }
139}
140
141static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
142{
143 switch (n) {
144 case 0:
145 write_gicreg(val, ICH_AP1R0_EL2);
146 break;
147 case 1:
148 write_gicreg(val, ICH_AP1R1_EL2);
149 break;
150 case 2:
151 write_gicreg(val, ICH_AP1R2_EL2);
152 break;
153 case 3:
154 write_gicreg(val, ICH_AP1R3_EL2);
155 break;
156 }
157}
158
159static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
160{
161 u32 val;
162
163 switch (n) {
164 case 0:
165 val = read_gicreg(ICH_AP0R0_EL2);
166 break;
167 case 1:
168 val = read_gicreg(ICH_AP0R1_EL2);
169 break;
170 case 2:
171 val = read_gicreg(ICH_AP0R2_EL2);
172 break;
173 case 3:
174 val = read_gicreg(ICH_AP0R3_EL2);
175 break;
176 default:
177 unreachable();
178 }
179
180 return val;
181}
182
183static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
184{
185 u32 val;
186
187 switch (n) {
188 case 0:
189 val = read_gicreg(ICH_AP1R0_EL2);
190 break;
191 case 1:
192 val = read_gicreg(ICH_AP1R1_EL2);
193 break;
194 case 2:
195 val = read_gicreg(ICH_AP1R2_EL2);
196 break;
197 case 3:
198 val = read_gicreg(ICH_AP1R3_EL2);
199 break;
200 default:
201 unreachable();
202 }
203
204 return val;
205}
206
121void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) 207void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
122{ 208{
123 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; 209 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -154,24 +240,27 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
154 240
155 switch (nr_pre_bits) { 241 switch (nr_pre_bits) {
156 case 7: 242 case 7:
157 cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2); 243 cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
158 cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2); 244 cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
159 case 6: 245 case 6:
160 cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2); 246 cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
161 default: 247 default:
162 cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2); 248 cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
163 } 249 }
164 250
165 switch (nr_pre_bits) { 251 switch (nr_pre_bits) {
166 case 7: 252 case 7:
167 cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2); 253 cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
168 cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2); 254 cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
169 case 6: 255 case 6:
170 cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2); 256 cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
171 default: 257 default:
172 cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); 258 cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
173 } 259 }
174 } else { 260 } else {
261 if (static_branch_unlikely(&vgic_v3_cpuif_trap))
262 write_gicreg(0, ICH_HCR_EL2);
263
175 cpu_if->vgic_elrsr = 0xffff; 264 cpu_if->vgic_elrsr = 0xffff;
176 cpu_if->vgic_ap0r[0] = 0; 265 cpu_if->vgic_ap0r[0] = 0;
177 cpu_if->vgic_ap0r[1] = 0; 266 cpu_if->vgic_ap0r[1] = 0;
@@ -224,26 +313,34 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
224 313
225 switch (nr_pre_bits) { 314 switch (nr_pre_bits) {
226 case 7: 315 case 7:
227 write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2); 316 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
228 write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2); 317 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
229 case 6: 318 case 6:
230 write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2); 319 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
231 default: 320 default:
232 write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2); 321 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
233 } 322 }
234 323
235 switch (nr_pre_bits) { 324 switch (nr_pre_bits) {
236 case 7: 325 case 7:
237 write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2); 326 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
238 write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2); 327 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
239 case 6: 328 case 6:
240 write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2); 329 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
241 default: 330 default:
242 write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); 331 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
243 } 332 }
244 333
245 for (i = 0; i < used_lrs; i++) 334 for (i = 0; i < used_lrs; i++)
246 __gic_v3_set_lr(cpu_if->vgic_lr[i], i); 335 __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
336 } else {
337 /*
338 * If we need to trap system registers, we must write
339 * ICH_HCR_EL2 anyway, even if no interrupts are being
 340 * injected.
341 */
342 if (static_branch_unlikely(&vgic_v3_cpuif_trap))
343 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
247 } 344 }
248 345
249 /* 346 /*
@@ -287,3 +384,697 @@ void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
287{ 384{
288 write_gicreg(vmcr, ICH_VMCR_EL2); 385 write_gicreg(vmcr, ICH_VMCR_EL2);
289} 386}
387
388#ifdef CONFIG_ARM64
389
390static int __hyp_text __vgic_v3_bpr_min(void)
391{
392 /* See Pseudocode for VPriorityGroup */
393 return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
394}
395
396static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
397{
398 u32 esr = kvm_vcpu_get_hsr(vcpu);
399 u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
400
401 return crm != 8;
402}
403
404#define GICv3_IDLE_PRIORITY 0xff
405
406static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
407 u32 vmcr,
408 u64 *lr_val)
409{
410 unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
411 u8 priority = GICv3_IDLE_PRIORITY;
412 int i, lr = -1;
413
414 for (i = 0; i < used_lrs; i++) {
415 u64 val = __gic_v3_get_lr(i);
416 u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
417
418 /* Not pending in the state? */
419 if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
420 continue;
421
422 /* Group-0 interrupt, but Group-0 disabled? */
423 if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
424 continue;
425
426 /* Group-1 interrupt, but Group-1 disabled? */
427 if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
428 continue;
429
430 /* Not the highest priority? */
431 if (lr_prio >= priority)
432 continue;
433
434 /* This is a candidate */
435 priority = lr_prio;
436 *lr_val = val;
437 lr = i;
438 }
439
440 if (lr == -1)
441 *lr_val = ICC_IAR1_EL1_SPURIOUS;
442
443 return lr;
444}
445
446static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
447 int intid, u64 *lr_val)
448{
449 unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
450 int i;
451
452 for (i = 0; i < used_lrs; i++) {
453 u64 val = __gic_v3_get_lr(i);
454
455 if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
456 (val & ICH_LR_ACTIVE_BIT)) {
457 *lr_val = val;
458 return i;
459 }
460 }
461
462 *lr_val = ICC_IAR1_EL1_SPURIOUS;
463 return -1;
464}
465
466static int __hyp_text __vgic_v3_get_highest_active_priority(void)
467{
468 u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
469 u32 hap = 0;
470 int i;
471
472 for (i = 0; i < nr_apr_regs; i++) {
473 u32 val;
474
475 /*
476 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
477 * contain the active priority levels for this VCPU
478 * for the maximum number of supported priority
479 * levels, and we return the full priority level only
480 * if the BPR is programmed to its minimum, otherwise
481 * we return a combination of the priority level and
482 * subpriority, as determined by the setting of the
483 * BPR, but without the full subpriority.
484 */
485 val = __vgic_v3_read_ap0rn(i);
486 val |= __vgic_v3_read_ap1rn(i);
487 if (!val) {
488 hap += 32;
489 continue;
490 }
491
492 return (hap + __ffs(val)) << __vgic_v3_bpr_min();
493 }
494
495 return GICv3_IDLE_PRIORITY;
496}
497
498static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
499{
500 return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
501}
502
503static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
504{
505 unsigned int bpr;
506
507 if (vmcr & ICH_VMCR_CBPR_MASK) {
508 bpr = __vgic_v3_get_bpr0(vmcr);
509 if (bpr < 7)
510 bpr++;
511 } else {
512 bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
513 }
514
515 return bpr;
516}
517
518/*
519 * Convert a priority to a preemption level, taking the relevant BPR
520 * into account by zeroing the sub-priority bits.
521 */
522static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
523{
524 unsigned int bpr;
525
526 if (!grp)
527 bpr = __vgic_v3_get_bpr0(vmcr) + 1;
528 else
529 bpr = __vgic_v3_get_bpr1(vmcr);
530
531 return pri & (GENMASK(7, 0) << bpr);
532}
533
534/*
535 * The priority value is independent of any of the BPR values, so we
 536 * normalize it using the minimal BPR value. This guarantees that no
537 * matter what the guest does with its BPR, we can always set/get the
538 * same value of a priority.
539 */
540static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
541{
542 u8 pre, ap;
543 u32 val;
544 int apr;
545
546 pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
547 ap = pre >> __vgic_v3_bpr_min();
548 apr = ap / 32;
549
550 if (!grp) {
551 val = __vgic_v3_read_ap0rn(apr);
552 __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
553 } else {
554 val = __vgic_v3_read_ap1rn(apr);
555 __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
556 }
557}
558
559static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
560{
561 u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
562 u32 hap = 0;
563 int i;
564
565 for (i = 0; i < nr_apr_regs; i++) {
566 u32 ap0, ap1;
567 int c0, c1;
568
569 ap0 = __vgic_v3_read_ap0rn(i);
570 ap1 = __vgic_v3_read_ap1rn(i);
571 if (!ap0 && !ap1) {
572 hap += 32;
573 continue;
574 }
575
576 c0 = ap0 ? __ffs(ap0) : 32;
577 c1 = ap1 ? __ffs(ap1) : 32;
578
579 /* Always clear the LSB, which is the highest priority */
580 if (c0 < c1) {
581 ap0 &= ~BIT(c0);
582 __vgic_v3_write_ap0rn(ap0, i);
583 hap += c0;
584 } else {
585 ap1 &= ~BIT(c1);
586 __vgic_v3_write_ap1rn(ap1, i);
587 hap += c1;
588 }
589
590 /* Rescale to 8 bits of priority */
591 return hap << __vgic_v3_bpr_min();
592 }
593
594 return GICv3_IDLE_PRIORITY;
595}
596
597static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
598{
599 u64 lr_val;
600 u8 lr_prio, pmr;
601 int lr, grp;
602
603 grp = __vgic_v3_get_group(vcpu);
604
605 lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
606 if (lr < 0)
607 goto spurious;
608
609 if (grp != !!(lr_val & ICH_LR_GROUP))
610 goto spurious;
611
612 pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
613 lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
614 if (pmr <= lr_prio)
615 goto spurious;
616
617 if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
618 goto spurious;
619
620 lr_val &= ~ICH_LR_STATE;
621 /* No active state for LPIs */
622 if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
623 lr_val |= ICH_LR_ACTIVE_BIT;
624 __gic_v3_set_lr(lr_val, lr);
625 __vgic_v3_set_active_priority(lr_prio, vmcr, grp);
626 vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
627 return;
628
629spurious:
630 vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
631}
632
633static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
634{
635 lr_val &= ~ICH_LR_ACTIVE_BIT;
636 if (lr_val & ICH_LR_HW) {
637 u32 pid;
638
639 pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
640 gic_write_dir(pid);
641 }
642
643 __gic_v3_set_lr(lr_val, lr);
644}
645
646static void __hyp_text __vgic_v3_bump_eoicount(void)
647{
648 u32 hcr;
649
650 hcr = read_gicreg(ICH_HCR_EL2);
651 hcr += 1 << ICH_HCR_EOIcount_SHIFT;
652 write_gicreg(hcr, ICH_HCR_EL2);
653}
654
655static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
656 u32 vmcr, int rt)
657{
658 u32 vid = vcpu_get_reg(vcpu, rt);
659 u64 lr_val;
660 int lr;
661
662 /* EOImode == 0, nothing to be done here */
663 if (!(vmcr & ICH_VMCR_EOIM_MASK))
664 return;
665
666 /* No deactivate to be performed on an LPI */
667 if (vid >= VGIC_MIN_LPI)
668 return;
669
670 lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
671 if (lr == -1) {
672 __vgic_v3_bump_eoicount();
673 return;
674 }
675
676 __vgic_v3_clear_active_lr(lr, lr_val);
677}
678
679static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
680{
681 u32 vid = vcpu_get_reg(vcpu, rt);
682 u64 lr_val;
683 u8 lr_prio, act_prio;
684 int lr, grp;
685
686 grp = __vgic_v3_get_group(vcpu);
687
688 /* Drop priority in any case */
689 act_prio = __vgic_v3_clear_highest_active_priority();
690
691 /* If EOIing an LPI, no deactivate to be performed */
692 if (vid >= VGIC_MIN_LPI)
693 return;
694
695 /* EOImode == 1, nothing to be done here */
696 if (vmcr & ICH_VMCR_EOIM_MASK)
697 return;
698
699 lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
700 if (lr == -1) {
701 __vgic_v3_bump_eoicount();
702 return;
703 }
704
705 lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
706
707 /* If priorities or group do not match, the guest has fscked-up. */
708 if (grp != !!(lr_val & ICH_LR_GROUP) ||
709 __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
710 return;
711
712 /* Let's now perform the deactivation */
713 __vgic_v3_clear_active_lr(lr, lr_val);
714}
715
716static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
717{
718 vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
719}
720
721static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
722{
723 vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
724}
725
726static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
727{
728 u64 val = vcpu_get_reg(vcpu, rt);
729
730 if (val & 1)
731 vmcr |= ICH_VMCR_ENG0_MASK;
732 else
733 vmcr &= ~ICH_VMCR_ENG0_MASK;
734
735 __vgic_v3_write_vmcr(vmcr);
736}
737
738static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
739{
740 u64 val = vcpu_get_reg(vcpu, rt);
741
742 if (val & 1)
743 vmcr |= ICH_VMCR_ENG1_MASK;
744 else
745 vmcr &= ~ICH_VMCR_ENG1_MASK;
746
747 __vgic_v3_write_vmcr(vmcr);
748}
749
750static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
751{
752 vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
753}
754
755static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
756{
757 vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
758}
759
760static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
761{
762 u64 val = vcpu_get_reg(vcpu, rt);
763 u8 bpr_min = __vgic_v3_bpr_min() - 1;
764
765 /* Enforce BPR limiting */
766 if (val < bpr_min)
767 val = bpr_min;
768
769 val <<= ICH_VMCR_BPR0_SHIFT;
770 val &= ICH_VMCR_BPR0_MASK;
771 vmcr &= ~ICH_VMCR_BPR0_MASK;
772 vmcr |= val;
773
774 __vgic_v3_write_vmcr(vmcr);
775}
776
777static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
778{
779 u64 val = vcpu_get_reg(vcpu, rt);
780 u8 bpr_min = __vgic_v3_bpr_min();
781
782 if (vmcr & ICH_VMCR_CBPR_MASK)
783 return;
784
785 /* Enforce BPR limiting */
786 if (val < bpr_min)
787 val = bpr_min;
788
789 val <<= ICH_VMCR_BPR1_SHIFT;
790 val &= ICH_VMCR_BPR1_MASK;
791 vmcr &= ~ICH_VMCR_BPR1_MASK;
792 vmcr |= val;
793
794 __vgic_v3_write_vmcr(vmcr);
795}
796
797static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
798{
799 u32 val;
800
801 if (!__vgic_v3_get_group(vcpu))
802 val = __vgic_v3_read_ap0rn(n);
803 else
804 val = __vgic_v3_read_ap1rn(n);
805
806 vcpu_set_reg(vcpu, rt, val);
807}
808
809static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
810{
811 u32 val = vcpu_get_reg(vcpu, rt);
812
813 if (!__vgic_v3_get_group(vcpu))
814 __vgic_v3_write_ap0rn(val, n);
815 else
816 __vgic_v3_write_ap1rn(val, n);
817}
818
819static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
820 u32 vmcr, int rt)
821{
822 __vgic_v3_read_apxrn(vcpu, rt, 0);
823}
824
825static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
826 u32 vmcr, int rt)
827{
828 __vgic_v3_read_apxrn(vcpu, rt, 1);
829}
830
831static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
832 u32 vmcr, int rt)
833{
834 __vgic_v3_read_apxrn(vcpu, rt, 2);
835}
836
837static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
838 u32 vmcr, int rt)
839{
840 __vgic_v3_read_apxrn(vcpu, rt, 3);
841}
842
843static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
844 u32 vmcr, int rt)
845{
846 __vgic_v3_write_apxrn(vcpu, rt, 0);
847}
848
849static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
850 u32 vmcr, int rt)
851{
852 __vgic_v3_write_apxrn(vcpu, rt, 1);
853}
854
855static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
856 u32 vmcr, int rt)
857{
858 __vgic_v3_write_apxrn(vcpu, rt, 2);
859}
860
861static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
862 u32 vmcr, int rt)
863{
864 __vgic_v3_write_apxrn(vcpu, rt, 3);
865}
866
867static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
868 u32 vmcr, int rt)
869{
870 u64 lr_val;
871 int lr, lr_grp, grp;
872
873 grp = __vgic_v3_get_group(vcpu);
874
875 lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
876 if (lr == -1)
877 goto spurious;
878
879 lr_grp = !!(lr_val & ICH_LR_GROUP);
880 if (lr_grp != grp)
881 lr_val = ICC_IAR1_EL1_SPURIOUS;
882
883spurious:
884 vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
885}
886
887static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
888 u32 vmcr, int rt)
889{
890 vmcr &= ICH_VMCR_PMR_MASK;
891 vmcr >>= ICH_VMCR_PMR_SHIFT;
892 vcpu_set_reg(vcpu, rt, vmcr);
893}
894
895static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
896 u32 vmcr, int rt)
897{
898 u32 val = vcpu_get_reg(vcpu, rt);
899
900 val <<= ICH_VMCR_PMR_SHIFT;
901 val &= ICH_VMCR_PMR_MASK;
902 vmcr &= ~ICH_VMCR_PMR_MASK;
903 vmcr |= val;
904
905 write_gicreg(vmcr, ICH_VMCR_EL2);
906}
907
908static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
909 u32 vmcr, int rt)
910{
911 u32 val = __vgic_v3_get_highest_active_priority();
912 vcpu_set_reg(vcpu, rt, val);
913}
914
915static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
916 u32 vmcr, int rt)
917{
918 u32 vtr, val;
919
920 vtr = read_gicreg(ICH_VTR_EL2);
921 /* PRIbits */
922 val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
923 /* IDbits */
924 val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
925 /* SEIS */
926 val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
927 /* A3V */
928 val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
929 /* EOImode */
930 val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
931 /* CBPR */
932 val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
933
934 vcpu_set_reg(vcpu, rt, val);
935}
936
937static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
938 u32 vmcr, int rt)
939{
940 u32 val = vcpu_get_reg(vcpu, rt);
941
942 if (val & ICC_CTLR_EL1_CBPR_MASK)
943 vmcr |= ICH_VMCR_CBPR_MASK;
944 else
945 vmcr &= ~ICH_VMCR_CBPR_MASK;
946
947 if (val & ICC_CTLR_EL1_EOImode_MASK)
948 vmcr |= ICH_VMCR_EOIM_MASK;
949 else
950 vmcr &= ~ICH_VMCR_EOIM_MASK;
951
952 write_gicreg(vmcr, ICH_VMCR_EL2);
953}
954
955int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
956{
957 int rt;
958 u32 esr;
959 u32 vmcr;
960 void (*fn)(struct kvm_vcpu *, u32, int);
961 bool is_read;
962 u32 sysreg;
963
964 esr = kvm_vcpu_get_hsr(vcpu);
965 if (vcpu_mode_is_32bit(vcpu)) {
966 if (!kvm_condition_valid(vcpu))
967 return 1;
968
969 sysreg = esr_cp15_to_sysreg(esr);
970 } else {
971 sysreg = esr_sys64_to_sysreg(esr);
972 }
973
974 is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
975
976 switch (sysreg) {
977 case SYS_ICC_IAR0_EL1:
978 case SYS_ICC_IAR1_EL1:
979 if (unlikely(!is_read))
980 return 0;
981 fn = __vgic_v3_read_iar;
982 break;
983 case SYS_ICC_EOIR0_EL1:
984 case SYS_ICC_EOIR1_EL1:
985 if (unlikely(is_read))
986 return 0;
987 fn = __vgic_v3_write_eoir;
988 break;
989 case SYS_ICC_IGRPEN1_EL1:
990 if (is_read)
991 fn = __vgic_v3_read_igrpen1;
992 else
993 fn = __vgic_v3_write_igrpen1;
994 break;
995 case SYS_ICC_BPR1_EL1:
996 if (is_read)
997 fn = __vgic_v3_read_bpr1;
998 else
999 fn = __vgic_v3_write_bpr1;
1000 break;
1001 case SYS_ICC_AP0Rn_EL1(0):
1002 case SYS_ICC_AP1Rn_EL1(0):
1003 if (is_read)
1004 fn = __vgic_v3_read_apxr0;
1005 else
1006 fn = __vgic_v3_write_apxr0;
1007 break;
1008 case SYS_ICC_AP0Rn_EL1(1):
1009 case SYS_ICC_AP1Rn_EL1(1):
1010 if (is_read)
1011 fn = __vgic_v3_read_apxr1;
1012 else
1013 fn = __vgic_v3_write_apxr1;
1014 break;
1015 case SYS_ICC_AP0Rn_EL1(2):
1016 case SYS_ICC_AP1Rn_EL1(2):
1017 if (is_read)
1018 fn = __vgic_v3_read_apxr2;
1019 else
1020 fn = __vgic_v3_write_apxr2;
1021 break;
1022 case SYS_ICC_AP0Rn_EL1(3):
1023 case SYS_ICC_AP1Rn_EL1(3):
1024 if (is_read)
1025 fn = __vgic_v3_read_apxr3;
1026 else
1027 fn = __vgic_v3_write_apxr3;
1028 break;
1029 case SYS_ICC_HPPIR0_EL1:
1030 case SYS_ICC_HPPIR1_EL1:
1031 if (unlikely(!is_read))
1032 return 0;
1033 fn = __vgic_v3_read_hppir;
1034 break;
1035 case SYS_ICC_IGRPEN0_EL1:
1036 if (is_read)
1037 fn = __vgic_v3_read_igrpen0;
1038 else
1039 fn = __vgic_v3_write_igrpen0;
1040 break;
1041 case SYS_ICC_BPR0_EL1:
1042 if (is_read)
1043 fn = __vgic_v3_read_bpr0;
1044 else
1045 fn = __vgic_v3_write_bpr0;
1046 break;
1047 case SYS_ICC_DIR_EL1:
1048 if (unlikely(is_read))
1049 return 0;
1050 fn = __vgic_v3_write_dir;
1051 break;
1052 case SYS_ICC_RPR_EL1:
1053 if (unlikely(!is_read))
1054 return 0;
1055 fn = __vgic_v3_read_rpr;
1056 break;
1057 case SYS_ICC_CTLR_EL1:
1058 if (is_read)
1059 fn = __vgic_v3_read_ctlr;
1060 else
1061 fn = __vgic_v3_write_ctlr;
1062 break;
1063 case SYS_ICC_PMR_EL1:
1064 if (is_read)
1065 fn = __vgic_v3_read_pmr;
1066 else
1067 fn = __vgic_v3_write_pmr;
1068 break;
1069 default:
1070 return 0;
1071 }
1072
1073 vmcr = __vgic_v3_read_vmcr();
1074 rt = kvm_vcpu_sys_get_rt(vcpu);
1075 fn(vcpu, vmcr, rt);
1076
1077 return 1;
1078}
1079
1080#endif
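
The __vgic_v3_perform_cpuif_access() handler above is the entry point for the new GICv3 sysreg trapping: it decodes the trapped ICC_* access from the ESR, checks the access direction, dispatches to one of the read/write emulation helpers that operate on ICH_VMCR_EL2 and the list registers, and returns 1 when the access was emulated (0 hands it back to the normal exit path). A minimal userspace model of that decode-and-dispatch shape, with invented encodings (fake_esr, FAKE_SYS_*) rather than the kernel's:

  /*
   * Minimal model of the decode-and-dispatch shape used by
   * __vgic_v3_perform_cpuif_access() above.  All names and bit layouts here
   * are illustrative only, not the kernel's.
   */
  #include <stdio.h>
  #include <stdint.h>

  #define DIR_READ     1u          /* direction bit: 1 = read (MRS), 0 = write (MSR) */
  #define SYSREG_SHIFT 1

  enum { FAKE_SYS_PMR = 0x10 };

  static uint64_t regfile[32];     /* stand-in for the guest GPRs */
  static uint32_t vmcr = 0x00f0;   /* stand-in for ICH_VMCR_EL2 */

  static void read_pmr(int rt)  { regfile[rt] = (vmcr >> 4) & 0xff; }
  static void write_pmr(int rt) { vmcr = (vmcr & ~0xff0u) | ((regfile[rt] & 0xff) << 4); }

  /* Returns 1 if the access was emulated, 0 if it should be handled elsewhere. */
  static int perform_cpuif_access(uint32_t fake_esr, int rt)
  {
          int is_read = fake_esr & DIR_READ;
          uint32_t sysreg = fake_esr >> SYSREG_SHIFT;
          void (*fn)(int);

          switch (sysreg) {
          case FAKE_SYS_PMR:
                  fn = is_read ? read_pmr : write_pmr;
                  break;
          default:
                  return 0;
          }

          fn(rt);
          return 1;
  }

  int main(void)
  {
          regfile[3] = 0x80;
          perform_cpuif_access((FAKE_SYS_PMR << SYSREG_SHIFT) | 0, 3); /* write PMR from r3 */
          perform_cpuif_access((FAKE_SYS_PMR << SYSREG_SHIFT) | 1, 5); /* read PMR into r5 */
          printf("vmcr=%#x r5=%#llx\n", vmcr, (unsigned long long)regfile[5]);
          return 0;
  }
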
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 1c44aa35f909..0e1fc75f3585 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -20,6 +20,7 @@
20#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
21#include <linux/io.h> 21#include <linux/io.h>
22#include <linux/hugetlb.h> 22#include <linux/hugetlb.h>
23#include <linux/sched/signal.h>
23#include <trace/events/kvm.h> 24#include <trace/events/kvm.h>
24#include <asm/pgalloc.h> 25#include <asm/pgalloc.h>
25#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
@@ -1262,6 +1263,24 @@ static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
1262 __coherent_cache_guest_page(vcpu, pfn, size); 1263 __coherent_cache_guest_page(vcpu, pfn, size);
1263} 1264}
1264 1265
1266static void kvm_send_hwpoison_signal(unsigned long address,
1267 struct vm_area_struct *vma)
1268{
1269 siginfo_t info;
1270
1271 info.si_signo = SIGBUS;
1272 info.si_errno = 0;
1273 info.si_code = BUS_MCEERR_AR;
1274 info.si_addr = (void __user *)address;
1275
1276 if (is_vm_hugetlb_page(vma))
1277 info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
1278 else
1279 info.si_addr_lsb = PAGE_SHIFT;
1280
1281 send_sig_info(SIGBUS, &info, current);
1282}
1283
1265static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1284static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1266 struct kvm_memory_slot *memslot, unsigned long hva, 1285 struct kvm_memory_slot *memslot, unsigned long hva,
1267 unsigned long fault_status) 1286 unsigned long fault_status)
@@ -1331,6 +1350,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1331 smp_rmb(); 1350 smp_rmb();
1332 1351
1333 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 1352 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
1353 if (pfn == KVM_PFN_ERR_HWPOISON) {
1354 kvm_send_hwpoison_signal(hva, vma);
1355 return 0;
1356 }
1334 if (is_error_noslot_pfn(pfn)) 1357 if (is_error_noslot_pfn(pfn))
1335 return -EFAULT; 1358 return -EFAULT;
1336 1359
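
kvm_send_hwpoison_signal() above is how a hardware-poisoned guest page is now reported: instead of failing the stage-2 fault, KVM sends the faulting VMM thread a SIGBUS with si_code BUS_MCEERR_AR, si_addr set to the faulting userspace address and si_addr_lsb giving the poisoned granule (PAGE_SHIFT, or the hugepage shift for hugetlb mappings). A VMM that wants to react needs an SA_SIGINFO handler; a minimal sketch follows (may need _GNU_SOURCE depending on the libc, and a real VMM would record the address and emulate a memory error for the guest rather than exit):

  /*
   * Minimal sketch of a VMM-side SIGBUS handler for the BUS_MCEERR_AR
   * signals sent by kvm_send_hwpoison_signal() above.
   */
  #define _GNU_SOURCE
  #include <signal.h>
  #include <stdio.h>
  #include <unistd.h>

  static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
  {
          (void)sig; (void)ctx;

          if (si->si_code == BUS_MCEERR_AR) {
                  /* si_addr_lsb is PAGE_SHIFT, or the huge page shift for hugetlb. */
                  fprintf(stderr, "hwpoison at %p, granule 2^%d bytes\n",
                          si->si_addr, (int)si->si_addr_lsb);
                  /* A real VMM would translate this back to a GPA and inject an error. */
                  _exit(1);
          }
  }

  int main(void)
  {
          struct sigaction sa = {
                  .sa_sigaction = sigbus_handler,
                  .sa_flags = SA_SIGINFO,
          };

          sigaction(SIGBUS, &sa, NULL);
          pause();        /* a VMM would be sitting in KVM_RUN here */
          return 0;
  }
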
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 4b43e7f3b158..fc8a723ff387 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -203,6 +203,24 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
203 return reg; 203 return reg;
204} 204}
205 205
206static void kvm_pmu_check_overflow(struct kvm_vcpu *vcpu)
207{
208 struct kvm_pmu *pmu = &vcpu->arch.pmu;
209 bool overflow = !!kvm_pmu_overflow_status(vcpu);
210
211 if (pmu->irq_level == overflow)
212 return;
213
214 pmu->irq_level = overflow;
215
216 if (likely(irqchip_in_kernel(vcpu->kvm))) {
217 int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
218 pmu->irq_num, overflow,
219 &vcpu->arch.pmu);
220 WARN_ON(ret);
221 }
222}
223
206/** 224/**
207 * kvm_pmu_overflow_set - set PMU overflow interrupt 225 * kvm_pmu_overflow_set - set PMU overflow interrupt
208 * @vcpu: The vcpu pointer 226 * @vcpu: The vcpu pointer
@@ -210,37 +228,18 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
210 */ 228 */
211void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) 229void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val)
212{ 230{
213 u64 reg;
214
215 if (val == 0) 231 if (val == 0)
216 return; 232 return;
217 233
218 vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= val; 234 vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= val;
219 reg = kvm_pmu_overflow_status(vcpu); 235 kvm_pmu_check_overflow(vcpu);
220 if (reg != 0)
221 kvm_vcpu_kick(vcpu);
222} 236}
223 237
224static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) 238static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
225{ 239{
226 struct kvm_pmu *pmu = &vcpu->arch.pmu;
227 bool overflow;
228
229 if (!kvm_arm_pmu_v3_ready(vcpu)) 240 if (!kvm_arm_pmu_v3_ready(vcpu))
230 return; 241 return;
231 242 kvm_pmu_check_overflow(vcpu);
232 overflow = !!kvm_pmu_overflow_status(vcpu);
233 if (pmu->irq_level == overflow)
234 return;
235
236 pmu->irq_level = overflow;
237
238 if (likely(irqchip_in_kernel(vcpu->kvm))) {
239 int ret;
240 ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
241 pmu->irq_num, overflow);
242 WARN_ON(ret);
243 }
244} 243}
245 244
246bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) 245bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
@@ -451,34 +450,74 @@ bool kvm_arm_support_pmu_v3(void)
451 return (perf_num_counters() > 0); 450 return (perf_num_counters() > 0);
452} 451}
453 452
454static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) 453int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
455{ 454{
456 if (!kvm_arm_support_pmu_v3()) 455 if (!vcpu->arch.pmu.created)
457 return -ENODEV; 456 return 0;
458 457
459 /* 458 /*
460 * We currently require an in-kernel VGIC to use the PMU emulation, 459 * A valid interrupt configuration for the PMU is either to have a
461 * because we do not support forwarding PMU overflow interrupts to 460 * properly configured interrupt number and using an in-kernel
462 * userspace yet. 461 * irqchip, or to not have an in-kernel GIC and not set an IRQ.
463 */ 462 */
464 if (!irqchip_in_kernel(vcpu->kvm) || !vgic_initialized(vcpu->kvm)) 463 if (irqchip_in_kernel(vcpu->kvm)) {
464 int irq = vcpu->arch.pmu.irq_num;
465 if (!kvm_arm_pmu_irq_initialized(vcpu))
466 return -EINVAL;
467
468 /*
469 * If we are using an in-kernel vgic, at this point we know
470 * the vgic will be initialized, so we can check the PMU irq
471 * number against the dimensions of the vgic and make sure
472 * it's valid.
473 */
474 if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
475 return -EINVAL;
476 } else if (kvm_arm_pmu_irq_initialized(vcpu)) {
477 return -EINVAL;
478 }
479
480 kvm_pmu_vcpu_reset(vcpu);
481 vcpu->arch.pmu.ready = true;
482
483 return 0;
484}
485
486static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
487{
488 if (!kvm_arm_support_pmu_v3())
465 return -ENODEV; 489 return -ENODEV;
466 490
467 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features) || 491 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
468 !kvm_arm_pmu_irq_initialized(vcpu))
469 return -ENXIO; 492 return -ENXIO;
470 493
471 if (kvm_arm_pmu_v3_ready(vcpu)) 494 if (vcpu->arch.pmu.created)
472 return -EBUSY; 495 return -EBUSY;
473 496
474 kvm_pmu_vcpu_reset(vcpu); 497 if (irqchip_in_kernel(vcpu->kvm)) {
475 vcpu->arch.pmu.ready = true; 498 int ret;
499
500 /*
501 * If using the PMU with an in-kernel virtual GIC
502 * implementation, we require the GIC to be already
503 * initialized when initializing the PMU.
504 */
505 if (!vgic_initialized(vcpu->kvm))
506 return -ENODEV;
507
508 if (!kvm_arm_pmu_irq_initialized(vcpu))
509 return -ENXIO;
476 510
511 ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
512 &vcpu->arch.pmu);
513 if (ret)
514 return ret;
515 }
516
517 vcpu->arch.pmu.created = true;
477 return 0; 518 return 0;
478} 519}
479 520
480#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
481
482/* 521/*
483 * For one VM the interrupt type must be same for each vcpu. 522 * For one VM the interrupt type must be same for each vcpu.
484 * As a PPI, the interrupt number is the same for all vcpus, 523 * As a PPI, the interrupt number is the same for all vcpus,
@@ -512,6 +551,9 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
512 int __user *uaddr = (int __user *)(long)attr->addr; 551 int __user *uaddr = (int __user *)(long)attr->addr;
513 int irq; 552 int irq;
514 553
554 if (!irqchip_in_kernel(vcpu->kvm))
555 return -EINVAL;
556
515 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) 557 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
516 return -ENODEV; 558 return -ENODEV;
517 559
@@ -519,7 +561,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
519 return -EFAULT; 561 return -EFAULT;
520 562
521 /* The PMU overflow interrupt can be a PPI or a valid SPI. */ 563 /* The PMU overflow interrupt can be a PPI or a valid SPI. */
522 if (!(irq_is_ppi(irq) || vgic_valid_spi(vcpu->kvm, irq))) 564 if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
523 return -EINVAL; 565 return -EINVAL;
524 566
525 if (!pmu_irq_is_valid(vcpu->kvm, irq)) 567 if (!pmu_irq_is_valid(vcpu->kvm, irq))
@@ -546,6 +588,9 @@ int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
546 int __user *uaddr = (int __user *)(long)attr->addr; 588 int __user *uaddr = (int __user *)(long)attr->addr;
547 int irq; 589 int irq;
548 590
591 if (!irqchip_in_kernel(vcpu->kvm))
592 return -EINVAL;
593
549 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) 594 if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
550 return -ENODEV; 595 return -ENODEV;
551 596
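
On the userspace side the PMU is still configured through the vcpu device-attribute ioctls; what changes above is the validation: the IRQ attribute is now rejected without an in-kernel irqchip, the SPI range check no longer depends on the vgic being initialized, and kvm_arm_pmu_v3_enable() re-checks the whole configuration before the vcpu first runs. A hedged sketch of the usual sequence, assuming vcpu_fd was created with the KVM_ARM_VCPU_PMU_V3 feature and that an in-kernel GIC exists:

  /*
   * Sketch of the userspace side: pick the PMU overflow interrupt and then
   * latch the PMU configuration via the vcpu device-attribute ioctls.
   */
  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static int arm_pmu_setup(int vcpu_fd, int overflow_irq)
  {
          struct kvm_device_attr attr = {
                  .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                  .attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
                  .addr  = (__u64)(unsigned long)&overflow_irq,
          };

          /* Tell KVM which PPI/SPI the virtual PMU should raise. */
          if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                  return -1;

          /* Then finalize; after this the interrupt number cannot change. */
          attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
          attr.addr = 0;
          if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                  return -1;

          return 0;
  }
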
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index a08d7a93aebb..f1e363bab5e8 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -57,6 +57,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
57 * for KVM will preserve the register state. 57 * for KVM will preserve the register state.
58 */ 58 */
59 kvm_vcpu_block(vcpu); 59 kvm_vcpu_block(vcpu);
60 kvm_clear_request(KVM_REQ_UNHALT, vcpu);
60 61
61 return PSCI_RET_SUCCESS; 62 return PSCI_RET_SUCCESS;
62} 63}
@@ -64,6 +65,8 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
64static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) 65static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
65{ 66{
66 vcpu->arch.power_off = true; 67 vcpu->arch.power_off = true;
68 kvm_make_request(KVM_REQ_SLEEP, vcpu);
69 kvm_vcpu_kick(vcpu);
67} 70}
68 71
69static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) 72static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@@ -178,10 +181,9 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
178 * after this call is handled and before the VCPUs have been 181 * after this call is handled and before the VCPUs have been
179 * re-initialized. 182 * re-initialized.
180 */ 183 */
181 kvm_for_each_vcpu(i, tmp, vcpu->kvm) { 184 kvm_for_each_vcpu(i, tmp, vcpu->kvm)
182 tmp->arch.power_off = true; 185 tmp->arch.power_off = true;
183 kvm_vcpu_kick(tmp); 186 kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
184 }
185 187
186 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); 188 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
187 vcpu->run->system_event.type = type; 189 vcpu->run->system_event.type = type;
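
The psci.c hunks convert PSCI CPU_OFF and SYSTEM_OFF/RESET to the VCPU-request pattern: instead of only kicking the target, the caller raises KVM_REQ_SLEEP so the request cannot be lost in a race with guest entry (the suspend path also drops the KVM_REQ_UNHALT left behind by kvm_vcpu_block()), and the vcpu loop checks and clears the bit before running the guest. A tiny userspace model of that make-request/check-request hand-off, with made-up names:

  /*
   * Tiny userspace model of the make-request / check-request hand-off that
   * KVM_REQ_SLEEP relies on; the kernel uses kvm_make_request(),
   * kvm_check_request() and a kick, these names are made up.
   */
  #include <stdatomic.h>
  #include <stdio.h>
  #include <pthread.h>
  #include <unistd.h>

  #define REQ_SLEEP 0

  static atomic_uint requests;

  static void make_request(int req) { atomic_fetch_or(&requests, 1u << req); }

  static int check_request(int req) /* test and clear, like kvm_check_request() */
  {
          unsigned int bit = 1u << req;

          if (!(atomic_load(&requests) & bit))
                  return 0;
          atomic_fetch_and(&requests, ~bit);
          return 1;
  }

  static void *vcpu_thread(void *arg)
  {
          (void)arg;
          for (int i = 0; i < 5; i++) {
                  if (check_request(REQ_SLEEP))
                          printf("vcpu: sleeping instead of entering the guest\n");
                  else
                          printf("vcpu: entering guest\n");
                  usleep(100 * 1000);
          }
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, vcpu_thread, NULL);
          usleep(150 * 1000);
          make_request(REQ_SLEEP);        /* what kvm_psci_vcpu_off() now does */
          pthread_join(t, NULL);
          return 0;
  }
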
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
index f138ed2e9c63..b7baf581611a 100644
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ b/virt/kvm/arm/vgic/vgic-irqfd.c
@@ -34,7 +34,7 @@ static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
34 34
35 if (!vgic_valid_spi(kvm, spi_id)) 35 if (!vgic_valid_spi(kvm, spi_id))
36 return -EINVAL; 36 return -EINVAL;
37 return kvm_vgic_inject_irq(kvm, 0, spi_id, level); 37 return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
38} 38}
39 39
40/** 40/**
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 63e0bbdcddcc..37522e65eb53 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -308,34 +308,36 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
308 vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12, 308 vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc, 12,
309 VGIC_ACCESS_32bit), 309 VGIC_ACCESS_32bit),
310 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP, 310 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
311 vgic_mmio_read_rao, vgic_mmio_write_wi, 1, 311 vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1,
312 VGIC_ACCESS_32bit), 312 VGIC_ACCESS_32bit),
313 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, 313 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
314 vgic_mmio_read_enable, vgic_mmio_write_senable, 1, 314 vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
315 VGIC_ACCESS_32bit), 315 VGIC_ACCESS_32bit),
316 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, 316 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
317 vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, 317 vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
318 VGIC_ACCESS_32bit), 318 VGIC_ACCESS_32bit),
319 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, 319 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
320 vgic_mmio_read_pending, vgic_mmio_write_spending, 1, 320 vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
321 VGIC_ACCESS_32bit), 321 VGIC_ACCESS_32bit),
322 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, 322 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
323 vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, 323 vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1,
324 VGIC_ACCESS_32bit), 324 VGIC_ACCESS_32bit),
325 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, 325 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
326 vgic_mmio_read_active, vgic_mmio_write_sactive, 1, 326 vgic_mmio_read_active, vgic_mmio_write_sactive,
327 NULL, vgic_mmio_uaccess_write_sactive, 1,
327 VGIC_ACCESS_32bit), 328 VGIC_ACCESS_32bit),
328 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, 329 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
329 vgic_mmio_read_active, vgic_mmio_write_cactive, 1, 330 vgic_mmio_read_active, vgic_mmio_write_cactive,
331 NULL, vgic_mmio_uaccess_write_cactive, 1,
330 VGIC_ACCESS_32bit), 332 VGIC_ACCESS_32bit),
331 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, 333 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
332 vgic_mmio_read_priority, vgic_mmio_write_priority, 8, 334 vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
333 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 335 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
334 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET, 336 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
335 vgic_mmio_read_target, vgic_mmio_write_target, 8, 337 vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
336 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 338 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
337 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG, 339 REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
338 vgic_mmio_read_config, vgic_mmio_write_config, 2, 340 vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
339 VGIC_ACCESS_32bit), 341 VGIC_ACCESS_32bit),
340 REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT, 342 REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
341 vgic_mmio_read_raz, vgic_mmio_write_sgir, 4, 343 vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 201d5e2e973d..714fa3933546 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -456,11 +456,13 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
456 vgic_mmio_read_raz, vgic_mmio_write_wi, 1, 456 vgic_mmio_read_raz, vgic_mmio_write_wi, 1,
457 VGIC_ACCESS_32bit), 457 VGIC_ACCESS_32bit),
458 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, 458 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
459 vgic_mmio_read_active, vgic_mmio_write_sactive, NULL, NULL, 1, 459 vgic_mmio_read_active, vgic_mmio_write_sactive,
460 NULL, vgic_mmio_uaccess_write_sactive, 1,
460 VGIC_ACCESS_32bit), 461 VGIC_ACCESS_32bit),
461 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, 462 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
462 vgic_mmio_read_active, vgic_mmio_write_cactive, NULL, NULL, 1, 463 vgic_mmio_read_active, vgic_mmio_write_cactive,
463 VGIC_ACCESS_32bit), 464 NULL, vgic_mmio_uaccess_write_cactive,
465 1, VGIC_ACCESS_32bit),
464 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, 466 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
465 vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, 467 vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
466 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 468 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
@@ -526,12 +528,14 @@ static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
526 vgic_mmio_read_pending, vgic_mmio_write_cpending, 528 vgic_mmio_read_pending, vgic_mmio_write_cpending,
527 vgic_mmio_read_raz, vgic_mmio_write_wi, 4, 529 vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
528 VGIC_ACCESS_32bit), 530 VGIC_ACCESS_32bit),
529 REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0, 531 REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0,
530 vgic_mmio_read_active, vgic_mmio_write_sactive, 4, 532 vgic_mmio_read_active, vgic_mmio_write_sactive,
531 VGIC_ACCESS_32bit), 533 NULL, vgic_mmio_uaccess_write_sactive,
532 REGISTER_DESC_WITH_LENGTH(GICR_ICACTIVER0, 534 4, VGIC_ACCESS_32bit),
533 vgic_mmio_read_active, vgic_mmio_write_cactive, 4, 535 REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICACTIVER0,
534 VGIC_ACCESS_32bit), 536 vgic_mmio_read_active, vgic_mmio_write_cactive,
537 NULL, vgic_mmio_uaccess_write_cactive,
538 4, VGIC_ACCESS_32bit),
535 REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0, 539 REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0,
536 vgic_mmio_read_priority, vgic_mmio_write_priority, 32, 540 vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
537 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 541 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 1c17b2a2f105..c1e4bdd66131 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -231,56 +231,94 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
231 * be migrated while we don't hold the IRQ locks and we don't want to be 231 * be migrated while we don't hold the IRQ locks and we don't want to be
232 * chasing moving targets. 232 * chasing moving targets.
233 * 233 *
234 * For private interrupts, we only have to make sure the single and only VCPU 234 * For private interrupts we don't have to do anything because userspace
235 * that can potentially queue the IRQ is stopped. 235 * accesses to the VGIC state already require all VCPUs to be stopped, and
236 * only the VCPU itself can modify its private interrupts active state, which
237 * guarantees that the VCPU is not running.
236 */ 238 */
237static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) 239static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
238{ 240{
239 if (intid < VGIC_NR_PRIVATE_IRQS) 241 if (intid > VGIC_NR_PRIVATE_IRQS)
240 kvm_arm_halt_vcpu(vcpu);
241 else
242 kvm_arm_halt_guest(vcpu->kvm); 242 kvm_arm_halt_guest(vcpu->kvm);
243} 243}
244 244
245/* See vgic_change_active_prepare */ 245/* See vgic_change_active_prepare */
246static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) 246static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
247{ 247{
248 if (intid < VGIC_NR_PRIVATE_IRQS) 248 if (intid > VGIC_NR_PRIVATE_IRQS)
249 kvm_arm_resume_vcpu(vcpu);
250 else
251 kvm_arm_resume_guest(vcpu->kvm); 249 kvm_arm_resume_guest(vcpu->kvm);
252} 250}
253 251
254void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, 252static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
255 gpa_t addr, unsigned int len, 253 gpa_t addr, unsigned int len,
256 unsigned long val) 254 unsigned long val)
257{ 255{
258 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 256 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
259 int i; 257 int i;
260 258
261 vgic_change_active_prepare(vcpu, intid);
262 for_each_set_bit(i, &val, len * 8) { 259 for_each_set_bit(i, &val, len * 8) {
263 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 260 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
264 vgic_mmio_change_active(vcpu, irq, false); 261 vgic_mmio_change_active(vcpu, irq, false);
265 vgic_put_irq(vcpu->kvm, irq); 262 vgic_put_irq(vcpu->kvm, irq);
266 } 263 }
267 vgic_change_active_finish(vcpu, intid);
268} 264}
269 265
270void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, 266void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
271 gpa_t addr, unsigned int len, 267 gpa_t addr, unsigned int len,
272 unsigned long val) 268 unsigned long val)
273{ 269{
274 u32 intid = VGIC_ADDR_TO_INTID(addr, 1); 270 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
275 int i;
276 271
272 mutex_lock(&vcpu->kvm->lock);
277 vgic_change_active_prepare(vcpu, intid); 273 vgic_change_active_prepare(vcpu, intid);
274
275 __vgic_mmio_write_cactive(vcpu, addr, len, val);
276
277 vgic_change_active_finish(vcpu, intid);
278 mutex_unlock(&vcpu->kvm->lock);
279}
280
281void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
282 gpa_t addr, unsigned int len,
283 unsigned long val)
284{
285 __vgic_mmio_write_cactive(vcpu, addr, len, val);
286}
287
288static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
289 gpa_t addr, unsigned int len,
290 unsigned long val)
291{
292 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
293 int i;
294
278 for_each_set_bit(i, &val, len * 8) { 295 for_each_set_bit(i, &val, len * 8) {
279 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 296 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
280 vgic_mmio_change_active(vcpu, irq, true); 297 vgic_mmio_change_active(vcpu, irq, true);
281 vgic_put_irq(vcpu->kvm, irq); 298 vgic_put_irq(vcpu->kvm, irq);
282 } 299 }
300}
301
302void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
303 gpa_t addr, unsigned int len,
304 unsigned long val)
305{
306 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
307
308 mutex_lock(&vcpu->kvm->lock);
309 vgic_change_active_prepare(vcpu, intid);
310
311 __vgic_mmio_write_sactive(vcpu, addr, len, val);
312
283 vgic_change_active_finish(vcpu, intid); 313 vgic_change_active_finish(vcpu, intid);
314 mutex_unlock(&vcpu->kvm->lock);
315}
316
317void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
318 gpa_t addr, unsigned int len,
319 unsigned long val)
320{
321 __vgic_mmio_write_sactive(vcpu, addr, len, val);
284} 322}
285 323
286unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, 324unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
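
The rework above splits each active-state handler into a shared worker plus two entry points: the guest MMIO path now serializes on kvm->lock (which becomes the top of the locking order, see the vgic.c hunk later in this diff) and stops the whole guest before touching active state, while the new uaccess_* path is used for userspace accesses, which already run with every VCPU stopped, so it calls the worker directly. A stripped-down model of that shape, names invented:

  /*
   * Shape of the split introduced above, modeled with invented names: one
   * shared worker, a guest-MMIO wrapper that serializes and stops vcpus,
   * and a uaccess wrapper that relies on the caller having stopped them.
   */
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;
  static bool irq_active[64];

  static void change_active(unsigned int intid, bool active)
  {
          irq_active[intid] = active;     /* the real helper walks each set bit */
  }

  static void mmio_write_sactive(unsigned int intid)
  {
          pthread_mutex_lock(&vm_lock);   /* like mutex_lock(&vcpu->kvm->lock) */
          /* vgic_change_active_prepare(): halt the guest for shared interrupts */
          change_active(intid, true);
          /* vgic_change_active_finish(): resume the guest */
          pthread_mutex_unlock(&vm_lock);
  }

  static void uaccess_write_sactive(unsigned int intid)
  {
          /* Userspace accesses already run with every VCPU stopped. */
          change_active(intid, true);
  }

  int main(void)
  {
          mmio_write_sactive(35);
          uaccess_write_sactive(17);
          printf("irq 35 active=%d, irq 17 active=%d\n", irq_active[35], irq_active[17]);
          return 0;
  }
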
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index ea4171acdef3..5693f6df45ec 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -75,7 +75,7 @@ extern struct kvm_io_device_ops kvm_io_gic_ops;
75 * The _WITH_LENGTH version instantiates registers with a fixed length 75 * The _WITH_LENGTH version instantiates registers with a fixed length
76 * and is mutually exclusive with the _PER_IRQ version. 76 * and is mutually exclusive with the _PER_IRQ version.
77 */ 77 */
78#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, bpi, acc) \ 78#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \
79 { \ 79 { \
80 .reg_offset = off, \ 80 .reg_offset = off, \
81 .bits_per_irq = bpi, \ 81 .bits_per_irq = bpi, \
@@ -83,6 +83,8 @@ extern struct kvm_io_device_ops kvm_io_gic_ops;
83 .access_flags = acc, \ 83 .access_flags = acc, \
84 .read = rd, \ 84 .read = rd, \
85 .write = wr, \ 85 .write = wr, \
86 .uaccess_read = ur, \
87 .uaccess_write = uw, \
86 } 88 }
87 89
88#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \ 90#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \
@@ -165,6 +167,14 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
165 gpa_t addr, unsigned int len, 167 gpa_t addr, unsigned int len,
166 unsigned long val); 168 unsigned long val);
167 169
170void vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
171 gpa_t addr, unsigned int len,
172 unsigned long val);
173
174void vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
175 gpa_t addr, unsigned int len,
176 unsigned long val);
177
168unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, 178unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
169 gpa_t addr, unsigned int len); 179 gpa_t addr, unsigned int len);
170 180
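
To route those userspace accesses, the REGISTER_DESC_* macros grow uaccess_read/uaccess_write slots, and the tables in vgic-mmio-v2.c and vgic-mmio-v3.c pass either NULL (keep the guest handler) or a dedicated vgic_mmio_uaccess_write_* callback. A stripped-down model of a descriptor table with a NULL fallback; the names are invented and the fallback behaviour is an assumption, not taken from this hunk:

  /*
   * Stripped-down model of a register descriptor that carries separate
   * guest-MMIO and userspace-access callbacks; a NULL uaccess callback
   * falls back to the guest handler.
   */
  #include <stddef.h>
  #include <stdio.h>

  struct reg_desc {
          unsigned int offset;
          void (*write)(unsigned int offset, unsigned long val);
          void (*uaccess_write)(unsigned int offset, unsigned long val);
  };

  static void guest_write_active(unsigned int off, unsigned long val)
  {
          printf("guest write %#x <- %#lx (stops vcpus first)\n", off, val);
  }

  static void uaccess_write_active(unsigned int off, unsigned long val)
  {
          printf("uaccess write %#x <- %#lx (vcpus already stopped)\n", off, val);
  }

  static const struct reg_desc regs[] = {
          { .offset = 0x300, .write = guest_write_active,
            .uaccess_write = uaccess_write_active },
          { .offset = 0x400, .write = guest_write_active }, /* NULL uaccess: fall back */
  };

  static void uaccess_write(const struct reg_desc *r, unsigned long val)
  {
          if (r->uaccess_write)
                  r->uaccess_write(r->offset, val);
          else
                  r->write(r->offset, val);
  }

  int main(void)
  {
          uaccess_write(&regs[0], 1);
          uaccess_write(&regs[1], 1);
          return 0;
  }
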
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 030248e669f6..96ea597db0e7 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -21,6 +21,10 @@
21 21
22#include "vgic.h" 22#include "vgic.h"
23 23
24static bool group0_trap;
25static bool group1_trap;
26static bool common_trap;
27
24void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) 28void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
25{ 29{
26 struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; 30 struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -258,6 +262,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
258 262
259 /* Get the show on the road... */ 263 /* Get the show on the road... */
260 vgic_v3->vgic_hcr = ICH_HCR_EN; 264 vgic_v3->vgic_hcr = ICH_HCR_EN;
265 if (group0_trap)
266 vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
267 if (group1_trap)
268 vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
269 if (common_trap)
270 vgic_v3->vgic_hcr |= ICH_HCR_TC;
261} 271}
262 272
263int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) 273int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -429,6 +439,26 @@ out:
429 return ret; 439 return ret;
430} 440}
431 441
442DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
443
444static int __init early_group0_trap_cfg(char *buf)
445{
446 return strtobool(buf, &group0_trap);
447}
448early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
449
450static int __init early_group1_trap_cfg(char *buf)
451{
452 return strtobool(buf, &group1_trap);
453}
454early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
455
456static int __init early_common_trap_cfg(char *buf)
457{
458 return strtobool(buf, &common_trap);
459}
460early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
461
432/** 462/**
433 * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT 463 * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
434 * @node: pointer to the DT node 464 * @node: pointer to the DT node
@@ -480,6 +510,21 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
480 if (kvm_vgic_global_state.vcpu_base == 0) 510 if (kvm_vgic_global_state.vcpu_base == 0)
481 kvm_info("disabling GICv2 emulation\n"); 511 kvm_info("disabling GICv2 emulation\n");
482 512
513#ifdef CONFIG_ARM64
514 if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
515 group0_trap = true;
516 group1_trap = true;
517 }
518#endif
519
520 if (group0_trap || group1_trap || common_trap) {
521 kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
522 group0_trap ? "G0" : "",
523 group1_trap ? "G1" : "",
524 common_trap ? "C" : "");
525 static_branch_enable(&vgic_v3_cpuif_trap);
526 }
527
483 kvm_vgic_global_state.vctrl_base = NULL; 528 kvm_vgic_global_state.vctrl_base = NULL;
484 kvm_vgic_global_state.type = VGIC_V3; 529 kvm_vgic_global_state.type = VGIC_V3;
485 kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; 530 kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
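
The early_param() hooks above make the GICv3 sysreg traps controllable from the host kernel command line (values are parsed by strtobool(), so 0/1/y/n work), and the Cavium erratum 30115 workaround forces the group-0 and group-1 traps on by itself. For example, booting the host with

  kvm-arm.vgic_v3_group0_trap=1 kvm-arm.vgic_v3_group1_trap=1 kvm-arm.vgic_v3_common_trap=1

enables all three; whenever any of them is active, KVM prints the "GICv3 sysreg trapping enabled" line above and enables the vgic_v3_cpuif_trap static key so the hyp-mode handler earlier in this diff gets a chance to emulate the trapped accesses.
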
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 83b24d20ff8f..fed717e07938 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -35,11 +35,12 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
35 35
36/* 36/*
37 * Locking order is always: 37 * Locking order is always:
38 * its->cmd_lock (mutex) 38 * kvm->lock (mutex)
39 * its->its_lock (mutex) 39 * its->cmd_lock (mutex)
40 * vgic_cpu->ap_list_lock 40 * its->its_lock (mutex)
41 * kvm->lpi_list_lock 41 * vgic_cpu->ap_list_lock
42 * vgic_irq->irq_lock 42 * kvm->lpi_list_lock
43 * vgic_irq->irq_lock
43 * 44 *
44 * If you need to take multiple locks, always take the upper lock first, 45 * If you need to take multiple locks, always take the upper lock first,
45 * then the lower ones, e.g. first take the its_lock, then the irq_lock. 46 * then the lower ones, e.g. first take the its_lock, then the irq_lock.
@@ -234,10 +235,14 @@ static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
234 235
235/* 236/*
236 * Only valid injection if changing level for level-triggered IRQs or for a 237 * Only valid injection if changing level for level-triggered IRQs or for a
237 * rising edge. 238 * rising edge, and in-kernel connected IRQ lines can only be controlled by
239 * their owner.
238 */ 240 */
239static bool vgic_validate_injection(struct vgic_irq *irq, bool level) 241static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
240{ 242{
243 if (irq->owner != owner)
244 return false;
245
241 switch (irq->config) { 246 switch (irq->config) {
242 case VGIC_CONFIG_LEVEL: 247 case VGIC_CONFIG_LEVEL:
243 return irq->line_level != level; 248 return irq->line_level != level;
@@ -285,8 +290,10 @@ retry:
285 * won't see this one until it exits for some other 290 * won't see this one until it exits for some other
286 * reason. 291 * reason.
287 */ 292 */
288 if (vcpu) 293 if (vcpu) {
294 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
289 kvm_vcpu_kick(vcpu); 295 kvm_vcpu_kick(vcpu);
296 }
290 return false; 297 return false;
291 } 298 }
292 299
@@ -332,6 +339,7 @@ retry:
332 spin_unlock(&irq->irq_lock); 339 spin_unlock(&irq->irq_lock);
333 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); 340 spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
334 341
342 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
335 kvm_vcpu_kick(vcpu); 343 kvm_vcpu_kick(vcpu);
336 344
337 return true; 345 return true;
@@ -346,13 +354,16 @@ retry:
346 * false: to ignore the call 354 * false: to ignore the call
347 * Level-sensitive true: raise the input signal 355 * Level-sensitive true: raise the input signal
348 * false: lower the input signal 356 * false: lower the input signal
357 * @owner: The opaque pointer to the owner of the IRQ being raised to verify
358 * that the caller is allowed to inject this IRQ. Userspace
359 * injections will have owner == NULL.
349 * 360 *
350 * The VGIC is not concerned with devices being active-LOW or active-HIGH for 361 * The VGIC is not concerned with devices being active-LOW or active-HIGH for
351 * level-sensitive interrupts. You can think of the level parameter as 1 362 * level-sensitive interrupts. You can think of the level parameter as 1
352 * being HIGH and 0 being LOW and all devices being active-HIGH. 363 * being HIGH and 0 being LOW and all devices being active-HIGH.
353 */ 364 */
354int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, 365int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
355 bool level) 366 bool level, void *owner)
356{ 367{
357 struct kvm_vcpu *vcpu; 368 struct kvm_vcpu *vcpu;
358 struct vgic_irq *irq; 369 struct vgic_irq *irq;
@@ -374,7 +385,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
374 385
375 spin_lock(&irq->irq_lock); 386 spin_lock(&irq->irq_lock);
376 387
377 if (!vgic_validate_injection(irq, level)) { 388 if (!vgic_validate_injection(irq, level, owner)) {
378 /* Nothing to see here, move along... */ 389 /* Nothing to see here, move along... */
379 spin_unlock(&irq->irq_lock); 390 spin_unlock(&irq->irq_lock);
380 vgic_put_irq(kvm, irq); 391 vgic_put_irq(kvm, irq);
@@ -431,6 +442,39 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
431} 442}
432 443
433/** 444/**
445 * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
446 *
447 * @vcpu: Pointer to the VCPU (used for PPIs)
448 * @intid: The virtual INTID identifying the interrupt (PPI or SPI)
449 * @owner: Opaque pointer to the owner
450 *
451 * Returns 0 if intid is not already used by another in-kernel device and the
452 * owner is set, otherwise returns an error code.
453 */
454int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
455{
456 struct vgic_irq *irq;
457 int ret = 0;
458
459 if (!vgic_initialized(vcpu->kvm))
460 return -EAGAIN;
461
462 /* SGIs and LPIs cannot be wired up to any device */
463 if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
464 return -EINVAL;
465
466 irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
467 spin_lock(&irq->irq_lock);
468 if (irq->owner && irq->owner != owner)
469 ret = -EEXIST;
470 else
471 irq->owner = owner;
472 spin_unlock(&irq->irq_lock);
473
474 return ret;
475}
476
477/**
434 * vgic_prune_ap_list - Remove non-relevant interrupts from the list 478 * vgic_prune_ap_list - Remove non-relevant interrupts from the list
435 * 479 *
436 * @vcpu: The VCPU pointer 480 * @vcpu: The VCPU pointer
@@ -721,8 +765,10 @@ void vgic_kick_vcpus(struct kvm *kvm)
721 * a good kick... 765 * a good kick...
722 */ 766 */
723 kvm_for_each_vcpu(c, vcpu, kvm) { 767 kvm_for_each_vcpu(c, vcpu, kvm) {
724 if (kvm_vgic_vcpu_pending_irq(vcpu)) 768 if (kvm_vgic_vcpu_pending_irq(vcpu)) {
769 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
725 kvm_vcpu_kick(vcpu); 770 kvm_vcpu_kick(vcpu);
771 }
726 } 772 }
727} 773}
728 774
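
kvm_vgic_set_owner() and the new owner argument to kvm_vgic_inject_irq() close a hole where two in-kernel users (or userspace) could drive the same interrupt line: a device claims an INTID once, and every later injection must present the same opaque cookie or vgic_validate_injection() silently ignores it (userspace injections pass NULL). The PMU hunk earlier in this diff is the first user, claiming its overflow IRQ with &vcpu->arch.pmu. A small standalone model of the validation rule for a level-triggered line:

  /*
   * Standalone model of the ownership rule enforced by
   * vgic_validate_injection() above: an injection is only accepted when the
   * caller's cookie matches the registered owner (NULL == userspace).
   */
  #include <stdbool.h>
  #include <stdio.h>

  struct virq {
          void *owner;
          bool line_level;
  };

  static int set_owner(struct virq *irq, void *owner)
  {
          if (irq->owner && irq->owner != owner)
                  return -1;                      /* -EEXIST in the kernel */
          irq->owner = owner;
          return 0;
  }

  static bool validate_injection(struct virq *irq, bool level, void *owner)
  {
          if (irq->owner != owner)
                  return false;                   /* wrong owner: ignore the call */
          return irq->line_level != level;        /* level-triggered: only changes count */
  }

  int main(void)
  {
          struct virq overflow = { 0 };
          int pmu_cookie, other_cookie;

          set_owner(&overflow, &pmu_cookie);

          printf("pmu raises line:    %d\n", validate_injection(&overflow, true, &pmu_cookie));
          printf("other device tries: %d\n", validate_injection(&overflow, true, &other_cookie));
          printf("userspace (NULL):   %d\n", validate_injection(&overflow, true, NULL));
          return 0;
  }
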
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f0fe9d02f6bb..19f0ecb9b93e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -73,17 +73,17 @@ MODULE_LICENSE("GPL");
73 73
74/* Architectures should define their poll value according to the halt latency */ 74/* Architectures should define their poll value according to the halt latency */
75unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 75unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
76module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); 76module_param(halt_poll_ns, uint, 0644);
77EXPORT_SYMBOL_GPL(halt_poll_ns); 77EXPORT_SYMBOL_GPL(halt_poll_ns);
78 78
79/* Default doubles per-vcpu halt_poll_ns. */ 79/* Default doubles per-vcpu halt_poll_ns. */
80unsigned int halt_poll_ns_grow = 2; 80unsigned int halt_poll_ns_grow = 2;
81module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR); 81module_param(halt_poll_ns_grow, uint, 0644);
82EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
83 83
84/* Default resets per-vcpu halt_poll_ns . */ 84/* Default resets per-vcpu halt_poll_ns . */
85unsigned int halt_poll_ns_shrink; 85unsigned int halt_poll_ns_shrink;
86module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR); 86module_param(halt_poll_ns_shrink, uint, 0644);
87EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 87EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
88 88
89/* 89/*
@@ -3191,6 +3191,12 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
3191 return PTR_ERR(file); 3191 return PTR_ERR(file);
3192 } 3192 }
3193 3193
3194 /*
3195 * Don't call kvm_put_kvm anymore at this point; file->f_op is
3196 * already set, with ->release() being kvm_vm_release(). In error
3197 * cases it will be called by the final fput(file) and will take
3198 * care of doing kvm_put_kvm(kvm).
3199 */
3194 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3200 if (kvm_create_vm_debugfs(kvm, r) < 0) {
3195 put_unused_fd(r); 3201 put_unused_fd(r);
3196 fput(file); 3202 fput(file);