author		Linus Torvalds <torvalds@linux-foundation.org>	2018-04-09 14:42:31 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-09 14:42:31 -0400
commit		d8312a3f61024352f1c7cb967571fd53631b0d6c (patch)
tree		be2f2f699e763330b0f0179e9f86009affbc0c7d
parent		e9092d0d97961146655ce51f43850907d95f68c3 (diff)
parent		e01bca2fc698d7f0626f0214001af523e18ad60b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Paolo Bonzini:
 "ARM:
   - VHE optimizations
   - EL2 address space randomization
   - speculative execution mitigations ("variant 3a", aka execution past
     invalid privilege register access)
   - bugfixes and cleanups

  PPC:
   - improvements for the radix page fault handler for HV KVM on POWER9

  s390:
   - more kvm stat counters
   - virtio gpu plumbing
   - documentation
   - facilities improvements

  x86:
   - support for VMware magic I/O port and pseudo-PMCs
   - AMD pause loop exiting
   - support for AMD core performance extensions
   - support for synchronous register access
   - expose nVMX capabilities to userspace
   - support for Hyper-V signaling via eventfd
   - use Enlightened VMCS when running on Hyper-V
   - allow userspace to disable MWAIT/HLT/PAUSE vmexits
   - usual roundup of optimizations and nested virtualization bugfixes

  Generic:
   - API selftest infrastructure (though the only tests are for x86 as
     of now)"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (174 commits)
  kvm: x86: fix a prototype warning
  kvm: selftests: add sync_regs_test
  kvm: selftests: add API testing infrastructure
  kvm: x86: fix a compile warning
  KVM: X86: Add Force Emulation Prefix for "emulate the next instruction"
  KVM: X86: Introduce handle_ud()
  KVM: vmx: unify adjacent #ifdefs
  x86: kvm: hide the unused 'cpu' variable
  KVM: VMX: remove bogus WARN_ON in handle_ept_misconfig
  Revert "KVM: X86: Fix SMRAM accessing even if VM is shutdown"
  kvm: Add emulation for movups/movupd
  KVM: VMX: raise internal error for exception during invalid protected mode state
  KVM: nVMX: Optimization: Dont set KVM_REQ_EVENT when VMExit with nested_run_pending
  KVM: nVMX: Require immediate-exit when event reinjected to L2 and L1 event pending
  KVM: x86: Fix misleading comments on handling pending exceptions
  KVM: x86: Rename interrupt.pending to interrupt.injected
  KVM: VMX: No need to clear pending NMI/interrupt on inject realmode interrupt
  x86/kvm: use Enlightened VMCS when running on Hyper-V
  x86/hyper-v: detect nested features
  x86/hyper-v: define struct hv_enlightened_vmcs and clean field bits
  ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/arm64/memory.txt9
-rw-r--r--Documentation/virtual/kvm/00-INDEX10
-rw-r--r--Documentation/virtual/kvm/api.txt135
-rw-r--r--Documentation/virtual/kvm/cpuid.txt15
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/arm/include/asm/kvm_asm.h5
-rw-r--r--arch/arm/include/asm/kvm_emulate.h21
-rw-r--r--arch/arm/include/asm/kvm_host.h6
-rw-r--r--arch/arm/include/asm/kvm_hyp.h4
-rw-r--r--arch/arm/include/asm/kvm_mmu.h16
-rw-r--r--arch/arm/include/uapi/asm/kvm.h9
-rw-r--r--arch/arm/kvm/coproc.c61
-rw-r--r--arch/arm/kvm/emulate.c4
-rw-r--r--arch/arm/kvm/hyp/Makefile1
-rw-r--r--arch/arm/kvm/hyp/switch.c16
-rw-r--r--arch/arm64/Kconfig16
-rw-r--r--arch/arm64/include/asm/alternative.h41
-rw-r--r--arch/arm64/include/asm/cpucaps.h2
-rw-r--r--arch/arm64/include/asm/insn.h16
-rw-r--r--arch/arm64/include/asm/kvm_arm.h6
-rw-r--r--arch/arm64/include/asm/kvm_asm.h19
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h78
-rw-r--r--arch/arm64/include/asm/kvm_host.h53
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h29
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h165
-rw-r--r--arch/arm64/include/asm/mmu.h8
-rw-r--r--arch/arm64/include/asm/sysreg.h6
-rw-r--r--arch/arm64/kernel/Makefile4
-rw-r--r--arch/arm64/kernel/alternative.c43
-rw-r--r--arch/arm64/kernel/asm-offsets.c1
-rw-r--r--arch/arm64/kernel/bpi.S67
-rw-r--r--arch/arm64/kernel/cpu_errata.c25
-rw-r--r--arch/arm64/kernel/cpufeature.c19
-rw-r--r--arch/arm64/kernel/head.S7
-rw-r--r--arch/arm64/kernel/insn.c190
-rw-r--r--arch/arm64/kvm/Kconfig3
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/debug.c29
-rw-r--r--arch/arm64/kvm/hyp-init.S1
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/debug-sr.c88
-rw-r--r--arch/arm64/kvm/hyp/entry.S6
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S86
-rw-r--r--arch/arm64/kvm/hyp/switch.c382
-rw-r--r--arch/arm64/kvm/hyp/sysreg-sr.c172
-rw-r--r--arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c78
-rw-r--r--arch/arm64/kvm/inject_fault.c24
-rw-r--r--arch/arm64/kvm/regmap.c67
-rw-r--r--arch/arm64/kvm/sys_regs.c199
-rw-r--r--arch/arm64/kvm/sys_regs.h4
-rw-r--r--arch/arm64/kvm/sys_regs_generic_v8.c4
-rw-r--r--arch/arm64/kvm/va_layout.c227
-rw-r--r--arch/mips/include/asm/kvm_para.h5
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/include/asm/kvm_para.h5
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/kvm/book3s.c6
-rw-r--r--arch/powerpc/kvm/book3s.h1
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c9
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c333
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c1
-rw-r--r--arch/powerpc/kvm/book3s_pr.c10
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/trace_pr.h15
-rw-r--r--arch/s390/include/asm/kvm_host.h28
-rw-r--r--arch/s390/include/asm/kvm_para.h5
-rw-r--r--arch/s390/include/asm/mmu.h4
-rw-r--r--arch/s390/include/asm/mmu_context.h2
-rw-r--r--arch/s390/kvm/gaccess.c9
-rw-r--r--arch/s390/kvm/intercept.c17
-rw-r--r--arch/s390/kvm/interrupt.c26
-rw-r--r--arch/s390/kvm/kvm-s390.c102
-rw-r--r--arch/s390/kvm/kvm-s390.h2
-rw-r--r--arch/s390/kvm/priv.c4
-rw-r--r--arch/s390/tools/gen_facilities.c20
-rw-r--r--arch/x86/hyperv/hv_init.c45
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h (renamed from arch/x86/include/uapi/asm/hyperv.h)299
-rw-r--r--arch/x86/include/asm/kvm_host.h54
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/mshyperv.h94
-rw-r--r--arch/x86/include/asm/msr-index.h14
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm.h19
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h9
-rw-r--r--arch/x86/kernel/cpu/common.c3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c22
-rw-r--r--arch/x86/kernel/kvm.c18
-rw-r--r--arch/x86/kernel/process_64.c14
-rw-r--r--arch/x86/kvm/cpuid.c7
-rw-r--r--arch/x86/kvm/emulate.c27
-rw-r--r--arch/x86/kvm/hyperv.c192
-rw-r--r--arch/x86/kvm/hyperv.h4
-rw-r--r--arch/x86/kvm/irq.c26
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h7
-rw-r--r--arch/x86/kvm/lapic.c10
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h11
-rw-r--r--arch/x86/kvm/pmu.c37
-rw-r--r--arch/x86/kvm/pmu.h6
-rw-r--r--arch/x86/kvm/pmu_amd.c142
-rw-r--r--arch/x86/kvm/svm.c312
-rw-r--r--arch/x86/kvm/vmx.c1091
-rw-r--r--arch/x86/kvm/vmx_evmcs.h324
-rw-r--r--arch/x86/kvm/x86.c374
-rw-r--r--arch/x86/kvm/x86.h86
-rw-r--r--drivers/hv/connection.c1
-rw-r--r--drivers/hv/hv.c1
-rw-r--r--drivers/hv/hyperv_vmbus.h1
-rw-r--r--drivers/hv/vmbus_drv.c1
-rw-r--r--include/asm-generic/kvm_para.h5
-rw-r--r--include/kvm/arm_vgic.h14
-rw-r--r--include/linux/hyperv.h1
-rw-r--r--include/linux/kvm_para.h5
-rw-r--r--include/uapi/linux/kvm.h23
-rw-r--r--tools/include/uapi/linux/kvm.h2
-rwxr-xr-xtools/kvm/kvm_stat/kvm_stat11
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/kvm/Makefile39
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h142
-rw-r--r--tools/testing/selftests/kvm/include/sparsebit.h75
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h45
-rw-r--r--tools/testing/selftests/kvm/include/x86.h1043
-rw-r--r--tools/testing/selftests/kvm/lib/assert.c87
-rw-r--r--tools/testing/selftests/kvm/lib/elf.c197
-rw-r--r--tools/testing/selftests/kvm/lib/io.c158
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c1480
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util_internal.h67
-rw-r--r--tools/testing/selftests/kvm/lib/sparsebit.c2087
-rw-r--r--tools/testing/selftests/kvm/lib/x86.c700
-rw-r--r--tools/testing/selftests/kvm/set_sregs_test.c54
-rw-r--r--tools/testing/selftests/kvm/sync_regs_test.c232
-rw-r--r--virt/kvm/arm/aarch32.c2
-rw-r--r--virt/kvm/arm/arch_timer.c10
-rw-r--r--virt/kvm/arm/arm.c48
-rw-r--r--virt/kvm/arm/hyp/timer-sr.c44
-rw-r--r--virt/kvm/arm/hyp/vgic-v2-sr.c159
-rw-r--r--virt/kvm/arm/hyp/vgic-v3-sr.c247
-rw-r--r--virt/kvm/arm/mmu.c176
-rw-r--r--virt/kvm/arm/pmu.c36
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c17
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c15
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c152
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c66
-rw-r--r--virt/kvm/arm/vgic/vgic.c33
-rw-r--r--virt/kvm/arm/vgic/vgic.h3
-rw-r--r--virt/kvm/kvm_main.c36
150 files changed, 11900 insertions, 1982 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9a3edf7e901a..11fc28ecdb6d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1907,6 +1907,9 @@
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)
 
+	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
+			Default is false (don't support).
+
 	kvm.mmu_audit=	[KVM] This is a R/W parameter which allows audit
 			KVM MMU at runtime.
 			Default is 0 (off)
diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt
index 671bc0639262..c5dab30d3389 100644
--- a/Documentation/arm64/memory.txt
+++ b/Documentation/arm64/memory.txt
@@ -86,9 +86,12 @@ Translation table lookup with 64KB pages:
 +-------------------------------------------------> [63] TTBR0/1
 
 
-When using KVM without the Virtualization Host Extensions, the hypervisor
-maps kernel pages in EL2 at a fixed offset from the kernel VA. See the
-kern_hyp_va macro for more details.
+When using KVM without the Virtualization Host Extensions, the
+hypervisor maps kernel pages in EL2 at a fixed (and potentially
+random) offset from the linear mapping. See the kern_hyp_va macro and
+kvm_update_va_mask function for more details. MMIO devices such as
+GICv2 gets mapped next to the HYP idmap page, as do vectors when
+ARM64_HARDEN_EL2_VECTORS is selected for particular CPUs.
 
 When using KVM with the Virtualization Host Extensions, no additional
 mappings are created, since the host kernel runs directly in EL2.
diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX
index 3da73aabff5a..3492458a4ae8 100644
--- a/Documentation/virtual/kvm/00-INDEX
+++ b/Documentation/virtual/kvm/00-INDEX
@@ -1,7 +1,12 @@
 00-INDEX
 	- this file.
+amd-memory-encryption.rst
+	- notes on AMD Secure Encrypted Virtualization feature and SEV firmware
+	  command description
 api.txt
 	- KVM userspace API.
+arm
+	- internal ABI between the kernel and HYP (for arm/arm64)
 cpuid.txt
 	- KVM-specific cpuid leaves (x86).
 devices/
@@ -26,6 +31,5 @@ s390-diag.txt
 	- Diagnose hypercall description (for IBM S/390)
 timekeeping.txt
 	- timekeeping virtualization for x86-based architectures.
-amd-memory-encryption.txt
-	- notes on AMD Secure Encrypted Virtualization feature and SEV firmware
-	  command description
+vcpu-requests.rst
+	- internal VCPU request API
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index d6b3ff51a14f..1c7958b57fe9 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3480,7 +3480,7 @@ encrypted VMs.
 
 Currently, this ioctl is used for issuing Secure Encrypted Virtualization
 (SEV) commands on AMD Processors. The SEV commands are defined in
-Documentation/virtual/kvm/amd-memory-encryption.txt.
+Documentation/virtual/kvm/amd-memory-encryption.rst.
 
 4.111 KVM_MEMORY_ENCRYPT_REG_REGION
 
@@ -3516,6 +3516,38 @@ Returns: 0 on success; -1 on error
 This ioctl can be used to unregister the guest memory region registered
 with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
 
+4.113 KVM_HYPERV_EVENTFD
+
+Capability: KVM_CAP_HYPERV_EVENTFD
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_hyperv_eventfd (in)
+
+This ioctl (un)registers an eventfd to receive notifications from the guest on
+the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without
+causing a user exit.  SIGNAL_EVENT hypercall with non-zero event flag number
+(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit.
+
+struct kvm_hyperv_eventfd {
+	__u32 conn_id;
+	__s32 fd;
+	__u32 flags;
+	__u32 padding[3];
+};
+
+The conn_id field should fit within 24 bits:
+
+#define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
+
+The acceptable values for the flags field are:
+
+#define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
+
+Returns: 0 on success,
+	-EINVAL if conn_id or flags is outside the allowed range
+	-ENOENT on deassign if the conn_id isn't registered
+	-EEXIST on assign if the conn_id is already registered
+
 
 5. The kvm_run structure
 ------------------------
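For readers following the new interface, here is a minimal userspace sketch of the assign path described in the hunk above. It assumes only the uapi names shown there (struct kvm_hyperv_eventfd, KVM_HYPERV_EVENTFD, KVM_HYPERV_CONN_ID_MASK, KVM_HYPERV_EVENTFD_DEASSIGN) plus a VM file descriptor from KVM_CREATE_VM; error handling is intentionally thin.

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: route SIGNAL_EVENT for one Hyper-V connection id to an eventfd.
 * Requires KVM_CAP_HYPERV_EVENTFD on the running kernel. */
static int hyperv_eventfd_assign(int vm_fd, unsigned int conn_id)
{
	struct kvm_hyperv_eventfd hvevfd = {
		.conn_id = conn_id & KVM_HYPERV_CONN_ID_MASK,
		.fd      = eventfd(0, EFD_CLOEXEC),
		.flags   = 0,	/* pass KVM_HYPERV_EVENTFD_DEASSIGN to unregister */
	};

	if (hvevfd.fd < 0)
		return -1;

	/* Returns 0 on success, or -1 with errno set to one of the
	 * error codes listed above (EINVAL/ENOENT/EEXIST). */
	return ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd);
}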
@@ -3873,7 +3905,7 @@ in userspace.
 	__u64 kvm_dirty_regs;
 	union {
 		struct kvm_sync_regs regs;
-		char padding[1024];
+		char padding[SYNC_REGS_SIZE_BYTES];
 	} s;
 
 If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access
@@ -4078,6 +4110,46 @@ Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be
 accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from
 the guest.
 
+6.74 KVM_CAP_SYNC_REGS
+Architectures: s390, x86
+Target: s390: always enabled, x86: vcpu
+Parameters: none
+Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register
+sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h).
+
+As described above in the kvm_sync_regs struct info in section 5 (kvm_run):
+KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers
+without having to call SET/GET_*REGS". This reduces overhead by eliminating
+repeated ioctl calls for setting and/or getting register values. This is
+particularly important when userspace is making synchronous guest state
+modifications, e.g. when emulating and/or intercepting instructions in
+userspace.
+
+For s390 specifics, please refer to the source code.
+
+For x86:
+- the register sets to be copied out to kvm_run are selectable
+  by userspace (rather that all sets being copied out for every exit).
+- vcpu_events are available in addition to regs and sregs.
+
+For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to
+function as an input bit-array field set by userspace to indicate the
+specific register sets to be copied out on the next exit.
+
+To indicate when userspace has modified values that should be copied into
+the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set.
+This is done using the same bitflags as for the 'kvm_valid_regs' field.
+If the dirty bit is not set, then the register set values will not be copied
+into the vCPU even if they've been modified.
+
+Unused bitfields in the bitarrays must be set to zero.
+
+struct kvm_sync_regs {
+        struct kvm_regs regs;
+        struct kvm_sregs sregs;
+        struct kvm_vcpu_events events;
+};
+
 7. Capabilities that can be enabled on VMs
 ------------------------------------------
 
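A rough x86 usage sketch of KVM_CAP_SYNC_REGS, not part of the patch itself: the KVM_SYNC_X86_REGS bit name below is one of the bitfields this series adds to arch/x86/include/uapi/asm/kvm.h, "run" is the vCPU's mmap'ed kvm_run area, and error handling is omitted.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: have KVM mirror the GPRs into kvm_run on each exit, tweak them in
 * userspace, and push the change back without KVM_GET/SET_REGS round trips. */
static void sync_regs_example(int kvm_fd, int vcpu_fd, struct kvm_run *run)
{
	int supported = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_REGS);

	if (!(supported & KVM_SYNC_X86_REGS))
		return;				/* register sync not available */

	run->kvm_valid_regs = KVM_SYNC_X86_REGS; /* copy regs out on next exit */
	ioctl(vcpu_fd, KVM_RUN, 0);

	run->s.regs.regs.rip += 1;		 /* modify guest state in place... */
	run->kvm_dirty_regs = KVM_SYNC_X86_REGS; /* ...and have KVM load it back */
	ioctl(vcpu_fd, KVM_RUN, 0);
}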
@@ -4286,6 +4358,26 @@ enables QEMU to build error log and branch to guest kernel registered
 machine check handling routine. Without this capability KVM will
 branch to guests' 0x200 interrupt vector.
 
+7.13 KVM_CAP_X86_DISABLE_EXITS
+
+Architectures: x86
+Parameters: args[0] defines which exits are disabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid exits
+
+Valid bits in args[0] are
+
+#define KVM_X86_DISABLE_EXITS_MWAIT            (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HLT              (1 << 1)
+
+Enabling this capability on a VM provides userspace with a way to no
+longer intercept some instructions for improved latency in some
+workloads, and is suggested when vCPUs are associated to dedicated
+physical CPUs.  More bits can be added in the future; userspace can
+just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable
+all such vmexits.
+
+Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
+
 8. Other capabilities.
 ----------------------
 
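As a hedged sketch of how userspace might enable this capability, assuming only the uapi names shown above plus the standard KVM_ENABLE_CAP vm ioctl; this should only be done when the guest's vCPUs really are pinned to dedicated host CPUs.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: stop intercepting MWAIT and HLT on this VM. Bits not reported by
 * KVM_CHECK_EXTENSION(KVM_CAP_X86_DISABLE_EXITS) make the ioctl return -EINVAL. */
static int disable_mwait_hlt_exits(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_MWAIT | KVM_X86_DISABLE_EXITS_HLT,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);	/* 0 on success */
}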
@@ -4398,15 +4490,6 @@ reserved.
     Both registers and addresses are 64-bits wide.
     It will be possible to run 64-bit or 32-bit guest code.
 
-8.8 KVM_CAP_X86_GUEST_MWAIT
-
-Architectures: x86
-
-This capability indicates that guest using memory monotoring instructions
-(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit.  As such time
-spent while virtual CPU is halted in this way will then be accounted for as
-guest running time on the host (as opposed to e.g. HLT).
-
 8.9 KVM_CAP_ARM_USER_IRQ
 
 Architectures: arm, arm64
@@ -4483,3 +4566,33 @@ Parameters: none
 This capability indicates if the flic device will be able to get/set the
 AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows
 to discover this without having to create a flic device.
+
+8.14 KVM_CAP_S390_PSW
+
+Architectures: s390
+
+This capability indicates that the PSW is exposed via the kvm_run structure.
+
+8.15 KVM_CAP_S390_GMAP
+
+Architectures: s390
+
+This capability indicates that the user space memory used as guest mapping can
+be anywhere in the user memory address space, as long as the memory slots are
+aligned and sized to a segment (1MB) boundary.
+
+8.16 KVM_CAP_S390_COW
+
+Architectures: s390
+
+This capability indicates that the user space memory used as guest mapping can
+use copy-on-write semantics as well as dirty pages tracking via read-only page
+tables.
+
+8.17 KVM_CAP_S390_BPB
+
+Architectures: s390
+
+This capability indicates that kvm will implement the interfaces to handle
+reset, migration and nested KVM for branch prediction blocking. The stfle
+facility 82 should not be provided to the guest without this capability.
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 87a7506f31c2..d4f33eb805dd 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -23,8 +23,8 @@ This function queries the presence of KVM cpuid leafs.
 
 
 function: define KVM_CPUID_FEATURES (0x40000001)
-returns : ebx, ecx, edx = 0
-          eax = and OR'ed group of (1 << flag), where each flags is:
+returns : ebx, ecx
+          eax = an OR'ed group of (1 << flag), where each flags is:
 
 
 flag                               || value || meaning
@@ -66,3 +66,14 @@ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
 ------------------------------------------------------------------------------
+
+          edx = an OR'ed group of (1 << flag), where each flags is:
+
+
+flag                               || value || meaning
+==================================================================================
+KVM_HINTS_DEDICATED                ||     0 || guest checks this feature bit to
+                                   ||       || determine if there is vCPU pinning
+                                   ||       || and there is no vCPU over-commitment,
+                                   ||       || allowing optimizations
+----------------------------------------------------------------------------------
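For illustration, a guest-side check of the new edx hint bit could look like the sketch below. It assumes the KVM CPUID base sits at 0x40000000 (true unless the hypervisor relocates it) and uses the compiler's <cpuid.h> helper; the bit number comes from the table above.

#include <cpuid.h>
#include <stdio.h>

#define KVM_CPUID_FEATURES	0x40000001
#define KVM_HINTS_DEDICATED	0	/* edx bit 0, per the table above */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* A real guest should first verify the "KVMKVMKVM" signature at
	 * leaf 0x40000000 before trusting this leaf. */
	__cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);

	if (edx & (1u << KVM_HINTS_DEDICATED))
		printf("KVM hints: vCPUs are pinned, no over-commitment expected\n");

	return 0;
}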
diff --git a/MAINTAINERS b/MAINTAINERS
index 4c3c17e1e163..6d296bdce328 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6516,7 +6516,7 @@ S: Maintained
 F:	Documentation/networking/netvsc.txt
 F:	arch/x86/include/asm/mshyperv.h
 F:	arch/x86/include/asm/trace/hyperv.h
-F:	arch/x86/include/uapi/asm/hyperv.h
+F:	arch/x86/include/asm/hyperv-tlfs.h
 F:	arch/x86/kernel/cpu/mshyperv.c
 F:	arch/x86/hyperv
 F:	drivers/hid/hid-hyperv.c
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 36dd2962a42d..5a953ecb0d78 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -70,7 +70,10 @@ extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
70 70
71extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); 71extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
72 72
73extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 73/* no VHE on 32-bit :( */
74static inline int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { BUG(); return 0; }
75
76extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
74 77
75extern void __init_stage2_translation(void); 78extern void __init_stage2_translation(void);
76 79
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 9003bd19cb70..6493bd479ddc 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -41,7 +41,17 @@ static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num)
41 return vcpu_reg(vcpu, reg_num); 41 return vcpu_reg(vcpu, reg_num);
42} 42}
43 43
44unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu); 44unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu);
45
46static inline unsigned long vpcu_read_spsr(struct kvm_vcpu *vcpu)
47{
48 return *__vcpu_spsr(vcpu);
49}
50
51static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
52{
53 *__vcpu_spsr(vcpu) = v;
54}
45 55
46static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, 56static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
47 u8 reg_num) 57 u8 reg_num)
@@ -92,14 +102,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
92 vcpu->arch.hcr = HCR_GUEST_MASK; 102 vcpu->arch.hcr = HCR_GUEST_MASK;
93} 103}
94 104
95static inline unsigned long vcpu_get_hcr(const struct kvm_vcpu *vcpu) 105static inline unsigned long *vcpu_hcr(const struct kvm_vcpu *vcpu)
96{
97 return vcpu->arch.hcr;
98}
99
100static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr)
101{ 106{
102 vcpu->arch.hcr = hcr; 107 return (unsigned long *)&vcpu->arch.hcr;
103} 108}
104 109
105static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) 110static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 248b930563e5..c6a749568dd6 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -155,9 +155,6 @@ struct kvm_vcpu_arch {
155 /* HYP trapping configuration */ 155 /* HYP trapping configuration */
156 u32 hcr; 156 u32 hcr;
157 157
158 /* Interrupt related fields */
159 u32 irq_lines; /* IRQ and FIQ levels */
160
161 /* Exception Information */ 158 /* Exception Information */
162 struct kvm_vcpu_fault_info fault; 159 struct kvm_vcpu_fault_info fault;
163 160
@@ -315,4 +312,7 @@ static inline bool kvm_arm_harden_branch_predictor(void)
315 return false; 312 return false;
316} 313}
317 314
315static inline void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu) {}
316static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
317
318#endif /* __ARM_KVM_HOST_H__ */ 318#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 1ab8329e9ff7..e93a0cac9add 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -110,6 +110,10 @@ void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
110 110
111void __vgic_v3_save_state(struct kvm_vcpu *vcpu); 111void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
112void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); 112void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
113void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
114void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
115void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
116void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
113 117
114asmlinkage void __vfp_save_state(struct vfp_hard_struct *vfp); 118asmlinkage void __vfp_save_state(struct vfp_hard_struct *vfp);
115asmlinkage void __vfp_restore_state(struct vfp_hard_struct *vfp); 119asmlinkage void __vfp_restore_state(struct vfp_hard_struct *vfp);
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index de1b919404e4..707a1f06dc5d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -28,6 +28,13 @@
28 */ 28 */
29#define kern_hyp_va(kva) (kva) 29#define kern_hyp_va(kva) (kva)
30 30
31/* Contrary to arm64, there is no need to generate a PC-relative address */
32#define hyp_symbol_addr(s) \
33 ({ \
34 typeof(s) *addr = &(s); \
35 addr; \
36 })
37
31/* 38/*
32 * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. 39 * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
33 */ 40 */
@@ -42,8 +49,15 @@
42#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
43#include <asm/stage2_pgtable.h> 50#include <asm/stage2_pgtable.h>
44 51
52/* Ensure compatibility with arm64 */
53#define VA_BITS 32
54
45int create_hyp_mappings(void *from, void *to, pgprot_t prot); 55int create_hyp_mappings(void *from, void *to, pgprot_t prot);
46int create_hyp_io_mappings(void *from, void *to, phys_addr_t); 56int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
57 void __iomem **kaddr,
58 void __iomem **haddr);
59int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
60 void **haddr);
47void free_hyp_pgds(void); 61void free_hyp_pgds(void);
48 62
49void stage2_unmap_vm(struct kvm *kvm); 63void stage2_unmap_vm(struct kvm *kvm);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 6edd177bb1c7..2ba95d6fe852 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -135,6 +135,15 @@ struct kvm_arch_memory_slot {
135#define KVM_REG_ARM_CRM_SHIFT 7 135#define KVM_REG_ARM_CRM_SHIFT 7
136#define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800 136#define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800
137#define KVM_REG_ARM_32_CRN_SHIFT 11 137#define KVM_REG_ARM_32_CRN_SHIFT 11
138/*
139 * For KVM currently all guest registers are nonsecure, but we reserve a bit
140 * in the encoding to distinguish secure from nonsecure for AArch32 system
141 * registers that are banked by security. This is 1 for the secure banked
142 * register, and 0 for the nonsecure banked register or if the register is
143 * not banked by security.
144 */
145#define KVM_REG_ARM_SECURE_MASK 0x0000000010000000
146#define KVM_REG_ARM_SECURE_SHIFT 28
138 147
139#define ARM_CP15_REG_SHIFT_MASK(x,n) \ 148#define ARM_CP15_REG_SHIFT_MASK(x,n) \
140 (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK) 149 (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK)
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 6d1d2e26dfe5..3a02e76699a6 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -270,6 +270,60 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
270 return true; 270 return true;
271} 271}
272 272
273static bool access_cntp_tval(struct kvm_vcpu *vcpu,
274 const struct coproc_params *p,
275 const struct coproc_reg *r)
276{
277 u64 now = kvm_phys_timer_read();
278 u64 val;
279
280 if (p->is_write) {
281 val = *vcpu_reg(vcpu, p->Rt1);
282 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now);
283 } else {
284 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
285 *vcpu_reg(vcpu, p->Rt1) = val - now;
286 }
287
288 return true;
289}
290
291static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
292 const struct coproc_params *p,
293 const struct coproc_reg *r)
294{
295 u32 val;
296
297 if (p->is_write) {
298 val = *vcpu_reg(vcpu, p->Rt1);
299 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val);
300 } else {
301 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
302 *vcpu_reg(vcpu, p->Rt1) = val;
303 }
304
305 return true;
306}
307
308static bool access_cntp_cval(struct kvm_vcpu *vcpu,
309 const struct coproc_params *p,
310 const struct coproc_reg *r)
311{
312 u64 val;
313
314 if (p->is_write) {
315 val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
316 val |= *vcpu_reg(vcpu, p->Rt1);
317 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val);
318 } else {
319 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
320 *vcpu_reg(vcpu, p->Rt1) = val;
321 *vcpu_reg(vcpu, p->Rt2) = val >> 32;
322 }
323
324 return true;
325}
326
273/* 327/*
274 * We could trap ID_DFR0 and tell the guest we don't support performance 328 * We could trap ID_DFR0 and tell the guest we don't support performance
275 * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was 329 * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was
@@ -423,10 +477,17 @@ static const struct coproc_reg cp15_regs[] = {
423 { CRn(13), CRm( 0), Op1( 0), Op2( 4), is32, 477 { CRn(13), CRm( 0), Op1( 0), Op2( 4), is32,
424 NULL, reset_unknown, c13_TID_PRIV }, 478 NULL, reset_unknown, c13_TID_PRIV },
425 479
480 /* CNTP */
481 { CRm64(14), Op1( 2), is64, access_cntp_cval},
482
426 /* CNTKCTL: swapped by interrupt.S. */ 483 /* CNTKCTL: swapped by interrupt.S. */
427 { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32, 484 { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32,
428 NULL, reset_val, c14_CNTKCTL, 0x00000000 }, 485 NULL, reset_val, c14_CNTKCTL, 0x00000000 },
429 486
487 /* CNTP */
488 { CRn(14), CRm( 2), Op1( 0), Op2( 0), is32, access_cntp_tval },
489 { CRn(14), CRm( 2), Op1( 0), Op2( 1), is32, access_cntp_ctl },
490
430 /* The Configuration Base Address Register. */ 491 /* The Configuration Base Address Register. */
431 { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, 492 { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar},
432}; 493};
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index cdff963f133a..9046b53d87c1 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -142,7 +142,7 @@ unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num)
142/* 142/*
143 * Return the SPSR for the current mode of the virtual CPU. 143 * Return the SPSR for the current mode of the virtual CPU.
144 */ 144 */
145unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu) 145unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu)
146{ 146{
147 unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK; 147 unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK;
148 switch (mode) { 148 switch (mode) {
@@ -174,5 +174,5 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
174 */ 174 */
175void kvm_inject_vabt(struct kvm_vcpu *vcpu) 175void kvm_inject_vabt(struct kvm_vcpu *vcpu)
176{ 176{
177 vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VA); 177 *vcpu_hcr(vcpu) |= HCR_VA;
178} 178}
diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile
index 63d6b404d88e..7fc0638f263a 100644
--- a/arch/arm/kvm/hyp/Makefile
+++ b/arch/arm/kvm/hyp/Makefile
@@ -9,7 +9,6 @@ KVM=../../../../virt/kvm
9 9
10CFLAGS_ARMV7VE :=$(call cc-option, -march=armv7ve) 10CFLAGS_ARMV7VE :=$(call cc-option, -march=armv7ve)
11 11
12obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
13obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o 12obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
14obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o 13obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
15 14
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index ae45ae96aac2..acf1c37fa49c 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -44,7 +44,7 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu, u32 *fpexc_host)
44 isb(); 44 isb();
45 } 45 }
46 46
47 write_sysreg(vcpu->arch.hcr | vcpu->arch.irq_lines, HCR); 47 write_sysreg(vcpu->arch.hcr, HCR);
48 /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ 48 /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
49 write_sysreg(HSTR_T(15), HSTR); 49 write_sysreg(HSTR_T(15), HSTR);
50 write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR); 50 write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR);
@@ -90,18 +90,18 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
90 90
91static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) 91static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
92{ 92{
93 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) 93 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
94 __vgic_v3_save_state(vcpu); 94 __vgic_v3_save_state(vcpu);
95 else 95 __vgic_v3_deactivate_traps(vcpu);
96 __vgic_v2_save_state(vcpu); 96 }
97} 97}
98 98
99static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) 99static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
100{ 100{
101 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) 101 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
102 __vgic_v3_activate_traps(vcpu);
102 __vgic_v3_restore_state(vcpu); 103 __vgic_v3_restore_state(vcpu);
103 else 104 }
104 __vgic_v2_restore_state(vcpu);
105} 105}
106 106
107static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) 107static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
@@ -154,7 +154,7 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
154 return true; 154 return true;
155} 155}
156 156
157int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) 157int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
158{ 158{
159 struct kvm_cpu_context *host_ctxt; 159 struct kvm_cpu_context *host_ctxt;
160 struct kvm_cpu_context *guest_ctxt; 160 struct kvm_cpu_context *guest_ctxt;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 177be0d1d090..eb2cf4938f6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -922,6 +922,22 @@ config HARDEN_BRANCH_PREDICTOR
922 922
923 If unsure, say Y. 923 If unsure, say Y.
924 924
925config HARDEN_EL2_VECTORS
926 bool "Harden EL2 vector mapping against system register leak" if EXPERT
927 default y
928 help
929 Speculation attacks against some high-performance processors can
930 be used to leak privileged information such as the vector base
931 register, resulting in a potential defeat of the EL2 layout
932 randomization.
933
934 This config option will map the vectors to a fixed location,
935 independent of the EL2 code mapping, so that revealing VBAR_EL2
936 to an attacker does not give away any extra information. This
937 only gets enabled on affected CPUs.
938
939 If unsure, say Y.
940
925menuconfig ARMV8_DEPRECATED 941menuconfig ARMV8_DEPRECATED
926 bool "Emulate deprecated/obsolete ARMv8 instructions" 942 bool "Emulate deprecated/obsolete ARMv8 instructions"
927 depends on COMPAT 943 depends on COMPAT
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 669028172fd6..a91933b1e2e6 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -5,6 +5,8 @@
5#include <asm/cpucaps.h> 5#include <asm/cpucaps.h>
6#include <asm/insn.h> 6#include <asm/insn.h>
7 7
8#define ARM64_CB_PATCH ARM64_NCAPS
9
8#ifndef __ASSEMBLY__ 10#ifndef __ASSEMBLY__
9 11
10#include <linux/init.h> 12#include <linux/init.h>
@@ -22,12 +24,19 @@ struct alt_instr {
22 u8 alt_len; /* size of new instruction(s), <= orig_len */ 24 u8 alt_len; /* size of new instruction(s), <= orig_len */
23}; 25};
24 26
27typedef void (*alternative_cb_t)(struct alt_instr *alt,
28 __le32 *origptr, __le32 *updptr, int nr_inst);
29
25void __init apply_alternatives_all(void); 30void __init apply_alternatives_all(void);
26void apply_alternatives(void *start, size_t length); 31void apply_alternatives(void *start, size_t length);
27 32
28#define ALTINSTR_ENTRY(feature) \ 33#define ALTINSTR_ENTRY(feature,cb) \
29 " .word 661b - .\n" /* label */ \ 34 " .word 661b - .\n" /* label */ \
35 " .if " __stringify(cb) " == 0\n" \
30 " .word 663f - .\n" /* new instruction */ \ 36 " .word 663f - .\n" /* new instruction */ \
37 " .else\n" \
38 " .word " __stringify(cb) "- .\n" /* callback */ \
39 " .endif\n" \
31 " .hword " __stringify(feature) "\n" /* feature bit */ \ 40 " .hword " __stringify(feature) "\n" /* feature bit */ \
32 " .byte 662b-661b\n" /* source len */ \ 41 " .byte 662b-661b\n" /* source len */ \
33 " .byte 664f-663f\n" /* replacement len */ 42 " .byte 664f-663f\n" /* replacement len */
@@ -45,15 +54,18 @@ void apply_alternatives(void *start, size_t length);
45 * but most assemblers die if insn1 or insn2 have a .inst. This should 54 * but most assemblers die if insn1 or insn2 have a .inst. This should
46 * be fixed in a binutils release posterior to 2.25.51.0.2 (anything 55 * be fixed in a binutils release posterior to 2.25.51.0.2 (anything
47 * containing commit 4e4d08cf7399b606 or c1baaddf8861). 56 * containing commit 4e4d08cf7399b606 or c1baaddf8861).
57 *
58 * Alternatives with callbacks do not generate replacement instructions.
48 */ 59 */
49#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ 60#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled, cb) \
50 ".if "__stringify(cfg_enabled)" == 1\n" \ 61 ".if "__stringify(cfg_enabled)" == 1\n" \
51 "661:\n\t" \ 62 "661:\n\t" \
52 oldinstr "\n" \ 63 oldinstr "\n" \
53 "662:\n" \ 64 "662:\n" \
54 ".pushsection .altinstructions,\"a\"\n" \ 65 ".pushsection .altinstructions,\"a\"\n" \
55 ALTINSTR_ENTRY(feature) \ 66 ALTINSTR_ENTRY(feature,cb) \
56 ".popsection\n" \ 67 ".popsection\n" \
68 " .if " __stringify(cb) " == 0\n" \
57 ".pushsection .altinstr_replacement, \"a\"\n" \ 69 ".pushsection .altinstr_replacement, \"a\"\n" \
58 "663:\n\t" \ 70 "663:\n\t" \
59 newinstr "\n" \ 71 newinstr "\n" \
@@ -61,11 +73,17 @@ void apply_alternatives(void *start, size_t length);
61 ".popsection\n\t" \ 73 ".popsection\n\t" \
62 ".org . - (664b-663b) + (662b-661b)\n\t" \ 74 ".org . - (664b-663b) + (662b-661b)\n\t" \
63 ".org . - (662b-661b) + (664b-663b)\n" \ 75 ".org . - (662b-661b) + (664b-663b)\n" \
76 ".else\n\t" \
77 "663:\n\t" \
78 "664:\n\t" \
79 ".endif\n" \
64 ".endif\n" 80 ".endif\n"
65 81
66#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ 82#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \
67 __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) 83 __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg), 0)
68 84
85#define ALTERNATIVE_CB(oldinstr, cb) \
86 __ALTERNATIVE_CFG(oldinstr, "NOT_AN_INSTRUCTION", ARM64_CB_PATCH, 1, cb)
69#else 87#else
70 88
71#include <asm/assembler.h> 89#include <asm/assembler.h>
@@ -132,6 +150,14 @@ void apply_alternatives(void *start, size_t length);
132661: 150661:
133.endm 151.endm
134 152
153.macro alternative_cb cb
154 .set .Lasm_alt_mode, 0
155 .pushsection .altinstructions, "a"
156 altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0
157 .popsection
158661:
159.endm
160
135/* 161/*
136 * Provide the other half of the alternative code sequence. 162 * Provide the other half of the alternative code sequence.
137 */ 163 */
@@ -158,6 +184,13 @@ void apply_alternatives(void *start, size_t length);
158.endm 184.endm
159 185
160/* 186/*
187 * Callback-based alternative epilogue
188 */
189.macro alternative_cb_end
190662:
191.endm
192
193/*
161 * Provides a trivial alternative or default sequence consisting solely 194 * Provides a trivial alternative or default sequence consisting solely
162 * of NOPs. The number of NOPs is chosen automatically to match the 195 * of NOPs. The number of NOPs is chosen automatically to match the
163 * previous case. 196 * previous case.
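The alternative_cb_t typedef introduced above defines what a patch-time hook must look like. Purely as an illustrative sketch (the function below is hypothetical, not part of the series): a callback receives the original and update buffers plus the instruction count and writes whatever instructions it wants applied at boot; the series' kvm_update_va_mask is a real user of this hook.

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/alternative.h>
#include <asm/insn.h>

/* Hypothetical example callback: overwrite every slot of the alternative with
 * a NOP at patch time. It matches the alternative_cb_t signature and would be
 * referenced from asm as "alternative_cb example_nop_cb ... alternative_cb_end". */
void __init example_nop_cb(struct alt_instr *alt, __le32 *origptr,
			   __le32 *updptr, int nr_inst)
{
	int i;

	for (i = 0; i < nr_inst; i++)
		updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
}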
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 21bb624e0a7a..a311880feb0f 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -32,7 +32,7 @@
32#define ARM64_HAS_VIRT_HOST_EXTN 11 32#define ARM64_HAS_VIRT_HOST_EXTN 11
33#define ARM64_WORKAROUND_CAVIUM_27456 12 33#define ARM64_WORKAROUND_CAVIUM_27456 12
34#define ARM64_HAS_32BIT_EL0 13 34#define ARM64_HAS_32BIT_EL0 13
35#define ARM64_HYP_OFFSET_LOW 14 35#define ARM64_HARDEN_EL2_VECTORS 14
36#define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 36#define ARM64_MISMATCHED_CACHE_LINE_SIZE 15
37#define ARM64_HAS_NO_FPSIMD 16 37#define ARM64_HAS_NO_FPSIMD 16
38#define ARM64_WORKAROUND_REPEAT_TLBI 17 38#define ARM64_WORKAROUND_REPEAT_TLBI 17
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 4214c38d016b..f62c56b1793f 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -70,6 +70,7 @@ enum aarch64_insn_imm_type {
70 AARCH64_INSN_IMM_6, 70 AARCH64_INSN_IMM_6,
71 AARCH64_INSN_IMM_S, 71 AARCH64_INSN_IMM_S,
72 AARCH64_INSN_IMM_R, 72 AARCH64_INSN_IMM_R,
73 AARCH64_INSN_IMM_N,
73 AARCH64_INSN_IMM_MAX 74 AARCH64_INSN_IMM_MAX
74}; 75};
75 76
@@ -314,6 +315,11 @@ __AARCH64_INSN_FUNCS(eor, 0x7F200000, 0x4A000000)
314__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000) 315__AARCH64_INSN_FUNCS(eon, 0x7F200000, 0x4A200000)
315__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000) 316__AARCH64_INSN_FUNCS(ands, 0x7F200000, 0x6A000000)
316__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000) 317__AARCH64_INSN_FUNCS(bics, 0x7F200000, 0x6A200000)
318__AARCH64_INSN_FUNCS(and_imm, 0x7F800000, 0x12000000)
319__AARCH64_INSN_FUNCS(orr_imm, 0x7F800000, 0x32000000)
320__AARCH64_INSN_FUNCS(eor_imm, 0x7F800000, 0x52000000)
321__AARCH64_INSN_FUNCS(ands_imm, 0x7F800000, 0x72000000)
322__AARCH64_INSN_FUNCS(extr, 0x7FA00000, 0x13800000)
317__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000) 323__AARCH64_INSN_FUNCS(b, 0xFC000000, 0x14000000)
318__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000) 324__AARCH64_INSN_FUNCS(bl, 0xFC000000, 0x94000000)
319__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000) 325__AARCH64_INSN_FUNCS(cbz, 0x7F000000, 0x34000000)
@@ -423,6 +429,16 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
423 int shift, 429 int shift,
424 enum aarch64_insn_variant variant, 430 enum aarch64_insn_variant variant,
425 enum aarch64_insn_logic_type type); 431 enum aarch64_insn_logic_type type);
432u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
433 enum aarch64_insn_variant variant,
434 enum aarch64_insn_register Rn,
435 enum aarch64_insn_register Rd,
436 u64 imm);
437u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
438 enum aarch64_insn_register Rm,
439 enum aarch64_insn_register Rn,
440 enum aarch64_insn_register Rd,
441 u8 lsb);
426u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base, 442u32 aarch64_insn_gen_prefetch(enum aarch64_insn_register base,
427 enum aarch64_insn_prfm_type type, 443 enum aarch64_insn_prfm_type type,
428 enum aarch64_insn_prfm_target target, 444 enum aarch64_insn_prfm_target target,
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index b0c84171e6a3..6dd285e979c9 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -25,6 +25,7 @@
25/* Hyp Configuration Register (HCR) bits */ 25/* Hyp Configuration Register (HCR) bits */
26#define HCR_TEA (UL(1) << 37) 26#define HCR_TEA (UL(1) << 37)
27#define HCR_TERR (UL(1) << 36) 27#define HCR_TERR (UL(1) << 36)
28#define HCR_TLOR (UL(1) << 35)
28#define HCR_E2H (UL(1) << 34) 29#define HCR_E2H (UL(1) << 34)
29#define HCR_ID (UL(1) << 33) 30#define HCR_ID (UL(1) << 33)
30#define HCR_CD (UL(1) << 32) 31#define HCR_CD (UL(1) << 32)
@@ -64,6 +65,7 @@
64 65
65/* 66/*
66 * The bits we set in HCR: 67 * The bits we set in HCR:
68 * TLOR: Trap LORegion register accesses
67 * RW: 64bit by default, can be overridden for 32bit VMs 69 * RW: 64bit by default, can be overridden for 32bit VMs
68 * TAC: Trap ACTLR 70 * TAC: Trap ACTLR
69 * TSC: Trap SMC 71 * TSC: Trap SMC
@@ -81,9 +83,9 @@
81 */ 83 */
82#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ 84#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
83 HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \ 85 HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
84 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW) 86 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
87 HCR_FMO | HCR_IMO)
85#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) 88#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
86#define HCR_INT_OVERRIDE (HCR_FMO | HCR_IMO)
87#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) 89#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
88 90
89/* TCR_EL2 Registers bits */ 91/* TCR_EL2 Registers bits */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 24961b732e65..d53d40704416 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -33,6 +33,7 @@
33#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 33#define KVM_ARM64_DEBUG_DIRTY_SHIFT 0
34#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) 34#define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
35 35
36/* Translate a kernel address of @sym into its equivalent linear mapping */
36#define kvm_ksym_ref(sym) \ 37#define kvm_ksym_ref(sym) \
37 ({ \ 38 ({ \
38 void *val = &sym; \ 39 void *val = &sym; \
@@ -57,7 +58,9 @@ extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
57 58
58extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); 59extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
59 60
60extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); 61extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu);
62
63extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
61 64
62extern u64 __vgic_v3_get_ich_vtr_el2(void); 65extern u64 __vgic_v3_get_ich_vtr_el2(void);
63extern u64 __vgic_v3_read_vmcr(void); 66extern u64 __vgic_v3_read_vmcr(void);
@@ -70,6 +73,20 @@ extern u32 __init_stage2_translation(void);
70 73
71extern void __qcom_hyp_sanitize_btac_predictors(void); 74extern void __qcom_hyp_sanitize_btac_predictors(void);
72 75
76#else /* __ASSEMBLY__ */
77
78.macro get_host_ctxt reg, tmp
79 adr_l \reg, kvm_host_cpu_state
80 mrs \tmp, tpidr_el2
81 add \reg, \reg, \tmp
82.endm
83
84.macro get_vcpu_ptr vcpu, ctxt
85 get_host_ctxt \ctxt, \vcpu
86 ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
87 kern_hyp_va \vcpu
88.endm
89
73#endif 90#endif
74 91
75#endif /* __ARM_KVM_ASM_H__ */ 92#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 413dc82b1e89..23b33e8ea03a 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -26,13 +26,15 @@
26 26
27#include <asm/esr.h> 27#include <asm/esr.h>
28#include <asm/kvm_arm.h> 28#include <asm/kvm_arm.h>
29#include <asm/kvm_hyp.h>
29#include <asm/kvm_mmio.h> 30#include <asm/kvm_mmio.h>
30#include <asm/ptrace.h> 31#include <asm/ptrace.h>
31#include <asm/cputype.h> 32#include <asm/cputype.h>
32#include <asm/virt.h> 33#include <asm/virt.h>
33 34
34unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num); 35unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
35unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu); 36unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu);
37void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v);
36 38
37bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); 39bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
38void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); 40void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
@@ -45,6 +47,11 @@ void kvm_inject_undef32(struct kvm_vcpu *vcpu);
45void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); 47void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
46void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); 48void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
47 49
50static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
51{
52 return !(vcpu->arch.hcr_el2 & HCR_RW);
53}
54
48static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) 55static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
49{ 56{
50 vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; 57 vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
@@ -59,16 +66,19 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
59 66
60 if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) 67 if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
61 vcpu->arch.hcr_el2 &= ~HCR_RW; 68 vcpu->arch.hcr_el2 &= ~HCR_RW;
62}
63 69
64static inline unsigned long vcpu_get_hcr(struct kvm_vcpu *vcpu) 70 /*
65{ 71 * TID3: trap feature register accesses that we virtualise.
66 return vcpu->arch.hcr_el2; 72 * For now this is conditional, since no AArch32 feature regs
73 * are currently virtualised.
74 */
75 if (!vcpu_el1_is_32bit(vcpu))
76 vcpu->arch.hcr_el2 |= HCR_TID3;
67} 77}
68 78
69static inline void vcpu_set_hcr(struct kvm_vcpu *vcpu, unsigned long hcr) 79static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
70{ 80{
71 vcpu->arch.hcr_el2 = hcr; 81 return (unsigned long *)&vcpu->arch.hcr_el2;
72} 82}
73 83
74static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr) 84static inline void vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr)
@@ -81,11 +91,27 @@ static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
81 return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc; 91 return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
82} 92}
83 93
84static inline unsigned long *vcpu_elr_el1(const struct kvm_vcpu *vcpu) 94static inline unsigned long *__vcpu_elr_el1(const struct kvm_vcpu *vcpu)
85{ 95{
86 return (unsigned long *)&vcpu_gp_regs(vcpu)->elr_el1; 96 return (unsigned long *)&vcpu_gp_regs(vcpu)->elr_el1;
87} 97}
88 98
99static inline unsigned long vcpu_read_elr_el1(const struct kvm_vcpu *vcpu)
100{
101 if (vcpu->arch.sysregs_loaded_on_cpu)
102 return read_sysreg_el1(elr);
103 else
104 return *__vcpu_elr_el1(vcpu);
105}
106
107static inline void vcpu_write_elr_el1(const struct kvm_vcpu *vcpu, unsigned long v)
108{
109 if (vcpu->arch.sysregs_loaded_on_cpu)
110 write_sysreg_el1(v, elr);
111 else
112 *__vcpu_elr_el1(vcpu) = v;
113}
114
89static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) 115static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
90{ 116{
91 return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pstate; 117 return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pstate;
@@ -135,13 +161,28 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
135 vcpu_gp_regs(vcpu)->regs.regs[reg_num] = val; 161 vcpu_gp_regs(vcpu)->regs.regs[reg_num] = val;
136} 162}
137 163
138/* Get vcpu SPSR for current mode */ 164static inline unsigned long vcpu_read_spsr(const struct kvm_vcpu *vcpu)
139static inline unsigned long *vcpu_spsr(const struct kvm_vcpu *vcpu)
140{ 165{
141 if (vcpu_mode_is_32bit(vcpu)) 166 if (vcpu_mode_is_32bit(vcpu))
142 return vcpu_spsr32(vcpu); 167 return vcpu_read_spsr32(vcpu);
143 168
144 return (unsigned long *)&vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1]; 169 if (vcpu->arch.sysregs_loaded_on_cpu)
170 return read_sysreg_el1(spsr);
171 else
172 return vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1];
173}
174
175static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v)
176{
177 if (vcpu_mode_is_32bit(vcpu)) {
178 vcpu_write_spsr32(vcpu, v);
179 return;
180 }
181
182 if (vcpu->arch.sysregs_loaded_on_cpu)
183 write_sysreg_el1(v, spsr);
184 else
185 vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;
145} 186}
146 187
147static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) 188static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
@@ -282,15 +323,18 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
282 323
283static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) 324static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
284{ 325{
285 return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; 326 return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
286} 327}
287 328
288static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) 329static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
289{ 330{
290 if (vcpu_mode_is_32bit(vcpu)) 331 if (vcpu_mode_is_32bit(vcpu)) {
291 *vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT; 332 *vcpu_cpsr(vcpu) |= COMPAT_PSR_E_BIT;
292 else 333 } else {
293 vcpu_sys_reg(vcpu, SCTLR_EL1) |= (1 << 25); 334 u64 sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
335 sctlr |= (1 << 25);
336 vcpu_write_sys_reg(vcpu, SCTLR_EL1, sctlr);
337 }
294} 338}
295 339
296static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) 340static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
@@ -298,7 +342,7 @@ static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
298 if (vcpu_mode_is_32bit(vcpu)) 342 if (vcpu_mode_is_32bit(vcpu))
299 return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT); 343 return !!(*vcpu_cpsr(vcpu) & COMPAT_PSR_E_BIT);
300 344
301 return !!(vcpu_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); 345 return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & (1 << 25));
302} 346}
303 347
304static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, 348static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 596f8e414a4c..ab46bc70add6 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -272,9 +272,6 @@ struct kvm_vcpu_arch {
272 /* IO related fields */ 272 /* IO related fields */
273 struct kvm_decode mmio_decode; 273 struct kvm_decode mmio_decode;
274 274
275 /* Interrupt related fields */
276 u64 irq_lines; /* IRQ and FIQ levels */
277
278 /* Cache some mmu pages needed inside spinlock regions */ 275 /* Cache some mmu pages needed inside spinlock regions */
279 struct kvm_mmu_memory_cache mmu_page_cache; 276 struct kvm_mmu_memory_cache mmu_page_cache;
280 277
@@ -287,10 +284,25 @@ struct kvm_vcpu_arch {
287 284
288 /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ 285 /* Virtual SError ESR to restore when HCR_EL2.VSE is set */
289 u64 vsesr_el2; 286 u64 vsesr_el2;
287
288 /* True when deferrable sysregs are loaded on the physical CPU,
289 * see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
290 bool sysregs_loaded_on_cpu;
290}; 291};
291 292
292#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs) 293#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs)
293#define vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) 294
295/*
296 * Only use __vcpu_sys_reg if you know you want the memory backed version of a
297 * register, and not the one most recently accessed by a running VCPU. For
298 * example, for userspace access or for system registers that are never context
299 * switched, but only emulated.
300 */
301#define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)])
302
303u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg);
304void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
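With the macro gone, callers have to do an explicit read-modify-write through the new functions, as the debug code later in this diff does for MDSCR_EL1. A hypothetical convenience wrapper, shown only to illustrate the pattern (it is not part of the patch; the argument order follows the prototype above):

/* Hypothetical helper illustrating the read-modify-write pattern;
 * the patch open-codes this at each call site. */
static inline void vcpu_rmw_sys_reg(struct kvm_vcpu *vcpu, int reg,
				    u64 set, u64 clear)
{
	u64 val = vcpu_read_sys_reg(vcpu, reg);

	val |= set;
	val &= ~clear;
	vcpu_write_sys_reg(vcpu, val, reg);
}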
305
294/* 306/*
295 * CP14 and CP15 live in the same array, as they are backed by the 307 * CP14 and CP15 live in the same array, as they are backed by the
296 * same system registers. 308 * same system registers.
@@ -298,14 +310,6 @@ struct kvm_vcpu_arch {
298#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r)]) 310#define vcpu_cp14(v,r) ((v)->arch.ctxt.copro[(r)])
299#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r)]) 311#define vcpu_cp15(v,r) ((v)->arch.ctxt.copro[(r)])
300 312
301#ifdef CONFIG_CPU_BIG_ENDIAN
302#define vcpu_cp15_64_high(v,r) vcpu_cp15((v),(r))
303#define vcpu_cp15_64_low(v,r) vcpu_cp15((v),(r) + 1)
304#else
305#define vcpu_cp15_64_high(v,r) vcpu_cp15((v),(r) + 1)
306#define vcpu_cp15_64_low(v,r) vcpu_cp15((v),(r))
307#endif
308
309struct kvm_vm_stat { 313struct kvm_vm_stat {
310 ulong remote_tlb_flush; 314 ulong remote_tlb_flush;
311}; 315};
@@ -358,10 +362,15 @@ int kvm_perf_teardown(void);
358 362
359struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); 363struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
360 364
365void __kvm_set_tpidr_el2(u64 tpidr_el2);
366DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
367
361static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, 368static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
362 unsigned long hyp_stack_ptr, 369 unsigned long hyp_stack_ptr,
363 unsigned long vector_ptr) 370 unsigned long vector_ptr)
364{ 371{
372 u64 tpidr_el2;
373
365 /* 374 /*
366 * Call initialization code, and switch to the full blown HYP code. 375 * Call initialization code, and switch to the full blown HYP code.
367 * If the cpucaps haven't been finalized yet, something has gone very 376 * If the cpucaps haven't been finalized yet, something has gone very
@@ -370,6 +379,16 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
370 */ 379 */
371 BUG_ON(!static_branch_likely(&arm64_const_caps_ready)); 380 BUG_ON(!static_branch_likely(&arm64_const_caps_ready));
372 __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); 381 __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
382
383 /*
384 * Calculate the raw per-cpu offset without a translation from the
385 * kernel's mapping to the linear mapping, and store it in tpidr_el2
386 * so that we can use adr_l to access per-cpu variables in EL2.
387 */
388 tpidr_el2 = (u64)this_cpu_ptr(&kvm_host_cpu_state)
389 - (u64)kvm_ksym_ref(kvm_host_cpu_state);
390
391 kvm_call_hyp(__kvm_set_tpidr_el2, tpidr_el2);
373} 392}
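The value stored in tpidr_el2 is plain pointer arithmetic: the distance between this CPU's copy of kvm_host_cpu_state and the symbol's base address, so that EL2 code can add it to a PC-relative (adr_l) address of the symbol. A rough userspace model of the idea, with an array standing in for the per-cpu area (host_state, NR_CPUS and the cpu value are made up):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/* Stand-in for DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state) */
static struct { uint64_t regs[8]; } host_state[NR_CPUS];

int main(void)
{
	int cpu = 2;	/* pretend we are initialising CPU 2 */

	/* What __cpu_init_hyp_mode stores in tpidr_el2 */
	ptrdiff_t tpidr_el2 = (char *)&host_state[cpu] - (char *)&host_state[0];

	/*
	 * What hyp code does later: take a PC-relative address of the
	 * symbol and add tpidr_el2 to reach this CPU's copy.
	 */
	void *this_cpu_state = (char *)&host_state[0] + tpidr_el2;

	printf("offset %td, match %d\n", tpidr_el2,
	       this_cpu_state == (void *)&host_state[cpu]);
	return 0;
}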
374 393
375static inline void kvm_arch_hardware_unsetup(void) {} 394static inline void kvm_arch_hardware_unsetup(void) {}
@@ -416,6 +435,13 @@ static inline void kvm_arm_vhe_guest_enter(void)
416static inline void kvm_arm_vhe_guest_exit(void) 435static inline void kvm_arm_vhe_guest_exit(void)
417{ 436{
418 local_daif_restore(DAIF_PROCCTX_NOIRQ); 437 local_daif_restore(DAIF_PROCCTX_NOIRQ);
438
439 /*
440 * When we exit from the guest we change a number of CPU configuration
441 * parameters, such as traps. Make sure these changes take effect
442 * before running the host or additional guests.
443 */
444 isb();
419} 445}
420 446
421static inline bool kvm_arm_harden_branch_predictor(void) 447static inline bool kvm_arm_harden_branch_predictor(void)
@@ -423,4 +449,7 @@ static inline bool kvm_arm_harden_branch_predictor(void)
423 return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR); 449 return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
424} 450}
425 451
452void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
453void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
454
426#endif /* __ARM64_KVM_HOST_H__ */ 455#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index f26f9cd70c72..384c34397619 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -120,37 +120,38 @@ typeof(orig) * __hyp_text fname(void) \
120 return val; \ 120 return val; \
121} 121}
122 122
123void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
124void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
125int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu); 123int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
126 124
127void __vgic_v3_save_state(struct kvm_vcpu *vcpu); 125void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
128void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); 126void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
127void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
128void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
129void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
130void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
129int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); 131int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
130 132
131void __timer_enable_traps(struct kvm_vcpu *vcpu); 133void __timer_enable_traps(struct kvm_vcpu *vcpu);
132void __timer_disable_traps(struct kvm_vcpu *vcpu); 134void __timer_disable_traps(struct kvm_vcpu *vcpu);
133 135
134void __sysreg_save_host_state(struct kvm_cpu_context *ctxt); 136void __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt);
135void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt); 137void __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt);
136void __sysreg_save_guest_state(struct kvm_cpu_context *ctxt); 138void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt);
137void __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt); 139void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt);
140void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt);
141void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt);
138void __sysreg32_save_state(struct kvm_vcpu *vcpu); 142void __sysreg32_save_state(struct kvm_vcpu *vcpu);
139void __sysreg32_restore_state(struct kvm_vcpu *vcpu); 143void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
140 144
141void __debug_save_state(struct kvm_vcpu *vcpu, 145void __debug_switch_to_guest(struct kvm_vcpu *vcpu);
142 struct kvm_guest_debug_arch *dbg, 146void __debug_switch_to_host(struct kvm_vcpu *vcpu);
143 struct kvm_cpu_context *ctxt);
144void __debug_restore_state(struct kvm_vcpu *vcpu,
145 struct kvm_guest_debug_arch *dbg,
146 struct kvm_cpu_context *ctxt);
147void __debug_cond_save_host_state(struct kvm_vcpu *vcpu);
148void __debug_cond_restore_host_state(struct kvm_vcpu *vcpu);
149 147
150void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); 148void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
151void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); 149void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
152bool __fpsimd_enabled(void); 150bool __fpsimd_enabled(void);
153 151
152void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
153void deactivate_traps_vhe_put(void);
154
154u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); 155u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
155void __noreturn __hyp_do_panic(unsigned long, ...); 156void __noreturn __hyp_do_panic(unsigned long, ...);
156 157
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7faed6e48b46..082110993647 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -69,9 +69,6 @@
69 * mappings, and none of this applies in that case. 69 * mappings, and none of this applies in that case.
70 */ 70 */
71 71
72#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1)
73#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1)
74
75#ifdef __ASSEMBLY__ 72#ifdef __ASSEMBLY__
76 73
77#include <asm/alternative.h> 74#include <asm/alternative.h>
@@ -81,28 +78,19 @@
81 * Convert a kernel VA into a HYP VA. 78 * Convert a kernel VA into a HYP VA.
82 * reg: VA to be converted. 79 * reg: VA to be converted.
83 * 80 *
84 * This generates the following sequences: 81 * The actual code generation takes place in kvm_update_va_mask, and
85 * - High mask: 82 * the instructions below are only there to reserve the space and
86 * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK 83 * perform the register allocation (kvm_update_va_mask uses the
87 * nop 84 * specific registers encoded in the instructions).
88 * - Low mask:
89 * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
90 * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
91 * - VHE:
92 * nop
93 * nop
94 *
95 * The "low mask" version works because the mask is a strict subset of
96 * the "high mask", hence performing the first mask for nothing.
97 * Should be completely invisible on any viable CPU.
98 */ 85 */
99.macro kern_hyp_va reg 86.macro kern_hyp_va reg
100alternative_if_not ARM64_HAS_VIRT_HOST_EXTN 87alternative_cb kvm_update_va_mask
101 and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK 88 and \reg, \reg, #1 /* mask with va_mask */
102alternative_else_nop_endif 89 ror \reg, \reg, #1 /* rotate to the first tag bit */
103alternative_if ARM64_HYP_OFFSET_LOW 90 add \reg, \reg, #0 /* insert the low 12 bits of the tag */
104 and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK 91 add \reg, \reg, #0, lsl 12 /* insert the top 12 bits of the tag */
105alternative_else_nop_endif 92 ror \reg, \reg, #63 /* rotate back */
93alternative_cb_end
106.endm 94.endm
107 95
108#else 96#else
@@ -113,24 +101,44 @@ alternative_else_nop_endif
113#include <asm/mmu_context.h> 101#include <asm/mmu_context.h>
114#include <asm/pgtable.h> 102#include <asm/pgtable.h>
115 103
104void kvm_update_va_mask(struct alt_instr *alt,
105 __le32 *origptr, __le32 *updptr, int nr_inst);
106
116static inline unsigned long __kern_hyp_va(unsigned long v) 107static inline unsigned long __kern_hyp_va(unsigned long v)
117{ 108{
118 asm volatile(ALTERNATIVE("and %0, %0, %1", 109 asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"
119 "nop", 110 "ror %0, %0, #1\n"
120 ARM64_HAS_VIRT_HOST_EXTN) 111 "add %0, %0, #0\n"
121 : "+r" (v) 112 "add %0, %0, #0, lsl 12\n"
122 : "i" (HYP_PAGE_OFFSET_HIGH_MASK)); 113 "ror %0, %0, #63\n",
123 asm volatile(ALTERNATIVE("nop", 114 kvm_update_va_mask)
124 "and %0, %0, %1", 115 : "+r" (v));
125 ARM64_HYP_OFFSET_LOW)
126 : "+r" (v)
127 : "i" (HYP_PAGE_OFFSET_LOW_MASK));
128 return v; 116 return v;
129} 117}
130 118
131#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) 119#define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v))))
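Going by the instruction comments above, the patched sequence amounts to masking the kernel VA and inserting a tag above the kept bits; the and/ror/add/add/ror dance only exists because each instruction can carry a limited immediate. A standalone sketch of that arithmetic; va_mask, tag_lsb and the tag value are made-up placeholders for what kvm_update_va_mask computes at boot:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t v, unsigned int n)
{
	n &= 63;
	return n ? (v >> n) | (v << (64 - n)) : v;
}

int main(void)
{
	/* Placeholder values; the real ones are computed at boot. */
	const unsigned int tag_lsb = 40;
	const uint64_t va_mask = (UINT64_C(1) << tag_lsb) - 1;
	const uint64_t tag = 0x9a;		/* "random" HYP tag */
	const uint64_t va  = UINT64_C(0xffff800012345678);
	uint64_t v = va;

	v &= va_mask;			/* and x0, x0, #va_mask              */
	v  = ror64(v, tag_lsb);		/* ror x0, x0, #tag_lsb              */
	v += tag & 0xfff;		/* add x0, x0, #(tag & 0xfff)        */
	v += tag & 0xfff000;		/* add x0, x0, #(tag >> 12), lsl 12  */
	v  = ror64(v, 64 - tag_lsb);	/* rotate back                       */

	assert(v == ((va & va_mask) | (tag << tag_lsb)));
	printf("hyp VA: %#llx\n", (unsigned long long)v);
	return 0;
}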
132 120
133/* 121/*
122 * Obtain the PC-relative address of a kernel symbol
123 * s: symbol
124 *
125 * The goal of this macro is to return a symbol's address based on a
126 * PC-relative computation, as opposed to a loading the VA from a
127 * constant pool or something similar. This works well for HYP, as an
128 * absolute VA is guaranteed to be wrong. Only use this if trying to
129 * obtain the address of a symbol (i.e. not something you obtained by
130 * following a pointer).
131 */
132#define hyp_symbol_addr(s) \
133 ({ \
134 typeof(s) *addr; \
135 asm("adrp %0, %1\n" \
136 "add %0, %0, :lo12:%1\n" \
137 : "=r" (addr) : "S" (&s)); \
138 addr; \
139 })
140
141/*
134 * We currently only support a 40bit IPA. 142 * We currently only support a 40bit IPA.
135 */ 143 */
136#define KVM_PHYS_SHIFT (40) 144#define KVM_PHYS_SHIFT (40)
@@ -140,7 +148,11 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
140#include <asm/stage2_pgtable.h> 148#include <asm/stage2_pgtable.h>
141 149
142int create_hyp_mappings(void *from, void *to, pgprot_t prot); 150int create_hyp_mappings(void *from, void *to, pgprot_t prot);
143int create_hyp_io_mappings(void *from, void *to, phys_addr_t); 151int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
152 void __iomem **kaddr,
153 void __iomem **haddr);
154int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
155 void **haddr);
144void free_hyp_pgds(void); 156void free_hyp_pgds(void);
145 157
146void stage2_unmap_vm(struct kvm *kvm); 158void stage2_unmap_vm(struct kvm *kvm);
@@ -249,7 +261,7 @@ struct kvm;
249 261
250static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) 262static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
251{ 263{
252 return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101; 264 return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
253} 265}
254 266
255static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) 267static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
@@ -348,36 +360,95 @@ static inline unsigned int kvm_get_vmid_bits(void)
348 return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; 360 return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
349} 361}
350 362
351#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR 363#ifdef CONFIG_KVM_INDIRECT_VECTORS
364/*
365 * EL2 vectors can be mapped and rerouted in a number of ways,
366 * depending on the kernel configuration and CPU present:
367 *
368 * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
369 * hardening sequence is placed in one of the vector slots, which is
370 * executed before jumping to the real vectors.
371 *
372 * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
373 * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
374 * hardening sequence is mapped next to the idmap page, and executed
375 * before jumping to the real vectors.
376 *
377 * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
378 * empty slot is selected, mapped next to the idmap page, and
379 * executed before jumping to the real vectors.
380 *
381 * Note that ARM64_HARDEN_EL2_VECTORS is somewhat incompatible with
382 * VHE, as we don't have hypervisor-specific mappings. If the system
383 * is VHE and yet selects this capability, it will be ignored.
384 */
352#include <asm/mmu.h> 385#include <asm/mmu.h>
353 386
387extern void *__kvm_bp_vect_base;
388extern int __kvm_harden_el2_vector_slot;
389
354static inline void *kvm_get_hyp_vector(void) 390static inline void *kvm_get_hyp_vector(void)
355{ 391{
356 struct bp_hardening_data *data = arm64_get_bp_hardening_data(); 392 struct bp_hardening_data *data = arm64_get_bp_hardening_data();
357 void *vect = kvm_ksym_ref(__kvm_hyp_vector); 393 void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
394 int slot = -1;
358 395
359 if (data->fn) { 396 if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
360 vect = __bp_harden_hyp_vecs_start + 397 vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs_start));
361 data->hyp_vectors_slot * SZ_2K; 398 slot = data->hyp_vectors_slot;
399 }
362 400
363 if (!has_vhe()) 401 if (this_cpu_has_cap(ARM64_HARDEN_EL2_VECTORS) && !has_vhe()) {
364 vect = lm_alias(vect); 402 vect = __kvm_bp_vect_base;
403 if (slot == -1)
404 slot = __kvm_harden_el2_vector_slot;
365 } 405 }
366 406
407 if (slot != -1)
408 vect += slot * SZ_2K;
409
367 return vect; 410 return vect;
368} 411}
369 412
413/* This is only called on a !VHE system */
370static inline int kvm_map_vectors(void) 414static inline int kvm_map_vectors(void)
371{ 415{
372 return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start), 416 /*
373 kvm_ksym_ref(__bp_harden_hyp_vecs_end), 417 * HBP = ARM64_HARDEN_BRANCH_PREDICTOR
374 PAGE_HYP_EXEC); 418 * HEL2 = ARM64_HARDEN_EL2_VECTORS
375} 419 *
420 * !HBP + !HEL2 -> use direct vectors
421 * HBP + !HEL2 -> use hardened vectors in place
422 * !HBP + HEL2 -> allocate one vector slot and use exec mapping
423	 * HBP + HEL2 -> use hardened vectors and use exec mapping
424 */
425 if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
426 __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs_start);
427 __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
428 }
429
430 if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
431 phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs_start);
432 unsigned long size = (__bp_harden_hyp_vecs_end -
433 __bp_harden_hyp_vecs_start);
434
435 /*
436 * Always allocate a spare vector slot, as we don't
437 * know yet which CPUs have a BP hardening slot that
438 * we can reuse.
439 */
440 __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
441 BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
442 return create_hyp_exec_mappings(vect_pa, size,
443 &__kvm_bp_vect_base);
444 }
376 445
446 return 0;
447}
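Both kvm_map_vectors() above and the BP-hardening install path later in the diff draw slots from the same arm64_el2_vector_last_slot counter, which is initialised to -1 so that the first atomic_inc_return() hands out slot 0, and each slot is one 2K copy of the 16 vectors. A small C11 model of that allocation rule (alloc_slot and the loop are invented for illustration):

#include <stdatomic.h>
#include <assert.h>
#include <stdio.h>

#define BP_HARDEN_EL2_SLOTS	4
#define SZ_2K			2048

static atomic_int last_slot = -1;	/* arm64_el2_vector_last_slot */

static int alloc_slot(void)
{
	int slot = atomic_fetch_add(&last_slot, 1) + 1; /* atomic_inc_return() */

	assert(slot < BP_HARDEN_EL2_SLOTS);		/* the BUG_ON()s above */
	return slot;
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		int slot = alloc_slot();
		printf("slot %d -> byte offset %#x\n", slot, slot * SZ_2K);
	}
	return 0;
}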
377#else 448#else
378static inline void *kvm_get_hyp_vector(void) 449static inline void *kvm_get_hyp_vector(void)
379{ 450{
380 return kvm_ksym_ref(__kvm_hyp_vector); 451 return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
381} 452}
382 453
383static inline int kvm_map_vectors(void) 454static inline int kvm_map_vectors(void)
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index a050d4f3615d..dd320df0d026 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -21,6 +21,8 @@
21#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT) 21#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT)
22#define TTBR_ASID_MASK (UL(0xffff) << 48) 22#define TTBR_ASID_MASK (UL(0xffff) << 48)
23 23
24#define BP_HARDEN_EL2_SLOTS 4
25
24#ifndef __ASSEMBLY__ 26#ifndef __ASSEMBLY__
25 27
26typedef struct { 28typedef struct {
@@ -49,9 +51,13 @@ struct bp_hardening_data {
49 bp_hardening_cb_t fn; 51 bp_hardening_cb_t fn;
50}; 52};
51 53
52#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR 54#if (defined(CONFIG_HARDEN_BRANCH_PREDICTOR) || \
55 defined(CONFIG_HARDEN_EL2_VECTORS))
53extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[]; 56extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[];
57extern atomic_t arm64_el2_vector_last_slot;
58#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR || CONFIG_HARDEN_EL2_VECTORS */
54 59
60#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
55DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); 61DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
56 62
57static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) 63static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index e7b9f154e476..6171178075dc 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -288,6 +288,12 @@
288#define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0) 288#define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0)
289#define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0) 289#define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0)
290 290
291#define SYS_LORSA_EL1 sys_reg(3, 0, 10, 4, 0)
292#define SYS_LOREA_EL1 sys_reg(3, 0, 10, 4, 1)
293#define SYS_LORN_EL1 sys_reg(3, 0, 10, 4, 2)
294#define SYS_LORC_EL1 sys_reg(3, 0, 10, 4, 3)
295#define SYS_LORID_EL1 sys_reg(3, 0, 10, 4, 7)
296
291#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0) 297#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0)
292#define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1) 298#define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1)
293 299
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 6a4bd80c75bd..9b55a3f24be7 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -55,9 +55,7 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
55arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 55arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
56arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o 56arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
57 57
58ifeq ($(CONFIG_KVM),y) 58arm64-obj-$(CONFIG_KVM_INDIRECT_VECTORS)+= bpi.o
59arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o
60endif
61 59
62obj-y += $(arm64-obj-y) vdso/ probes/ 60obj-y += $(arm64-obj-y) vdso/ probes/
63obj-m += $(arm64-obj-m) 61obj-m += $(arm64-obj-m)
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 414288a558c8..5c4bce4ac381 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -107,32 +107,53 @@ static u32 get_alt_insn(struct alt_instr *alt, __le32 *insnptr, __le32 *altinsnp
107 return insn; 107 return insn;
108} 108}
109 109
110static void patch_alternative(struct alt_instr *alt,
111 __le32 *origptr, __le32 *updptr, int nr_inst)
112{
113 __le32 *replptr;
114 int i;
115
116 replptr = ALT_REPL_PTR(alt);
117 for (i = 0; i < nr_inst; i++) {
118 u32 insn;
119
120 insn = get_alt_insn(alt, origptr + i, replptr + i);
121 updptr[i] = cpu_to_le32(insn);
122 }
123}
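With ARM64_CB_PATCH, the "replacement" pointer is reinterpreted as an alternative_cb_t and invoked to generate the new instructions itself. As a shape example only, a hypothetical callback with the same signature as patch_alternative() that simply NOPs out the patched region might look like this (nop_out_region is not part of the patch):

/* Hypothetical example callback, for illustration only. */
static void nop_out_region(struct alt_instr *alt,
			   __le32 *origptr, __le32 *updptr, int nr_inst)
{
	int i;

	for (i = 0; i < nr_inst; i++)
		updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
}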
124
110static void __apply_alternatives(void *alt_region, bool use_linear_alias) 125static void __apply_alternatives(void *alt_region, bool use_linear_alias)
111{ 126{
112 struct alt_instr *alt; 127 struct alt_instr *alt;
113 struct alt_region *region = alt_region; 128 struct alt_region *region = alt_region;
114 __le32 *origptr, *replptr, *updptr; 129 __le32 *origptr, *updptr;
130 alternative_cb_t alt_cb;
115 131
116 for (alt = region->begin; alt < region->end; alt++) { 132 for (alt = region->begin; alt < region->end; alt++) {
117 u32 insn; 133 int nr_inst;
118 int i, nr_inst;
119 134
120 if (!cpus_have_cap(alt->cpufeature)) 135 /* Use ARM64_CB_PATCH as an unconditional patch */
136 if (alt->cpufeature < ARM64_CB_PATCH &&
137 !cpus_have_cap(alt->cpufeature))
121 continue; 138 continue;
122 139
123 BUG_ON(alt->alt_len != alt->orig_len); 140 if (alt->cpufeature == ARM64_CB_PATCH)
141 BUG_ON(alt->alt_len != 0);
142 else
143 BUG_ON(alt->alt_len != alt->orig_len);
124 144
125 pr_info_once("patching kernel code\n"); 145 pr_info_once("patching kernel code\n");
126 146
127 origptr = ALT_ORIG_PTR(alt); 147 origptr = ALT_ORIG_PTR(alt);
128 replptr = ALT_REPL_PTR(alt);
129 updptr = use_linear_alias ? lm_alias(origptr) : origptr; 148 updptr = use_linear_alias ? lm_alias(origptr) : origptr;
130 nr_inst = alt->alt_len / sizeof(insn); 149 nr_inst = alt->orig_len / AARCH64_INSN_SIZE;
131 150
132 for (i = 0; i < nr_inst; i++) { 151 if (alt->cpufeature < ARM64_CB_PATCH)
133 insn = get_alt_insn(alt, origptr + i, replptr + i); 152 alt_cb = patch_alternative;
134 updptr[i] = cpu_to_le32(insn); 153 else
135 } 154 alt_cb = ALT_REPL_PTR(alt);
155
156 alt_cb(alt, origptr, updptr, nr_inst);
136 157
137 flush_icache_range((uintptr_t)origptr, 158 flush_icache_range((uintptr_t)origptr,
138 (uintptr_t)(origptr + nr_inst)); 159 (uintptr_t)(origptr + nr_inst));
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 1303e04110cd..78e1b0a70aaf 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -138,6 +138,7 @@ int main(void)
138 DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs)); 138 DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs));
139 DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2])); 139 DEFINE(VCPU_FPEXC32_EL2, offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
140 DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); 140 DEFINE(VCPU_HOST_CONTEXT, offsetof(struct kvm_vcpu, arch.host_cpu_context));
141 DEFINE(HOST_CONTEXT_VCPU, offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
141#endif 142#endif
142#ifdef CONFIG_CPU_PM 143#ifdef CONFIG_CPU_PM
143 DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx)); 144 DEFINE(CPU_SUSPEND_SZ, sizeof(struct cpu_suspend_ctx));
diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S
index e5de33513b5d..bb0b67722e86 100644
--- a/arch/arm64/kernel/bpi.S
+++ b/arch/arm64/kernel/bpi.S
@@ -19,42 +19,61 @@
19#include <linux/linkage.h> 19#include <linux/linkage.h>
20#include <linux/arm-smccc.h> 20#include <linux/arm-smccc.h>
21 21
22.macro ventry target 22#include <asm/alternative.h>
23 .rept 31 23#include <asm/mmu.h>
24
25.macro hyp_ventry
26 .align 7
271: .rept 27
24 nop 28 nop
25 .endr 29 .endr
26 b \target 30/*
31 * The default sequence is to directly branch to the KVM vectors,
32 * using the computed offset. This applies for VHE as well as
33 * !ARM64_HARDEN_EL2_VECTORS.
34 *
35 * For ARM64_HARDEN_EL2_VECTORS configurations, this gets replaced
36 * with:
37 *
38 * stp x0, x1, [sp, #-16]!
39 * movz x0, #(addr & 0xffff)
40 * movk x0, #((addr >> 16) & 0xffff), lsl #16
41 * movk x0, #((addr >> 32) & 0xffff), lsl #32
42 * br x0
43 *
44 * Where addr = kern_hyp_va(__kvm_hyp_vector) + vector-offset + 4.
45 * See kvm_patch_vector_branch for details.
46 */
47alternative_cb kvm_patch_vector_branch
48 b __kvm_hyp_vector + (1b - 0b)
49 nop
50 nop
51 nop
52 nop
53alternative_cb_end
27.endm 54.endm
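The replacement sequence in the comment rebuilds the target address from three 16-bit chunks, which is all movz/movk can carry per instruction. A quick standalone check of that split-and-rebuild step, which kvm_patch_vector_branch (not shown in this hunk) has to perform; the address value is arbitrary:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t addr = UINT64_C(0x0000a1b2c3d4e5f6);	/* arbitrary HYP VA */

	uint16_t movz   = addr & 0xffff;		/* movz x0, #...          */
	uint16_t movk16 = (addr >> 16) & 0xffff;	/* movk x0, #..., lsl #16 */
	uint16_t movk32 = (addr >> 32) & 0xffff;	/* movk x0, #..., lsl #32 */

	uint64_t rebuilt = (uint64_t)movz |
			   ((uint64_t)movk16 << 16) |
			   ((uint64_t)movk32 << 32);

	/* Holds as long as the address fits in 48 bits, which HYP VAs do. */
	assert(rebuilt == addr);
	return 0;
}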
28 55
29.macro vectors target 56.macro generate_vectors
30 ventry \target + 0x000 570:
31 ventry \target + 0x080 58 .rept 16
32 ventry \target + 0x100 59 hyp_ventry
33 ventry \target + 0x180 60 .endr
34 61 .org 0b + SZ_2K // Safety measure
35 ventry \target + 0x200 62.endm
36 ventry \target + 0x280
37 ventry \target + 0x300
38 ventry \target + 0x380
39 63
40 ventry \target + 0x400
41 ventry \target + 0x480
42 ventry \target + 0x500
43 ventry \target + 0x580
44 64
45 ventry \target + 0x600 65 .text
46 ventry \target + 0x680 66 .pushsection .hyp.text, "ax"
47 ventry \target + 0x700
48 ventry \target + 0x780
49.endm
50 67
51 .align 11 68 .align 11
52ENTRY(__bp_harden_hyp_vecs_start) 69ENTRY(__bp_harden_hyp_vecs_start)
53 .rept 4 70 .rept BP_HARDEN_EL2_SLOTS
54 vectors __kvm_hyp_vector 71 generate_vectors
55 .endr 72 .endr
56ENTRY(__bp_harden_hyp_vecs_end) 73ENTRY(__bp_harden_hyp_vecs_end)
57 74
75 .popsection
76
58ENTRY(__qcom_hyp_sanitize_link_stack_start) 77ENTRY(__qcom_hyp_sanitize_link_stack_start)
59 stp x29, x30, [sp, #-16]! 78 stp x29, x30, [sp, #-16]!
60 .rept 16 79 .rept 16
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 2df792771053..9262ec57f5ab 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -78,6 +78,8 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused)
78 config_sctlr_el1(SCTLR_EL1_UCT, 0); 78 config_sctlr_el1(SCTLR_EL1_UCT, 0);
79} 79}
80 80
81atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
82
81#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR 83#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
82#include <asm/mmu_context.h> 84#include <asm/mmu_context.h>
83#include <asm/cacheflush.h> 85#include <asm/cacheflush.h>
@@ -108,7 +110,6 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
108 const char *hyp_vecs_start, 110 const char *hyp_vecs_start,
109 const char *hyp_vecs_end) 111 const char *hyp_vecs_end)
110{ 112{
111 static int last_slot = -1;
112 static DEFINE_SPINLOCK(bp_lock); 113 static DEFINE_SPINLOCK(bp_lock);
113 int cpu, slot = -1; 114 int cpu, slot = -1;
114 115
@@ -121,10 +122,8 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
121 } 122 }
122 123
123 if (slot == -1) { 124 if (slot == -1) {
124 last_slot++; 125 slot = atomic_inc_return(&arm64_el2_vector_last_slot);
125 BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) 126 BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
126 / SZ_2K) <= last_slot);
127 slot = last_slot;
128 __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); 127 __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
129 } 128 }
130 129
@@ -348,6 +347,10 @@ static const struct arm64_cpu_capabilities arm64_bp_harden_list[] = {
348 347
349#endif 348#endif
350 349
350#ifndef ERRATA_MIDR_ALL_VERSIONS
351#define ERRATA_MIDR_ALL_VERSIONS(x) MIDR_ALL_VERSIONS(x)
352#endif
353
351const struct arm64_cpu_capabilities arm64_errata[] = { 354const struct arm64_cpu_capabilities arm64_errata[] = {
352#if defined(CONFIG_ARM64_ERRATUM_826319) || \ 355#if defined(CONFIG_ARM64_ERRATUM_826319) || \
353 defined(CONFIG_ARM64_ERRATUM_827319) || \ 356 defined(CONFIG_ARM64_ERRATUM_827319) || \
@@ -501,6 +504,18 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
501 ERRATA_MIDR_RANGE_LIST(qcom_bp_harden_cpus), 504 ERRATA_MIDR_RANGE_LIST(qcom_bp_harden_cpus),
502 }, 505 },
503#endif 506#endif
507#ifdef CONFIG_HARDEN_EL2_VECTORS
508 {
509 .desc = "Cortex-A57 EL2 vector hardening",
510 .capability = ARM64_HARDEN_EL2_VECTORS,
511 ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
512 },
513 {
514 .desc = "Cortex-A72 EL2 vector hardening",
515 .capability = ARM64_HARDEN_EL2_VECTORS,
516 ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
517 },
518#endif
504 { 519 {
505 } 520 }
506}; 521};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 96b15d7b10a8..536d572e5596 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -838,19 +838,6 @@ static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int _
838 MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); 838 MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK));
839} 839}
840 840
841static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
842 int __unused)
843{
844 phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
845
846 /*
847 * Activate the lower HYP offset only if:
848 * - the idmap doesn't clash with it,
849 * - the kernel is not running at EL2.
850 */
851 return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
852}
853
854static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused) 841static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused)
855{ 842{
856 u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); 843 u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
@@ -1121,12 +1108,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
1121 .field_pos = ID_AA64PFR0_EL0_SHIFT, 1108 .field_pos = ID_AA64PFR0_EL0_SHIFT,
1122 .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, 1109 .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
1123 }, 1110 },
1124 {
1125 .desc = "Reduced HYP mapping offset",
1126 .capability = ARM64_HYP_OFFSET_LOW,
1127 .type = ARM64_CPUCAP_SYSTEM_FEATURE,
1128 .matches = hyp_offset_low,
1129 },
1130#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 1111#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
1131 { 1112 {
1132 .desc = "Kernel page table isolation (KPTI)", 1113 .desc = "Kernel page table isolation (KPTI)",
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 2b6b8b24e5ab..b0853069702f 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -577,6 +577,13 @@ set_hcr:
5777: 5777:
578 msr mdcr_el2, x3 // Configure debug traps 578 msr mdcr_el2, x3 // Configure debug traps
579 579
580 /* LORegions */
581 mrs x1, id_aa64mmfr1_el1
582 ubfx x0, x1, #ID_AA64MMFR1_LOR_SHIFT, 4
583 cbz x0, 1f
584 msr_s SYS_LORC_EL1, xzr
5851:
586
580 /* Stage-2 translation */ 587 /* Stage-2 translation */
581 msr vttbr_el2, xzr 588 msr vttbr_el2, xzr
582 589
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 2718a77da165..816d03c4c913 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -35,6 +35,7 @@
35 35
36#define AARCH64_INSN_SF_BIT BIT(31) 36#define AARCH64_INSN_SF_BIT BIT(31)
37#define AARCH64_INSN_N_BIT BIT(22) 37#define AARCH64_INSN_N_BIT BIT(22)
38#define AARCH64_INSN_LSL_12 BIT(22)
38 39
39static int aarch64_insn_encoding_class[] = { 40static int aarch64_insn_encoding_class[] = {
40 AARCH64_INSN_CLS_UNKNOWN, 41 AARCH64_INSN_CLS_UNKNOWN,
@@ -343,6 +344,10 @@ static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
343 mask = BIT(6) - 1; 344 mask = BIT(6) - 1;
344 shift = 16; 345 shift = 16;
345 break; 346 break;
347 case AARCH64_INSN_IMM_N:
348 mask = 1;
349 shift = 22;
350 break;
346 default: 351 default:
347 return -EINVAL; 352 return -EINVAL;
348 } 353 }
@@ -899,9 +904,18 @@ u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
899 return AARCH64_BREAK_FAULT; 904 return AARCH64_BREAK_FAULT;
900 } 905 }
901 906
907 /* We can't encode more than a 24bit value (12bit + 12bit shift) */
908 if (imm & ~(BIT(24) - 1))
909 goto out;
910
911 /* If we have something in the top 12 bits... */
902 if (imm & ~(SZ_4K - 1)) { 912 if (imm & ~(SZ_4K - 1)) {
903 pr_err("%s: invalid immediate encoding %d\n", __func__, imm); 913 /* ... and in the low 12 bits -> error */
904 return AARCH64_BREAK_FAULT; 914 if (imm & (SZ_4K - 1))
915 goto out;
916
917 imm >>= 12;
918 insn |= AARCH64_INSN_LSL_12;
905 } 919 }
906 920
907 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst); 921 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
@@ -909,6 +923,10 @@ u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
909 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src); 923 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
910 924
911 return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); 925 return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
926
927out:
928 pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
929 return AARCH64_BREAK_FAULT;
912} 930}
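Put differently, an ADD/SUB immediate is accepted only if it fits in the low 12 bits, or occupies bits [23:12] with the low 12 bits clear (in which case it is encoded shifted, with LSL #12). A minimal standalone predicate expressing the same rule (add_sub_imm_ok is an invented name):

#include <stdbool.h>
#include <stdio.h>

static bool add_sub_imm_ok(unsigned int imm)
{
	if (imm & ~((1u << 24) - 1))	/* more than 24 bits: no   */
		return false;
	if (!(imm & ~0xfffu))		/* fits in 12 bits: yes    */
		return true;
	return !(imm & 0xfffu);		/* else low 12 must be 0   */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       add_sub_imm_ok(0x123),	   /* 1: plain 12-bit        */
	       add_sub_imm_ok(0x123000),   /* 1: 12-bit with lsl #12 */
	       add_sub_imm_ok(0x123456),   /* 0: needs both halves   */
	       add_sub_imm_ok(0x1000000)); /* 0: 25 bits             */
	return 0;
}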
913 931
914u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst, 932u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
@@ -1481,3 +1499,171 @@ pstate_check_t * const aarch32_opcode_cond_checks[16] = {
1481 __check_hi, __check_ls, __check_ge, __check_lt, 1499 __check_hi, __check_ls, __check_ge, __check_lt,
1482 __check_gt, __check_le, __check_al, __check_al 1500 __check_gt, __check_le, __check_al, __check_al
1483}; 1501};
1502
1503static bool range_of_ones(u64 val)
1504{
1505 /* Doesn't handle full ones or full zeroes */
1506 u64 sval = val >> __ffs64(val);
1507
1508 /* One of Sean Eron Anderson's bithack tricks */
1509 return ((sval + 1) & (sval)) == 0;
1510}
1511
1512static u32 aarch64_encode_immediate(u64 imm,
1513 enum aarch64_insn_variant variant,
1514 u32 insn)
1515{
1516 unsigned int immr, imms, n, ones, ror, esz, tmp;
1517 u64 mask = ~0UL;
1518
1519 /* Can't encode full zeroes or full ones */
1520 if (!imm || !~imm)
1521 return AARCH64_BREAK_FAULT;
1522
1523 switch (variant) {
1524 case AARCH64_INSN_VARIANT_32BIT:
1525 if (upper_32_bits(imm))
1526 return AARCH64_BREAK_FAULT;
1527 esz = 32;
1528 break;
1529 case AARCH64_INSN_VARIANT_64BIT:
1530 insn |= AARCH64_INSN_SF_BIT;
1531 esz = 64;
1532 break;
1533 default:
1534 pr_err("%s: unknown variant encoding %d\n", __func__, variant);
1535 return AARCH64_BREAK_FAULT;
1536 }
1537
1538 /*
1539 * Inverse of Replicate(). Try to spot a repeating pattern
1540 * with a pow2 stride.
1541 */
1542 for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
1543 u64 emask = BIT(tmp) - 1;
1544
1545 if ((imm & emask) != ((imm >> tmp) & emask))
1546 break;
1547
1548 esz = tmp;
1549 mask = emask;
1550 }
1551
1552 /* N is only set if we're encoding a 64bit value */
1553 n = esz == 64;
1554
1555 /* Trim imm to the element size */
1556 imm &= mask;
1557
1558 /* That's how many ones we need to encode */
1559 ones = hweight64(imm);
1560
1561 /*
1562 * imms is set to (ones - 1), prefixed with a string of ones
1563 * and a zero if they fit. Cap it to 6 bits.
1564 */
1565 imms = ones - 1;
1566 imms |= 0xf << ffs(esz);
1567 imms &= BIT(6) - 1;
1568
1569 /* Compute the rotation */
1570 if (range_of_ones(imm)) {
1571 /*
1572 * Pattern: 0..01..10..0
1573 *
1574 * Compute how many rotate we need to align it right
1575 */
1576 ror = __ffs64(imm);
1577 } else {
1578 /*
1579 * Pattern: 0..01..10..01..1
1580 *
1581 * Fill the unused top bits with ones, and check if
1582 * the result is a valid immediate (all ones with a
1583	 * contiguous range of zeroes).
1584 */
1585 imm |= ~mask;
1586 if (!range_of_ones(~imm))
1587 return AARCH64_BREAK_FAULT;
1588
1589 /*
1590 * Compute the rotation to get a continuous set of
1591 * ones, with the first bit set at position 0
1592 */
1593 ror = fls(~imm);
1594 }
1595
1596 /*
1597 * immr is the number of bits we need to rotate back to the
1598 * original set of ones. Note that this is relative to the
1599 * element size...
1600 */
1601 immr = (esz - ror) % esz;
1602
1603 insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n);
1604 insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
1605 return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
1606}
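To see the Replicate() inverse in action: 0x00ff00ff00ff00ff folds down to a 16-bit element 0x00ff, which has 8 contiguous ones and needs no rotation, so it encodes as N=0, immr=0, imms=0b100111. A small standalone check of the folding loop and the contiguity test (this only reproduces the easy, unrotated path of the function above):

#include <stdint.h>
#include <stdio.h>

/* Same trick as range_of_ones(): shift out trailing zeroes, then a
 * contiguous run of ones r satisfies (r + 1) & r == 0. */
static int contiguous_ones(uint64_t val)
{
	uint64_t sval = val >> __builtin_ctzll(val);

	return ((sval + 1) & sval) == 0;
}

int main(void)
{
	uint64_t imm = UINT64_C(0x00ff00ff00ff00ff);
	uint64_t mask = ~UINT64_C(0);
	unsigned int esz = 64, tmp;

	/* Inverse of Replicate(): find the smallest repeating element. */
	for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
		uint64_t emask = (UINT64_C(1) << tmp) - 1;

		if ((imm & emask) != ((imm >> tmp) & emask))
			break;
		esz = tmp;
		mask = emask;
	}

	imm &= mask;
	printf("esz=%u element=%#llx ones=%d contiguous=%d\n",
	       esz, (unsigned long long)imm,
	       __builtin_popcountll(imm), contiguous_ones(imm));
	/* prints: esz=16 element=0xff ones=8 contiguous=1 */
	return 0;
}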
1607
1608u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
1609 enum aarch64_insn_variant variant,
1610 enum aarch64_insn_register Rn,
1611 enum aarch64_insn_register Rd,
1612 u64 imm)
1613{
1614 u32 insn;
1615
1616 switch (type) {
1617 case AARCH64_INSN_LOGIC_AND:
1618 insn = aarch64_insn_get_and_imm_value();
1619 break;
1620 case AARCH64_INSN_LOGIC_ORR:
1621 insn = aarch64_insn_get_orr_imm_value();
1622 break;
1623 case AARCH64_INSN_LOGIC_EOR:
1624 insn = aarch64_insn_get_eor_imm_value();
1625 break;
1626 case AARCH64_INSN_LOGIC_AND_SETFLAGS:
1627 insn = aarch64_insn_get_ands_imm_value();
1628 break;
1629 default:
1630 pr_err("%s: unknown logical encoding %d\n", __func__, type);
1631 return AARCH64_BREAK_FAULT;
1632 }
1633
1634 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
1635 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
1636 return aarch64_encode_immediate(imm, variant, insn);
1637}
1638
1639u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
1640 enum aarch64_insn_register Rm,
1641 enum aarch64_insn_register Rn,
1642 enum aarch64_insn_register Rd,
1643 u8 lsb)
1644{
1645 u32 insn;
1646
1647 insn = aarch64_insn_get_extr_value();
1648
1649 switch (variant) {
1650 case AARCH64_INSN_VARIANT_32BIT:
1651 if (lsb > 31)
1652 return AARCH64_BREAK_FAULT;
1653 break;
1654 case AARCH64_INSN_VARIANT_64BIT:
1655 if (lsb > 63)
1656 return AARCH64_BREAK_FAULT;
1657 insn |= AARCH64_INSN_SF_BIT;
1658 insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1);
1659 break;
1660 default:
1661 pr_err("%s: unknown variant encoding %d\n", __func__, variant);
1662 return AARCH64_BREAK_FAULT;
1663 }
1664
1665 insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb);
1666 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
1667 insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
1668 return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);
1669}
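ROR (immediate) is an alias of EXTR with both source registers equal, which is presumably why this helper is added alongside the VA-patching work earlier in the diff. The identity is easy to check; extr64/ror64 below are throwaway models and the test value is arbitrary:

#include <assert.h>
#include <stdint.h>

/* EXTR Rd, Rn, Rm, #lsb: low 64 bits of the concatenation Rn:Rm >> lsb. */
static uint64_t extr64(uint64_t rn, uint64_t rm, unsigned int lsb)
{
	return lsb ? (rm >> lsb) | (rn << (64 - lsb)) : rm;
}

static uint64_t ror64(uint64_t v, unsigned int n)
{
	n &= 63;
	return n ? (v >> n) | (v << (64 - n)) : v;
}

int main(void)
{
	uint64_t v = UINT64_C(0x0123456789abcdef);

	for (unsigned int lsb = 0; lsb < 64; lsb++)
		assert(extr64(v, v, lsb) == ror64(v, lsb));
	return 0;
}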
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 2257dfcc44cc..a2e3a5af1113 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -57,6 +57,9 @@ config KVM_ARM_PMU
57 Adds support for a virtual Performance Monitoring Unit (PMU) in 57 Adds support for a virtual Performance Monitoring Unit (PMU) in
58 virtual machines. 58 virtual machines.
59 59
60config KVM_INDIRECT_VECTORS
61 def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
62
60source drivers/vhost/Kconfig 63source drivers/vhost/Kconfig
61 64
62endif # VIRTUALIZATION 65endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 87c4f7ae24de..93afff91cb7c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -16,7 +16,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/e
16kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o 16kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
17kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o 17kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
18 18
19kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o 19kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
20kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o 20kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
21kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o 21kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
22kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o 22kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index fa63b28c65e0..a1f4ebdfe6d3 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -46,7 +46,9 @@ static DEFINE_PER_CPU(u32, mdcr_el2);
46 */ 46 */
47static void save_guest_debug_regs(struct kvm_vcpu *vcpu) 47static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
48{ 48{
49 vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1); 49 u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
50
51 vcpu->arch.guest_debug_preserved.mdscr_el1 = val;
50 52
51 trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", 53 trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
52 vcpu->arch.guest_debug_preserved.mdscr_el1); 54 vcpu->arch.guest_debug_preserved.mdscr_el1);
@@ -54,10 +56,12 @@ static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
54 56
55static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) 57static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
56{ 58{
57 vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1; 59 u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1;
60
61 vcpu_write_sys_reg(vcpu, val, MDSCR_EL1);
58 62
59 trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", 63 trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
60 vcpu_sys_reg(vcpu, MDSCR_EL1)); 64 vcpu_read_sys_reg(vcpu, MDSCR_EL1));
61} 65}
62 66
63/** 67/**
@@ -108,6 +112,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
108void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) 112void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
109{ 113{
110 bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY); 114 bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
115 unsigned long mdscr;
111 116
112 trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); 117 trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
113 118
@@ -152,9 +157,13 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
152 */ 157 */
153 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 158 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
154 *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; 159 *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
155 vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS; 160 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
161 mdscr |= DBG_MDSCR_SS;
162 vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
156 } else { 163 } else {
157 vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS; 164 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
165 mdscr &= ~DBG_MDSCR_SS;
166 vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
158 } 167 }
159 168
160 trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); 169 trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
@@ -170,7 +179,9 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
170 */ 179 */
171 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { 180 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
172 /* Enable breakpoints/watchpoints */ 181 /* Enable breakpoints/watchpoints */
173 vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE; 182 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
183 mdscr |= DBG_MDSCR_MDE;
184 vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1);
174 185
175 vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; 186 vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
176 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; 187 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
@@ -193,8 +204,12 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
193 if (trap_debug) 204 if (trap_debug)
194 vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; 205 vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
195 206
207 /* If KDE or MDE are set, perform a full save/restore cycle. */
208 if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE))
209 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
210
196 trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); 211 trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
197 trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1)); 212 trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1));
198} 213}
199 214
200void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) 215void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 5aa9ccf6db99..6fd91b31a131 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -117,7 +117,6 @@ CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
117 /* Set the stack and new vectors */ 117 /* Set the stack and new vectors */
118 kern_hyp_va x1 118 kern_hyp_va x1
119 mov sp, x1 119 mov sp, x1
120 kern_hyp_va x2
121 msr vbar_el2, x2 120 msr vbar_el2, x2
122 121
123 /* copy tpidr_el1 into tpidr_el2 for use by HYP */ 122 /* copy tpidr_el1 into tpidr_el2 for use by HYP */
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f04400d494b7..4313f7475333 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -7,10 +7,10 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING
7 7
8KVM=../../../../virt/kvm 8KVM=../../../../virt/kvm
9 9
10obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
11obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o 10obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
12obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o 11obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
13 12
13obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
14obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o 14obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
15obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o 15obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
16obj-$(CONFIG_KVM_ARM_HOST) += entry.o 16obj-$(CONFIG_KVM_ARM_HOST) += entry.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index dabb5cc7b087..3e717f66f011 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -66,11 +66,6 @@
66 default: write_debug(ptr[0], reg, 0); \ 66 default: write_debug(ptr[0], reg, 0); \
67 } 67 }
68 68
69static void __hyp_text __debug_save_spe_vhe(u64 *pmscr_el1)
70{
71 /* The vcpu can run. but it can't hide. */
72}
73
74static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1) 69static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
75{ 70{
76 u64 reg; 71 u64 reg;
@@ -103,11 +98,7 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
103 dsb(nsh); 98 dsb(nsh);
104} 99}
105 100
106static hyp_alternate_select(__debug_save_spe, 101static void __hyp_text __debug_restore_spe_nvhe(u64 pmscr_el1)
107 __debug_save_spe_nvhe, __debug_save_spe_vhe,
108 ARM64_HAS_VIRT_HOST_EXTN);
109
110static void __hyp_text __debug_restore_spe(u64 pmscr_el1)
111{ 102{
112 if (!pmscr_el1) 103 if (!pmscr_el1)
113 return; 104 return;
@@ -119,16 +110,13 @@ static void __hyp_text __debug_restore_spe(u64 pmscr_el1)
119 write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); 110 write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1);
120} 111}
121 112
122void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu, 113static void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
123 struct kvm_guest_debug_arch *dbg, 114 struct kvm_guest_debug_arch *dbg,
124 struct kvm_cpu_context *ctxt) 115 struct kvm_cpu_context *ctxt)
125{ 116{
126 u64 aa64dfr0; 117 u64 aa64dfr0;
127 int brps, wrps; 118 int brps, wrps;
128 119
129 if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
130 return;
131
132 aa64dfr0 = read_sysreg(id_aa64dfr0_el1); 120 aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
133 brps = (aa64dfr0 >> 12) & 0xf; 121 brps = (aa64dfr0 >> 12) & 0xf;
134 wrps = (aa64dfr0 >> 20) & 0xf; 122 wrps = (aa64dfr0 >> 20) & 0xf;
@@ -141,16 +129,13 @@ void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
141 ctxt->sys_regs[MDCCINT_EL1] = read_sysreg(mdccint_el1); 129 ctxt->sys_regs[MDCCINT_EL1] = read_sysreg(mdccint_el1);
142} 130}
143 131
144void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu, 132static void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu,
145 struct kvm_guest_debug_arch *dbg, 133 struct kvm_guest_debug_arch *dbg,
146 struct kvm_cpu_context *ctxt) 134 struct kvm_cpu_context *ctxt)
147{ 135{
148 u64 aa64dfr0; 136 u64 aa64dfr0;
149 int brps, wrps; 137 int brps, wrps;
150 138
151 if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
152 return;
153
154 aa64dfr0 = read_sysreg(id_aa64dfr0_el1); 139 aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
155 140
156 brps = (aa64dfr0 >> 12) & 0xf; 141 brps = (aa64dfr0 >> 12) & 0xf;
@@ -164,27 +149,54 @@ void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu,
164 write_sysreg(ctxt->sys_regs[MDCCINT_EL1], mdccint_el1); 149 write_sysreg(ctxt->sys_regs[MDCCINT_EL1], mdccint_el1);
165} 150}
166 151
167void __hyp_text __debug_cond_save_host_state(struct kvm_vcpu *vcpu) 152void __hyp_text __debug_switch_to_guest(struct kvm_vcpu *vcpu)
168{ 153{
169 /* If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY is set, perform 154 struct kvm_cpu_context *host_ctxt;
170 * a full save/restore cycle. */ 155 struct kvm_cpu_context *guest_ctxt;
171 if ((vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_KDE) || 156 struct kvm_guest_debug_arch *host_dbg;
172 (vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_MDE)) 157 struct kvm_guest_debug_arch *guest_dbg;
173 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; 158
174 159 /*
175 __debug_save_state(vcpu, &vcpu->arch.host_debug_state.regs, 160 * Non-VHE: Disable and flush SPE data generation
176 kern_hyp_va(vcpu->arch.host_cpu_context)); 161 * VHE: The vcpu can run, but it can't hide.
177 __debug_save_spe()(&vcpu->arch.host_debug_state.pmscr_el1); 162 */
163 if (!has_vhe())
164 __debug_save_spe_nvhe(&vcpu->arch.host_debug_state.pmscr_el1);
165
166 if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
167 return;
168
169 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
170 guest_ctxt = &vcpu->arch.ctxt;
171 host_dbg = &vcpu->arch.host_debug_state.regs;
172 guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
173
174 __debug_save_state(vcpu, host_dbg, host_ctxt);
175 __debug_restore_state(vcpu, guest_dbg, guest_ctxt);
178} 176}
179 177
180void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu) 178void __hyp_text __debug_switch_to_host(struct kvm_vcpu *vcpu)
181{ 179{
182 __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); 180 struct kvm_cpu_context *host_ctxt;
183 __debug_restore_state(vcpu, &vcpu->arch.host_debug_state.regs, 181 struct kvm_cpu_context *guest_ctxt;
184 kern_hyp_va(vcpu->arch.host_cpu_context)); 182 struct kvm_guest_debug_arch *host_dbg;
183 struct kvm_guest_debug_arch *guest_dbg;
184
185 if (!has_vhe())
186 __debug_restore_spe_nvhe(vcpu->arch.host_debug_state.pmscr_el1);
187
188 if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
189 return;
190
191 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
192 guest_ctxt = &vcpu->arch.ctxt;
193 host_dbg = &vcpu->arch.host_debug_state.regs;
194 guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
195
196 __debug_save_state(vcpu, guest_dbg, guest_ctxt);
197 __debug_restore_state(vcpu, host_dbg, host_ctxt);
185 198
186 if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY) 199 vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
187 vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
188} 200}
189 201
190u32 __hyp_text __kvm_get_mdcr_el2(void) 202u32 __hyp_text __kvm_get_mdcr_el2(void)
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index fdd1068ee3a5..1f458f7c3b44 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -62,9 +62,6 @@ ENTRY(__guest_enter)
62 // Store the host regs 62 // Store the host regs
63 save_callee_saved_regs x1 63 save_callee_saved_regs x1
64 64
65 // Store host_ctxt and vcpu for use at exit time
66 stp x1, x0, [sp, #-16]!
67
68 add x18, x0, #VCPU_CONTEXT 65 add x18, x0, #VCPU_CONTEXT
69 66
70 // Restore guest regs x0-x17 67 // Restore guest regs x0-x17
@@ -118,8 +115,7 @@ ENTRY(__guest_exit)
118 // Store the guest regs x19-x29, lr 115 // Store the guest regs x19-x29, lr
119 save_callee_saved_regs x1 116 save_callee_saved_regs x1
120 117
121 // Restore the host_ctxt from the stack 118 get_host_ctxt x2, x3
122 ldr x2, [sp], #16
123 119
124 // Now restore the host regs 120 // Now restore the host regs
125 restore_callee_saved_regs x2 121 restore_callee_saved_regs x2
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index f36464bd57c5..87dfecce82b1 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -55,15 +55,9 @@ ENTRY(__vhe_hyp_call)
55ENDPROC(__vhe_hyp_call) 55ENDPROC(__vhe_hyp_call)
56 56
57el1_sync: // Guest trapped into EL2 57el1_sync: // Guest trapped into EL2
58 stp x0, x1, [sp, #-16]!
59
60alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
61 mrs x1, esr_el2
62alternative_else
63 mrs x1, esr_el1
64alternative_endif
65 lsr x0, x1, #ESR_ELx_EC_SHIFT
66 58
59 mrs x0, esr_el2
60 lsr x0, x0, #ESR_ELx_EC_SHIFT
67 cmp x0, #ESR_ELx_EC_HVC64 61 cmp x0, #ESR_ELx_EC_HVC64
68 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne 62 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
69 b.ne el1_trap 63 b.ne el1_trap
@@ -117,10 +111,14 @@ el1_hvc_guest:
117 eret 111 eret
118 112
119el1_trap: 113el1_trap:
114 get_vcpu_ptr x1, x0
115
116 mrs x0, esr_el2
117 lsr x0, x0, #ESR_ELx_EC_SHIFT
120 /* 118 /*
121 * x0: ESR_EC 119 * x0: ESR_EC
120 * x1: vcpu pointer
122 */ 121 */
123 ldr x1, [sp, #16 + 8] // vcpu stored by __guest_enter
124 122
125 /* 123 /*
126 * We trap the first access to the FP/SIMD to save the host context 124 * We trap the first access to the FP/SIMD to save the host context
@@ -137,18 +135,18 @@ alternative_else_nop_endif
137 b __guest_exit 135 b __guest_exit
138 136
139el1_irq: 137el1_irq:
140 stp x0, x1, [sp, #-16]! 138 get_vcpu_ptr x1, x0
141 ldr x1, [sp, #16 + 8]
142 mov x0, #ARM_EXCEPTION_IRQ 139 mov x0, #ARM_EXCEPTION_IRQ
143 b __guest_exit 140 b __guest_exit
144 141
145el1_error: 142el1_error:
146 stp x0, x1, [sp, #-16]! 143 get_vcpu_ptr x1, x0
147 ldr x1, [sp, #16 + 8]
148 mov x0, #ARM_EXCEPTION_EL1_SERROR 144 mov x0, #ARM_EXCEPTION_EL1_SERROR
149 b __guest_exit 145 b __guest_exit
150 146
151el2_error: 147el2_error:
148 ldp x0, x1, [sp], #16
149
152 /* 150 /*
153 * Only two possibilities: 151 * Only two possibilities:
154 * 1) Either we come from the exit path, having just unmasked 152 * 1) Either we come from the exit path, having just unmasked
@@ -180,14 +178,7 @@ ENTRY(__hyp_do_panic)
180ENDPROC(__hyp_do_panic) 178ENDPROC(__hyp_do_panic)
181 179
182ENTRY(__hyp_panic) 180ENTRY(__hyp_panic)
183 /* 181 get_host_ctxt x0, x1
184 * '=kvm_host_cpu_state' is a host VA from the constant pool, it may
185 * not be accessible by this address from EL2, hyp_panic() converts
186 * it with kern_hyp_va() before use.
187 */
188 ldr x0, =kvm_host_cpu_state
189 mrs x1, tpidr_el2
190 add x0, x0, x1
191 b hyp_panic 182 b hyp_panic
192ENDPROC(__hyp_panic) 183ENDPROC(__hyp_panic)
193 184
@@ -206,32 +197,43 @@ ENDPROC(\label)
206 invalid_vector el2h_sync_invalid 197 invalid_vector el2h_sync_invalid
207 invalid_vector el2h_irq_invalid 198 invalid_vector el2h_irq_invalid
208 invalid_vector el2h_fiq_invalid 199 invalid_vector el2h_fiq_invalid
209 invalid_vector el1_sync_invalid
210 invalid_vector el1_irq_invalid
211 invalid_vector el1_fiq_invalid 200 invalid_vector el1_fiq_invalid
212 201
213 .ltorg 202 .ltorg
214 203
215 .align 11 204 .align 11
216 205
206.macro valid_vect target
207 .align 7
208 stp x0, x1, [sp, #-16]!
209 b \target
210.endm
211
212.macro invalid_vect target
213 .align 7
214 b \target
215 ldp x0, x1, [sp], #16
216 b \target
217.endm
218
217ENTRY(__kvm_hyp_vector) 219ENTRY(__kvm_hyp_vector)
218 ventry el2t_sync_invalid // Synchronous EL2t 220 invalid_vect el2t_sync_invalid // Synchronous EL2t
219 ventry el2t_irq_invalid // IRQ EL2t 221 invalid_vect el2t_irq_invalid // IRQ EL2t
220 ventry el2t_fiq_invalid // FIQ EL2t 222 invalid_vect el2t_fiq_invalid // FIQ EL2t
221 ventry el2t_error_invalid // Error EL2t 223 invalid_vect el2t_error_invalid // Error EL2t
222 224
223 ventry el2h_sync_invalid // Synchronous EL2h 225 invalid_vect el2h_sync_invalid // Synchronous EL2h
224 ventry el2h_irq_invalid // IRQ EL2h 226 invalid_vect el2h_irq_invalid // IRQ EL2h
225 ventry el2h_fiq_invalid // FIQ EL2h 227 invalid_vect el2h_fiq_invalid // FIQ EL2h
226 ventry el2_error // Error EL2h 228 valid_vect el2_error // Error EL2h
227 229
228 ventry el1_sync // Synchronous 64-bit EL1 230 valid_vect el1_sync // Synchronous 64-bit EL1
229 ventry el1_irq // IRQ 64-bit EL1 231 valid_vect el1_irq // IRQ 64-bit EL1
230 ventry el1_fiq_invalid // FIQ 64-bit EL1 232 invalid_vect el1_fiq_invalid // FIQ 64-bit EL1
231 ventry el1_error // Error 64-bit EL1 233 valid_vect el1_error // Error 64-bit EL1
232 234
233 ventry el1_sync // Synchronous 32-bit EL1 235 valid_vect el1_sync // Synchronous 32-bit EL1
234 ventry el1_irq // IRQ 32-bit EL1 236 valid_vect el1_irq // IRQ 32-bit EL1
235 ventry el1_fiq_invalid // FIQ 32-bit EL1 237 invalid_vect el1_fiq_invalid // FIQ 32-bit EL1
236 ventry el1_error // Error 32-bit EL1 238 valid_vect el1_error // Error 32-bit EL1
237ENDPROC(__kvm_hyp_vector) 239ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 870f4b1587f9..07b572173265 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -33,49 +33,22 @@ static bool __hyp_text __fpsimd_enabled_nvhe(void)
33 return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP); 33 return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
34} 34}
35 35
36static bool __hyp_text __fpsimd_enabled_vhe(void) 36static bool fpsimd_enabled_vhe(void)
37{ 37{
38 return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN); 38 return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
39} 39}
40 40
41static hyp_alternate_select(__fpsimd_is_enabled, 41/* Save the 32-bit only FPSIMD system register state */
42 __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe, 42static void __hyp_text __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
43 ARM64_HAS_VIRT_HOST_EXTN);
44
45bool __hyp_text __fpsimd_enabled(void)
46{
47 return __fpsimd_is_enabled()();
48}
49
50static void __hyp_text __activate_traps_vhe(void)
51{
52 u64 val;
53
54 val = read_sysreg(cpacr_el1);
55 val |= CPACR_EL1_TTA;
56 val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN);
57 write_sysreg(val, cpacr_el1);
58
59 write_sysreg(kvm_get_hyp_vector(), vbar_el1);
60}
61
62static void __hyp_text __activate_traps_nvhe(void)
63{ 43{
64 u64 val; 44 if (!vcpu_el1_is_32bit(vcpu))
45 return;
65 46
66 val = CPTR_EL2_DEFAULT; 47 vcpu->arch.ctxt.sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
67 val |= CPTR_EL2_TTA | CPTR_EL2_TFP | CPTR_EL2_TZ;
68 write_sysreg(val, cptr_el2);
69} 48}
70 49
71static hyp_alternate_select(__activate_traps_arch, 50static void __hyp_text __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
72 __activate_traps_nvhe, __activate_traps_vhe,
73 ARM64_HAS_VIRT_HOST_EXTN);
74
75static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
76{ 51{
77 u64 val;
78
79 /* 52 /*
80 * We are about to set CPTR_EL2.TFP to trap all floating point 53 * We are about to set CPTR_EL2.TFP to trap all floating point
81 * register accesses to EL2, however, the ARM ARM clearly states that 54 * register accesses to EL2, however, the ARM ARM clearly states that
@@ -85,23 +58,17 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
85 * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to 58 * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to
86 * it will cause an exception. 59 * it will cause an exception.
87 */ 60 */
88 val = vcpu->arch.hcr_el2; 61 if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd()) {
89
90 if (!(val & HCR_RW) && system_supports_fpsimd()) {
91 write_sysreg(1 << 30, fpexc32_el2); 62 write_sysreg(1 << 30, fpexc32_el2);
92 isb(); 63 isb();
93 } 64 }
65}
94 66
95 if (val & HCR_RW) /* for AArch64 only: */ 67static void __hyp_text __activate_traps_common(struct kvm_vcpu *vcpu)
96 val |= HCR_TID3; /* TID3: trap feature register accesses */ 68{
97 69 /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
98 write_sysreg(val, hcr_el2);
99
100 if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (val & HCR_VSE))
101 write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
102
103 /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
104 write_sysreg(1 << 15, hstr_el2); 70 write_sysreg(1 << 15, hstr_el2);
71
105 /* 72 /*
106 * Make sure we trap PMU access from EL0 to EL2. Also sanitize 73 * Make sure we trap PMU access from EL0 to EL2. Also sanitize
107 * PMSELR_EL0 to make sure it never contains the cycle 74 * PMSELR_EL0 to make sure it never contains the cycle
@@ -111,19 +78,56 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
111 write_sysreg(0, pmselr_el0); 78 write_sysreg(0, pmselr_el0);
112 write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); 79 write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
113 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 80 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
114 __activate_traps_arch()();
115} 81}
116 82
117static void __hyp_text __deactivate_traps_vhe(void) 83static void __hyp_text __deactivate_traps_common(void)
118{ 84{
119 extern char vectors[]; /* kernel exception vectors */ 85 write_sysreg(0, hstr_el2);
120 u64 mdcr_el2 = read_sysreg(mdcr_el2); 86 write_sysreg(0, pmuserenr_el0);
87}
121 88
122 mdcr_el2 &= MDCR_EL2_HPMN_MASK | 89static void activate_traps_vhe(struct kvm_vcpu *vcpu)
123 MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | 90{
124 MDCR_EL2_TPMS; 91 u64 val;
125 92
126 write_sysreg(mdcr_el2, mdcr_el2); 93 val = read_sysreg(cpacr_el1);
94 val |= CPACR_EL1_TTA;
95 val &= ~(CPACR_EL1_FPEN | CPACR_EL1_ZEN);
96 write_sysreg(val, cpacr_el1);
97
98 write_sysreg(kvm_get_hyp_vector(), vbar_el1);
99}
100
101static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
102{
103 u64 val;
104
105 __activate_traps_common(vcpu);
106
107 val = CPTR_EL2_DEFAULT;
108 val |= CPTR_EL2_TTA | CPTR_EL2_TFP | CPTR_EL2_TZ;
109 write_sysreg(val, cptr_el2);
110}
111
112static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
113{
114 u64 hcr = vcpu->arch.hcr_el2;
115
116 write_sysreg(hcr, hcr_el2);
117
118 if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
119 write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
120
121 __activate_traps_fpsimd32(vcpu);
122 if (has_vhe())
123 activate_traps_vhe(vcpu);
124 else
125 __activate_traps_nvhe(vcpu);
126}
127
128static void deactivate_traps_vhe(void)
129{
130 extern char vectors[]; /* kernel exception vectors */
127 write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); 131 write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
128 write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); 132 write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
129 write_sysreg(vectors, vbar_el1); 133 write_sysreg(vectors, vbar_el1);
@@ -133,6 +137,8 @@ static void __hyp_text __deactivate_traps_nvhe(void)
133{ 137{
134 u64 mdcr_el2 = read_sysreg(mdcr_el2); 138 u64 mdcr_el2 = read_sysreg(mdcr_el2);
135 139
140 __deactivate_traps_common();
141
136 mdcr_el2 &= MDCR_EL2_HPMN_MASK; 142 mdcr_el2 &= MDCR_EL2_HPMN_MASK;
137 mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; 143 mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT;
138 144
@@ -141,10 +147,6 @@ static void __hyp_text __deactivate_traps_nvhe(void)
141 write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); 147 write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
142} 148}
143 149
144static hyp_alternate_select(__deactivate_traps_arch,
145 __deactivate_traps_nvhe, __deactivate_traps_vhe,
146 ARM64_HAS_VIRT_HOST_EXTN);
147
148static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) 150static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
149{ 151{
150 /* 152 /*
@@ -156,14 +158,32 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
156 if (vcpu->arch.hcr_el2 & HCR_VSE) 158 if (vcpu->arch.hcr_el2 & HCR_VSE)
157 vcpu->arch.hcr_el2 = read_sysreg(hcr_el2); 159 vcpu->arch.hcr_el2 = read_sysreg(hcr_el2);
158 160
159 __deactivate_traps_arch()(); 161 if (has_vhe())
160 write_sysreg(0, hstr_el2); 162 deactivate_traps_vhe();
161 write_sysreg(0, pmuserenr_el0); 163 else
164 __deactivate_traps_nvhe();
165}
166
167void activate_traps_vhe_load(struct kvm_vcpu *vcpu)
168{
169 __activate_traps_common(vcpu);
170}
171
172void deactivate_traps_vhe_put(void)
173{
174 u64 mdcr_el2 = read_sysreg(mdcr_el2);
175
176 mdcr_el2 &= MDCR_EL2_HPMN_MASK |
177 MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT |
178 MDCR_EL2_TPMS;
179
180 write_sysreg(mdcr_el2, mdcr_el2);
181
182 __deactivate_traps_common();
162} 183}
163 184
164static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) 185static void __hyp_text __activate_vm(struct kvm *kvm)
165{ 186{
166 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
167 write_sysreg(kvm->arch.vttbr, vttbr_el2); 187 write_sysreg(kvm->arch.vttbr, vttbr_el2);
168} 188}
169 189
@@ -172,29 +192,22 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
172 write_sysreg(0, vttbr_el2); 192 write_sysreg(0, vttbr_el2);
173} 193}
174 194
175static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) 195/* Save VGICv3 state on non-VHE systems */
196static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
176{ 197{
177 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) 198 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
178 __vgic_v3_save_state(vcpu); 199 __vgic_v3_save_state(vcpu);
179 else 200 __vgic_v3_deactivate_traps(vcpu);
180 __vgic_v2_save_state(vcpu); 201 }
181
182 write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
183} 202}
184 203
185static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) 204/* Restore VGICv3 state on non-VHE systems */
205static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
186{ 206{
187 u64 val; 207 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
188 208 __vgic_v3_activate_traps(vcpu);
189 val = read_sysreg(hcr_el2);
190 val |= HCR_INT_OVERRIDE;
191 val |= vcpu->arch.irq_lines;
192 write_sysreg(val, hcr_el2);
193
194 if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
195 __vgic_v3_restore_state(vcpu); 209 __vgic_v3_restore_state(vcpu);
196 else 210 }
197 __vgic_v2_restore_state(vcpu);
198} 211}
199 212
200static bool __hyp_text __true_value(void) 213static bool __hyp_text __true_value(void)
@@ -305,54 +318,27 @@ static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu)
305 } 318 }
306} 319}
307 320
308int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu) 321/*
322 * Return true when we were able to fixup the guest exit and should return to
323 * the guest, false when we should restore the host state and return to the
324 * main run loop.
325 */
326static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
309{ 327{
310 struct kvm_cpu_context *host_ctxt; 328 if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
311 struct kvm_cpu_context *guest_ctxt;
312 bool fp_enabled;
313 u64 exit_code;
314
315 vcpu = kern_hyp_va(vcpu);
316
317 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
318 host_ctxt->__hyp_running_vcpu = vcpu;
319 guest_ctxt = &vcpu->arch.ctxt;
320
321 __sysreg_save_host_state(host_ctxt);
322 __debug_cond_save_host_state(vcpu);
323
324 __activate_traps(vcpu);
325 __activate_vm(vcpu);
326
327 __vgic_restore_state(vcpu);
328 __timer_enable_traps(vcpu);
329
330 /*
331 * We must restore the 32-bit state before the sysregs, thanks
332 * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
333 */
334 __sysreg32_restore_state(vcpu);
335 __sysreg_restore_guest_state(guest_ctxt);
336 __debug_restore_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
337
338 /* Jump in the fire! */
339again:
340 exit_code = __guest_enter(vcpu, host_ctxt);
341 /* And we're baaack! */
342
343 if (ARM_EXCEPTION_CODE(exit_code) != ARM_EXCEPTION_IRQ)
344 vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr); 329 vcpu->arch.fault.esr_el2 = read_sysreg_el2(esr);
330
345 /* 331 /*
346 * We're using the raw exception code in order to only process 332 * We're using the raw exception code in order to only process
347 * the trap if no SError is pending. We will come back to the 333 * the trap if no SError is pending. We will come back to the
348 * same PC once the SError has been injected, and replay the 334 * same PC once the SError has been injected, and replay the
349 * trapping instruction. 335 * trapping instruction.
350 */ 336 */
351 if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu)) 337 if (*exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
352 goto again; 338 return true;
353 339
354 if (static_branch_unlikely(&vgic_v2_cpuif_trap) && 340 if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
355 exit_code == ARM_EXCEPTION_TRAP) { 341 *exit_code == ARM_EXCEPTION_TRAP) {
356 bool valid; 342 bool valid;
357 343
358 valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && 344 valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW &&
@@ -366,9 +352,9 @@ again:
366 352
367 if (ret == 1) { 353 if (ret == 1) {
368 if (__skip_instr(vcpu)) 354 if (__skip_instr(vcpu))
369 goto again; 355 return true;
370 else 356 else
371 exit_code = ARM_EXCEPTION_TRAP; 357 *exit_code = ARM_EXCEPTION_TRAP;
372 } 358 }
373 359
374 if (ret == -1) { 360 if (ret == -1) {
@@ -380,29 +366,112 @@ again:
380 */ 366 */
381 if (!__skip_instr(vcpu)) 367 if (!__skip_instr(vcpu))
382 *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; 368 *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
383 exit_code = ARM_EXCEPTION_EL1_SERROR; 369 *exit_code = ARM_EXCEPTION_EL1_SERROR;
384 } 370 }
385
386 /* 0 falls through to be handler out of EL2 */
387 } 371 }
388 } 372 }
389 373
390 if (static_branch_unlikely(&vgic_v3_cpuif_trap) && 374 if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
391 exit_code == ARM_EXCEPTION_TRAP && 375 *exit_code == ARM_EXCEPTION_TRAP &&
392 (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || 376 (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 ||
393 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { 377 kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) {
394 int ret = __vgic_v3_perform_cpuif_access(vcpu); 378 int ret = __vgic_v3_perform_cpuif_access(vcpu);
395 379
396 if (ret == 1) { 380 if (ret == 1) {
397 if (__skip_instr(vcpu)) 381 if (__skip_instr(vcpu))
398 goto again; 382 return true;
399 else 383 else
400 exit_code = ARM_EXCEPTION_TRAP; 384 *exit_code = ARM_EXCEPTION_TRAP;
401 } 385 }
386 }
402 387
403 /* 0 falls through to be handled out of EL2 */ 388 /* Return to the host kernel and handle the exit */
389 return false;
390}
391
392/* Switch to the guest for VHE systems running in EL2 */
393int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
394{
395 struct kvm_cpu_context *host_ctxt;
396 struct kvm_cpu_context *guest_ctxt;
397 bool fp_enabled;
398 u64 exit_code;
399
400 host_ctxt = vcpu->arch.host_cpu_context;
401 host_ctxt->__hyp_running_vcpu = vcpu;
402 guest_ctxt = &vcpu->arch.ctxt;
403
404 sysreg_save_host_state_vhe(host_ctxt);
405
406 __activate_traps(vcpu);
407 __activate_vm(vcpu->kvm);
408
409 sysreg_restore_guest_state_vhe(guest_ctxt);
410 __debug_switch_to_guest(vcpu);
411
412 do {
413 /* Jump in the fire! */
414 exit_code = __guest_enter(vcpu, host_ctxt);
415
416 /* And we're baaack! */
417 } while (fixup_guest_exit(vcpu, &exit_code));
418
419 fp_enabled = fpsimd_enabled_vhe();
420
421 sysreg_save_guest_state_vhe(guest_ctxt);
422
423 __deactivate_traps(vcpu);
424
425 sysreg_restore_host_state_vhe(host_ctxt);
426
427 if (fp_enabled) {
428 __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
429 __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
430 __fpsimd_save_fpexc32(vcpu);
404 } 431 }
405 432
433 __debug_switch_to_host(vcpu);
434
435 return exit_code;
436}
437
438/* Switch to the guest for legacy non-VHE systems */
439int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu)
440{
441 struct kvm_cpu_context *host_ctxt;
442 struct kvm_cpu_context *guest_ctxt;
443 bool fp_enabled;
444 u64 exit_code;
445
446 vcpu = kern_hyp_va(vcpu);
447
448 host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
449 host_ctxt->__hyp_running_vcpu = vcpu;
450 guest_ctxt = &vcpu->arch.ctxt;
451
452 __sysreg_save_state_nvhe(host_ctxt);
453
454 __activate_traps(vcpu);
455 __activate_vm(kern_hyp_va(vcpu->kvm));
456
457 __hyp_vgic_restore_state(vcpu);
458 __timer_enable_traps(vcpu);
459
460 /*
461 * We must restore the 32-bit state before the sysregs, thanks
462 * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
463 */
464 __sysreg32_restore_state(vcpu);
465 __sysreg_restore_state_nvhe(guest_ctxt);
466 __debug_switch_to_guest(vcpu);
467
468 do {
469 /* Jump in the fire! */
470 exit_code = __guest_enter(vcpu, host_ctxt);
471
472 /* And we're baaack! */
473 } while (fixup_guest_exit(vcpu, &exit_code));
474
406 if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) { 475 if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) {
407 u32 midr = read_cpuid_id(); 476 u32 midr = read_cpuid_id();
408 477
@@ -413,29 +482,29 @@ again:
413 } 482 }
414 } 483 }
415 484
416 fp_enabled = __fpsimd_enabled(); 485 fp_enabled = __fpsimd_enabled_nvhe();
417 486
418 __sysreg_save_guest_state(guest_ctxt); 487 __sysreg_save_state_nvhe(guest_ctxt);
419 __sysreg32_save_state(vcpu); 488 __sysreg32_save_state(vcpu);
420 __timer_disable_traps(vcpu); 489 __timer_disable_traps(vcpu);
421 __vgic_save_state(vcpu); 490 __hyp_vgic_save_state(vcpu);
422 491
423 __deactivate_traps(vcpu); 492 __deactivate_traps(vcpu);
424 __deactivate_vm(vcpu); 493 __deactivate_vm(vcpu);
425 494
426 __sysreg_restore_host_state(host_ctxt); 495 __sysreg_restore_state_nvhe(host_ctxt);
427 496
428 if (fp_enabled) { 497 if (fp_enabled) {
429 __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs); 498 __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
430 __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs); 499 __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
500 __fpsimd_save_fpexc32(vcpu);
431 } 501 }
432 502
433 __debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
434 /* 503 /*
435 * This must come after restoring the host sysregs, since a non-VHE 504 * This must come after restoring the host sysregs, since a non-VHE
436 * system may enable SPE here and make use of the TTBRs. 505 * system may enable SPE here and make use of the TTBRs.
437 */ 506 */
438 __debug_cond_restore_host_state(vcpu); 507 __debug_switch_to_host(vcpu);
439 508
440 return exit_code; 509 return exit_code;
441} 510}
@@ -443,10 +512,20 @@ again:
443static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; 512static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
444 513
445static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par, 514static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par,
446 struct kvm_vcpu *vcpu) 515 struct kvm_cpu_context *__host_ctxt)
447{ 516{
517 struct kvm_vcpu *vcpu;
448 unsigned long str_va; 518 unsigned long str_va;
449 519
520 vcpu = __host_ctxt->__hyp_running_vcpu;
521
522 if (read_sysreg(vttbr_el2)) {
523 __timer_disable_traps(vcpu);
524 __deactivate_traps(vcpu);
525 __deactivate_vm(vcpu);
526 __sysreg_restore_state_nvhe(__host_ctxt);
527 }
528
450 /* 529 /*
451 * Force the panic string to be loaded from the literal pool, 530 * Force the panic string to be loaded from the literal pool,
452 * making sure it is a kernel address and not a PC-relative 531 * making sure it is a kernel address and not a PC-relative
@@ -460,40 +539,31 @@ static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par,
460 read_sysreg(hpfar_el2), par, vcpu); 539 read_sysreg(hpfar_el2), par, vcpu);
461} 540}
462 541
463static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par, 542static void __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par,
464 struct kvm_vcpu *vcpu) 543 struct kvm_cpu_context *host_ctxt)
465{ 544{
545 struct kvm_vcpu *vcpu;
546 vcpu = host_ctxt->__hyp_running_vcpu;
547
548 __deactivate_traps(vcpu);
549 sysreg_restore_host_state_vhe(host_ctxt);
550
466 panic(__hyp_panic_string, 551 panic(__hyp_panic_string,
467 spsr, elr, 552 spsr, elr,
468 read_sysreg_el2(esr), read_sysreg_el2(far), 553 read_sysreg_el2(esr), read_sysreg_el2(far),
469 read_sysreg(hpfar_el2), par, vcpu); 554 read_sysreg(hpfar_el2), par, vcpu);
470} 555}
471 556
472static hyp_alternate_select(__hyp_call_panic, 557void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
473 __hyp_call_panic_nvhe, __hyp_call_panic_vhe,
474 ARM64_HAS_VIRT_HOST_EXTN);
475
476void __hyp_text __noreturn hyp_panic(struct kvm_cpu_context *__host_ctxt)
477{ 558{
478 struct kvm_vcpu *vcpu = NULL;
479
480 u64 spsr = read_sysreg_el2(spsr); 559 u64 spsr = read_sysreg_el2(spsr);
481 u64 elr = read_sysreg_el2(elr); 560 u64 elr = read_sysreg_el2(elr);
482 u64 par = read_sysreg(par_el1); 561 u64 par = read_sysreg(par_el1);
483 562
484 if (read_sysreg(vttbr_el2)) { 563 if (!has_vhe())
485 struct kvm_cpu_context *host_ctxt; 564 __hyp_call_panic_nvhe(spsr, elr, par, host_ctxt);
486 565 else
487 host_ctxt = kern_hyp_va(__host_ctxt); 566 __hyp_call_panic_vhe(spsr, elr, par, host_ctxt);
488 vcpu = host_ctxt->__hyp_running_vcpu;
489 __timer_disable_traps(vcpu);
490 __deactivate_traps(vcpu);
491 __deactivate_vm(vcpu);
492 __sysreg_restore_host_state(host_ctxt);
493 }
494
495 /* Call panic for real */
496 __hyp_call_panic()(spsr, elr, par, vcpu);
497 567
498 unreachable(); 568 unreachable();
499} 569}
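
The switch.c rework above replaces the old "goto again" label with a fixup_guest_exit() helper shared by the VHE and non-VHE run loops: __guest_enter() is retried for as long as the exit could be handled entirely in EL2. A stand-alone sketch of just that control flow, with toy names standing in for the kernel's types and exit codes (illustrative only, not the kernel implementation):

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy exit codes standing in for ARM_EXCEPTION_*; illustrative only. */
  enum { TOY_EXIT_IRQ, TOY_EXIT_TRAP };

  /* Pretend the guest exits once with a trap we can fix up, then with an IRQ. */
  static unsigned long toy_guest_enter(void)
  {
          static int calls;
          return calls++ ? TOY_EXIT_IRQ : TOY_EXIT_TRAP;
  }

  /*
   * Same contract as fixup_guest_exit() above: true means "re-enter the
   * guest now", false means "return to the host run loop with *exit_code".
   */
  static bool toy_fixup_guest_exit(unsigned long *exit_code)
  {
          return *exit_code == TOY_EXIT_TRAP; /* e.g. emulated and skipped */
  }

  int main(void)
  {
          unsigned long exit_code;

          do {
                  exit_code = toy_guest_enter();
          } while (toy_fixup_guest_exit(&exit_code));

          printf("back in the host run loop, exit code %lu\n", exit_code);
          return 0;
  }
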
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 2c17afd2be96..b3894df6bf1a 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -19,32 +19,43 @@
19#include <linux/kvm_host.h> 19#include <linux/kvm_host.h>
20 20
21#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
22#include <asm/kvm_emulate.h>
22#include <asm/kvm_hyp.h> 23#include <asm/kvm_hyp.h>
23 24
24/* Yes, this does nothing, on purpose */
25static void __hyp_text __sysreg_do_nothing(struct kvm_cpu_context *ctxt) { }
26
27/* 25/*
28 * Non-VHE: Both host and guest must save everything. 26 * Non-VHE: Both host and guest must save everything.
29 * 27 *
30 * VHE: Host must save tpidr*_el0, actlr_el1, mdscr_el1, sp_el0, 28 * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate,
31 * and guest must save everything. 29 * which are handled as part of the el2 return state) on every switch.
30 * tpidr_el0 and tpidrro_el0 only need to be switched when going
31 * to host userspace or a different VCPU. EL1 registers only need to be
32 * switched when potentially going to run a different VCPU. The latter two
33 * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put.
32 */ 34 */
33 35
34static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt) 36static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
35{ 37{
36 ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1);
37 ctxt->sys_regs[TPIDR_EL0] = read_sysreg(tpidr_el0);
38 ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0);
39 ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1); 38 ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1);
39
40 /*
41 * The host arm64 Linux uses sp_el0 to point to 'current' and it must
42 * therefore be saved/restored on every entry/exit to/from the guest.
43 */
40 ctxt->gp_regs.regs.sp = read_sysreg(sp_el0); 44 ctxt->gp_regs.regs.sp = read_sysreg(sp_el0);
41} 45}
42 46
43static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) 47static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
48{
49 ctxt->sys_regs[TPIDR_EL0] = read_sysreg(tpidr_el0);
50 ctxt->sys_regs[TPIDRRO_EL0] = read_sysreg(tpidrro_el0);
51}
52
53static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
44{ 54{
45 ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2); 55 ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2);
46 ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); 56 ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1);
47 ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); 57 ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr);
58 ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1);
48 ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(cpacr); 59 ctxt->sys_regs[CPACR_EL1] = read_sysreg_el1(cpacr);
49 ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(ttbr0); 60 ctxt->sys_regs[TTBR0_EL1] = read_sysreg_el1(ttbr0);
50 ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(ttbr1); 61 ctxt->sys_regs[TTBR1_EL1] = read_sysreg_el1(ttbr1);
@@ -64,6 +75,10 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
64 ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1); 75 ctxt->gp_regs.sp_el1 = read_sysreg(sp_el1);
65 ctxt->gp_regs.elr_el1 = read_sysreg_el1(elr); 76 ctxt->gp_regs.elr_el1 = read_sysreg_el1(elr);
66 ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr); 77 ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr);
78}
79
80static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
81{
67 ctxt->gp_regs.regs.pc = read_sysreg_el2(elr); 82 ctxt->gp_regs.regs.pc = read_sysreg_el2(elr);
68 ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr); 83 ctxt->gp_regs.regs.pstate = read_sysreg_el2(spsr);
69 84
@@ -71,36 +86,48 @@ static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
71 ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2); 86 ctxt->sys_regs[DISR_EL1] = read_sysreg_s(SYS_VDISR_EL2);
72} 87}
73 88
74static hyp_alternate_select(__sysreg_call_save_host_state, 89void __hyp_text __sysreg_save_state_nvhe(struct kvm_cpu_context *ctxt)
75 __sysreg_save_state, __sysreg_do_nothing, 90{
76 ARM64_HAS_VIRT_HOST_EXTN); 91 __sysreg_save_el1_state(ctxt);
92 __sysreg_save_common_state(ctxt);
93 __sysreg_save_user_state(ctxt);
94 __sysreg_save_el2_return_state(ctxt);
95}
77 96
78void __hyp_text __sysreg_save_host_state(struct kvm_cpu_context *ctxt) 97void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt)
79{ 98{
80 __sysreg_call_save_host_state()(ctxt);
81 __sysreg_save_common_state(ctxt); 99 __sysreg_save_common_state(ctxt);
82} 100}
83 101
84void __hyp_text __sysreg_save_guest_state(struct kvm_cpu_context *ctxt) 102void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt)
85{ 103{
86 __sysreg_save_state(ctxt);
87 __sysreg_save_common_state(ctxt); 104 __sysreg_save_common_state(ctxt);
105 __sysreg_save_el2_return_state(ctxt);
88} 106}
89 107
90static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) 108static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
91{ 109{
92 write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1);
93 write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0);
94 write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
95 write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1); 110 write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1);
111
112 /*
113 * The host arm64 Linux uses sp_el0 to point to 'current' and it must
114 * therefore be saved/restored on every entry/exit to/from the guest.
115 */
96 write_sysreg(ctxt->gp_regs.regs.sp, sp_el0); 116 write_sysreg(ctxt->gp_regs.regs.sp, sp_el0);
97} 117}
98 118
99static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt) 119static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
120{
121 write_sysreg(ctxt->sys_regs[TPIDR_EL0], tpidr_el0);
122 write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
123}
124
125static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
100{ 126{
101 write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2); 127 write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2);
102 write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1); 128 write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1);
103 write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], sctlr); 129 write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], sctlr);
130 write_sysreg(ctxt->sys_regs[ACTLR_EL1], actlr_el1);
104 write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], cpacr); 131 write_sysreg_el1(ctxt->sys_regs[CPACR_EL1], cpacr);
105 write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], ttbr0); 132 write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1], ttbr0);
106 write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], ttbr1); 133 write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1], ttbr1);
@@ -120,6 +147,11 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
120 write_sysreg(ctxt->gp_regs.sp_el1, sp_el1); 147 write_sysreg(ctxt->gp_regs.sp_el1, sp_el1);
121 write_sysreg_el1(ctxt->gp_regs.elr_el1, elr); 148 write_sysreg_el1(ctxt->gp_regs.elr_el1, elr);
122 write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr); 149 write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr);
150}
151
152static void __hyp_text
153__sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
154{
123 write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); 155 write_sysreg_el2(ctxt->gp_regs.regs.pc, elr);
124 write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); 156 write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
125 157
@@ -127,27 +159,30 @@ static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
127 write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); 159 write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
128} 160}
129 161
130static hyp_alternate_select(__sysreg_call_restore_host_state, 162void __hyp_text __sysreg_restore_state_nvhe(struct kvm_cpu_context *ctxt)
131 __sysreg_restore_state, __sysreg_do_nothing, 163{
132 ARM64_HAS_VIRT_HOST_EXTN); 164 __sysreg_restore_el1_state(ctxt);
165 __sysreg_restore_common_state(ctxt);
166 __sysreg_restore_user_state(ctxt);
167 __sysreg_restore_el2_return_state(ctxt);
168}
133 169
134void __hyp_text __sysreg_restore_host_state(struct kvm_cpu_context *ctxt) 170void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt)
135{ 171{
136 __sysreg_call_restore_host_state()(ctxt);
137 __sysreg_restore_common_state(ctxt); 172 __sysreg_restore_common_state(ctxt);
138} 173}
139 174
140void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt) 175void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
141{ 176{
142 __sysreg_restore_state(ctxt);
143 __sysreg_restore_common_state(ctxt); 177 __sysreg_restore_common_state(ctxt);
178 __sysreg_restore_el2_return_state(ctxt);
144} 179}
145 180
146void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu) 181void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
147{ 182{
148 u64 *spsr, *sysreg; 183 u64 *spsr, *sysreg;
149 184
150 if (read_sysreg(hcr_el2) & HCR_RW) 185 if (!vcpu_el1_is_32bit(vcpu))
151 return; 186 return;
152 187
153 spsr = vcpu->arch.ctxt.gp_regs.spsr; 188 spsr = vcpu->arch.ctxt.gp_regs.spsr;
@@ -161,10 +196,7 @@ void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
161 sysreg[DACR32_EL2] = read_sysreg(dacr32_el2); 196 sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
162 sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2); 197 sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
163 198
164 if (__fpsimd_enabled()) 199 if (has_vhe() || vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
165 sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
166
167 if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
168 sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2); 200 sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
169} 201}
170 202
@@ -172,7 +204,7 @@ void __hyp_text __sysreg32_restore_state(struct kvm_vcpu *vcpu)
172{ 204{
173 u64 *spsr, *sysreg; 205 u64 *spsr, *sysreg;
174 206
175 if (read_sysreg(hcr_el2) & HCR_RW) 207 if (!vcpu_el1_is_32bit(vcpu))
176 return; 208 return;
177 209
178 spsr = vcpu->arch.ctxt.gp_regs.spsr; 210 spsr = vcpu->arch.ctxt.gp_regs.spsr;
@@ -186,6 +218,78 @@ void __hyp_text __sysreg32_restore_state(struct kvm_vcpu *vcpu)
186 write_sysreg(sysreg[DACR32_EL2], dacr32_el2); 218 write_sysreg(sysreg[DACR32_EL2], dacr32_el2);
187 write_sysreg(sysreg[IFSR32_EL2], ifsr32_el2); 219 write_sysreg(sysreg[IFSR32_EL2], ifsr32_el2);
188 220
189 if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY) 221 if (has_vhe() || vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
190 write_sysreg(sysreg[DBGVCR32_EL2], dbgvcr32_el2); 222 write_sysreg(sysreg[DBGVCR32_EL2], dbgvcr32_el2);
191} 223}
224
225/**
226 * kvm_vcpu_load_sysregs - Load guest system registers to the physical CPU
227 *
228 * @vcpu: The VCPU pointer
229 *
230 * Load system registers that do not affect the host's execution, for
231 * example EL1 system registers on a VHE system where the host kernel
232 * runs at EL2. This function is called from KVM's vcpu_load() function
233 * and loading system register state early avoids having to load them on
234 * every entry to the VM.
235 */
236void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
237{
238 struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
239 struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
240
241 if (!has_vhe())
242 return;
243
244 __sysreg_save_user_state(host_ctxt);
245
246 /*
247 * Load guest EL1 and user state
248 *
249 * We must restore the 32-bit state before the sysregs, thanks
250 * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
251 */
252 __sysreg32_restore_state(vcpu);
253 __sysreg_restore_user_state(guest_ctxt);
254 __sysreg_restore_el1_state(guest_ctxt);
255
256 vcpu->arch.sysregs_loaded_on_cpu = true;
257
258 activate_traps_vhe_load(vcpu);
259}
260
261/**
262 * kvm_vcpu_put_sysregs - Restore host system registers to the physical CPU
263 *
264 * @vcpu: The VCPU pointer
265 *
266 * Save guest system registers that do not affect the host's execution, for
267 * example EL1 system registers on a VHE system where the host kernel
268 * runs at EL2. This function is called from KVM's vcpu_put() function
269 * and deferring saving system register state until we're no longer running the
270 * VCPU avoids having to save them on every exit from the VM.
271 */
272void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
273{
274 struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
275 struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
276
277 if (!has_vhe())
278 return;
279
280 deactivate_traps_vhe_put();
281
282 __sysreg_save_el1_state(guest_ctxt);
283 __sysreg_save_user_state(guest_ctxt);
284 __sysreg32_save_state(vcpu);
285
286 /* Restore host user state */
287 __sysreg_restore_user_state(host_ctxt);
288
289 vcpu->arch.sysregs_loaded_on_cpu = false;
290}
291
292void __hyp_text __kvm_set_tpidr_el2(u64 tpidr_el2)
293{
294 asm("msr tpidr_el2, %0": : "r" (tpidr_el2));
295}
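
The sysreg-sr.c split above separates EL1, user, common and EL2-return register classes so that, on VHE, the EL1 and user state is switched in kvm_vcpu_load_sysregs()/kvm_vcpu_put_sysregs() instead of on every guest entry and exit. A minimal stand-alone sketch of that bookkeeping, with toy structures rather than the kernel's (illustrative only):

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy register file split into the classes used above; illustrative only. */
  struct toy_ctxt {
          unsigned long el1_state;    /* deferred to vcpu_load/vcpu_put on VHE */
          unsigned long user_state;   /* tpidr*_el0: also deferred to load/put */
          unsigned long common_state; /* mdscr_el1, sp_el0: every world switch */
  };

  struct toy_vcpu {
          struct toy_ctxt guest;
          bool sysregs_loaded_on_cpu;
  };

  static struct toy_ctxt hw; /* stands in for the physical CPU registers */

  static void toy_vcpu_load_sysregs(struct toy_vcpu *vcpu)
  {
          /* VHE: put guest EL1/user state on the CPU once, up front. */
          hw.el1_state  = vcpu->guest.el1_state;
          hw.user_state = vcpu->guest.user_state;
          vcpu->sysregs_loaded_on_cpu = true;
  }

  static void toy_world_switch(struct toy_vcpu *vcpu)
  {
          /* Only the "common" class still moves on every entry/exit. */
          hw.common_state = vcpu->guest.common_state;
          /* ... run the guest ... */
          vcpu->guest.common_state = hw.common_state;
  }

  static void toy_vcpu_put_sysregs(struct toy_vcpu *vcpu)
  {
          vcpu->guest.el1_state  = hw.el1_state;
          vcpu->guest.user_state = hw.user_state;
          vcpu->sysregs_loaded_on_cpu = false;
  }

  int main(void)
  {
          struct toy_vcpu vcpu = { .guest = { 1, 2, 3 } };

          toy_vcpu_load_sysregs(&vcpu);
          toy_world_switch(&vcpu); /* may run many times per load/put */
          toy_vcpu_put_sysregs(&vcpu);
          printf("loaded=%d\n", vcpu.sysregs_loaded_on_cpu);
          return 0;
  }
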
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
new file mode 100644
index 000000000000..86801b6055d6
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -0,0 +1,78 @@
1/*
2 * Copyright (C) 2012-2015 - ARM Ltd
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/compiler.h>
19#include <linux/irqchip/arm-gic.h>
20#include <linux/kvm_host.h>
21
22#include <asm/kvm_emulate.h>
23#include <asm/kvm_hyp.h>
24#include <asm/kvm_mmu.h>
25
26/*
27 * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
28 * guest.
29 *
30 * @vcpu: the offending vcpu
31 *
32 * Returns:
33 * 1: GICV access successfully performed
34 * 0: Not a GICV access
35 * -1: Illegal GICV access
36 */
37int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
38{
39 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
40 struct vgic_dist *vgic = &kvm->arch.vgic;
41 phys_addr_t fault_ipa;
42 void __iomem *addr;
43 int rd;
44
45 /* Build the full address */
46 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
47 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
48
49 /* If not for GICV, move on */
50 if (fault_ipa < vgic->vgic_cpu_base ||
51 fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
52 return 0;
53
54 /* Reject anything but a 32bit access */
55 if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
56 return -1;
57
58 /* Not aligned? Don't bother */
59 if (fault_ipa & 3)
60 return -1;
61
62 rd = kvm_vcpu_dabt_get_rd(vcpu);
63 addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va;
64 addr += fault_ipa - vgic->vgic_cpu_base;
65
66 if (kvm_vcpu_dabt_iswrite(vcpu)) {
67 u32 data = vcpu_data_guest_to_host(vcpu,
68 vcpu_get_reg(vcpu, rd),
69 sizeof(u32));
70 writel_relaxed(data, addr);
71 } else {
72 u32 data = readl_relaxed(addr);
73 vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
74 sizeof(u32)));
75 }
76
77 return 1;
78}
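
The 1/0/-1 return convention documented in the new file above is consumed by fixup_guest_exit() in the switch.c hunks earlier: 1 skips the faulting instruction and re-enters the guest, -1 turns the exit into an error, and 0 falls through to be handled out of EL2. A compressed sketch of that dispatch with toy helpers (illustrative only, not the kernel code path):

  #include <stdio.h>

  /* Toy return convention mirroring the comment above; illustrative only. */
  #define GICV_HANDLED    1   /* access emulated, re-enter the guest */
  #define GICV_NOT_GICV   0   /* not a GICV access, let the host handle it */
  #define GICV_ILLEGAL   -1   /* malformed access, report an error */

  static int toy_perform_cpuif_access(void) { return GICV_HANDLED; }

  int main(void)
  {
          switch (toy_perform_cpuif_access()) {
          case GICV_HANDLED:
                  puts("skip the faulting instruction and resume the guest");
                  break;
          case GICV_ILLEGAL:
                  puts("turn the exit into an error for the guest");
                  break;
          default:
                  puts("fall through: handle the exit out of EL2");
          }
          return 0;
  }
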
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 60666a056944..d8e71659ba7e 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -58,7 +58,7 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
58 exc_offset = LOWER_EL_AArch32_VECTOR; 58 exc_offset = LOWER_EL_AArch32_VECTOR;
59 } 59 }
60 60
61 return vcpu_sys_reg(vcpu, VBAR_EL1) + exc_offset + type; 61 return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
62} 62}
63 63
64static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) 64static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -67,13 +67,13 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
67 bool is_aarch32 = vcpu_mode_is_32bit(vcpu); 67 bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
68 u32 esr = 0; 68 u32 esr = 0;
69 69
70 *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu); 70 vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
71 *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); 71 *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
72 72
73 *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; 73 *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
74 *vcpu_spsr(vcpu) = cpsr; 74 vcpu_write_spsr(vcpu, cpsr);
75 75
76 vcpu_sys_reg(vcpu, FAR_EL1) = addr; 76 vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
77 77
78 /* 78 /*
79 * Build an {i,d}abort, depending on the level and the 79 * Build an {i,d}abort, depending on the level and the
@@ -94,7 +94,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
94 if (!is_iabt) 94 if (!is_iabt)
95 esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; 95 esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
96 96
97 vcpu_sys_reg(vcpu, ESR_EL1) = esr | ESR_ELx_FSC_EXTABT; 97 vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1);
98} 98}
99 99
100static void inject_undef64(struct kvm_vcpu *vcpu) 100static void inject_undef64(struct kvm_vcpu *vcpu)
@@ -102,11 +102,11 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
102 unsigned long cpsr = *vcpu_cpsr(vcpu); 102 unsigned long cpsr = *vcpu_cpsr(vcpu);
103 u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); 103 u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
104 104
105 *vcpu_elr_el1(vcpu) = *vcpu_pc(vcpu); 105 vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
106 *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); 106 *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
107 107
108 *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; 108 *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64;
109 *vcpu_spsr(vcpu) = cpsr; 109 vcpu_write_spsr(vcpu, cpsr);
110 110
111 /* 111 /*
112 * Build an unknown exception, depending on the instruction 112 * Build an unknown exception, depending on the instruction
@@ -115,7 +115,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
115 if (kvm_vcpu_trap_il_is32bit(vcpu)) 115 if (kvm_vcpu_trap_il_is32bit(vcpu))
116 esr |= ESR_ELx_IL; 116 esr |= ESR_ELx_IL;
117 117
118 vcpu_sys_reg(vcpu, ESR_EL1) = esr; 118 vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
119} 119}
120 120
121/** 121/**
@@ -128,7 +128,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
128 */ 128 */
129void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) 129void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
130{ 130{
131 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 131 if (vcpu_el1_is_32bit(vcpu))
132 kvm_inject_dabt32(vcpu, addr); 132 kvm_inject_dabt32(vcpu, addr);
133 else 133 else
134 inject_abt64(vcpu, false, addr); 134 inject_abt64(vcpu, false, addr);
@@ -144,7 +144,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
144 */ 144 */
145void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) 145void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
146{ 146{
147 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 147 if (vcpu_el1_is_32bit(vcpu))
148 kvm_inject_pabt32(vcpu, addr); 148 kvm_inject_pabt32(vcpu, addr);
149 else 149 else
150 inject_abt64(vcpu, true, addr); 150 inject_abt64(vcpu, true, addr);
@@ -158,7 +158,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
158 */ 158 */
159void kvm_inject_undefined(struct kvm_vcpu *vcpu) 159void kvm_inject_undefined(struct kvm_vcpu *vcpu)
160{ 160{
161 if (!(vcpu->arch.hcr_el2 & HCR_RW)) 161 if (vcpu_el1_is_32bit(vcpu))
162 kvm_inject_undef32(vcpu); 162 kvm_inject_undef32(vcpu);
163 else 163 else
164 inject_undef64(vcpu); 164 inject_undef64(vcpu);
@@ -167,7 +167,7 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
167static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr) 167static void pend_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
168{ 168{
169 vcpu_set_vsesr(vcpu, esr); 169 vcpu_set_vsesr(vcpu, esr);
170 vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) | HCR_VSE); 170 *vcpu_hcr(vcpu) |= HCR_VSE;
171} 171}
172 172
173/** 173/**
diff --git a/arch/arm64/kvm/regmap.c b/arch/arm64/kvm/regmap.c
index bbc6ae32e4af..eefe403a2e63 100644
--- a/arch/arm64/kvm/regmap.c
+++ b/arch/arm64/kvm/regmap.c
@@ -141,28 +141,61 @@ unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num)
141/* 141/*
142 * Return the SPSR for the current mode of the virtual CPU. 142 * Return the SPSR for the current mode of the virtual CPU.
143 */ 143 */
144unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu) 144static int vcpu_spsr32_mode(const struct kvm_vcpu *vcpu)
145{ 145{
146 unsigned long mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK; 146 unsigned long mode = *vcpu_cpsr(vcpu) & COMPAT_PSR_MODE_MASK;
147 switch (mode) { 147 switch (mode) {
148 case COMPAT_PSR_MODE_SVC: 148 case COMPAT_PSR_MODE_SVC: return KVM_SPSR_SVC;
149 mode = KVM_SPSR_SVC; 149 case COMPAT_PSR_MODE_ABT: return KVM_SPSR_ABT;
150 break; 150 case COMPAT_PSR_MODE_UND: return KVM_SPSR_UND;
151 case COMPAT_PSR_MODE_ABT: 151 case COMPAT_PSR_MODE_IRQ: return KVM_SPSR_IRQ;
152 mode = KVM_SPSR_ABT; 152 case COMPAT_PSR_MODE_FIQ: return KVM_SPSR_FIQ;
153 break; 153 default: BUG();
154 case COMPAT_PSR_MODE_UND: 154 }
155 mode = KVM_SPSR_UND; 155}
156 break; 156
157 case COMPAT_PSR_MODE_IRQ: 157unsigned long vcpu_read_spsr32(const struct kvm_vcpu *vcpu)
158 mode = KVM_SPSR_IRQ; 158{
159 break; 159 int spsr_idx = vcpu_spsr32_mode(vcpu);
160 case COMPAT_PSR_MODE_FIQ: 160
161 mode = KVM_SPSR_FIQ; 161 if (!vcpu->arch.sysregs_loaded_on_cpu)
162 break; 162 return vcpu_gp_regs(vcpu)->spsr[spsr_idx];
163
164 switch (spsr_idx) {
165 case KVM_SPSR_SVC:
166 return read_sysreg_el1(spsr);
167 case KVM_SPSR_ABT:
168 return read_sysreg(spsr_abt);
169 case KVM_SPSR_UND:
170 return read_sysreg(spsr_und);
171 case KVM_SPSR_IRQ:
172 return read_sysreg(spsr_irq);
173 case KVM_SPSR_FIQ:
174 return read_sysreg(spsr_fiq);
163 default: 175 default:
164 BUG(); 176 BUG();
165 } 177 }
178}
179
180void vcpu_write_spsr32(struct kvm_vcpu *vcpu, unsigned long v)
181{
182 int spsr_idx = vcpu_spsr32_mode(vcpu);
183
184 if (!vcpu->arch.sysregs_loaded_on_cpu) {
185 vcpu_gp_regs(vcpu)->spsr[spsr_idx] = v;
186 return;
187 }
166 188
167 return (unsigned long *)&vcpu_gp_regs(vcpu)->spsr[mode]; 189 switch (spsr_idx) {
190 case KVM_SPSR_SVC:
191 write_sysreg_el1(v, spsr);
192 case KVM_SPSR_ABT:
193 write_sysreg(v, spsr_abt);
194 case KVM_SPSR_UND:
195 write_sysreg(v, spsr_und);
196 case KVM_SPSR_IRQ:
197 write_sysreg(v, spsr_irq);
198 case KVM_SPSR_FIQ:
199 write_sysreg(v, spsr_fiq);
200 }
168} 201}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 50a43c7b97ca..806b0b126a64 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -35,6 +35,7 @@
35#include <asm/kvm_coproc.h> 35#include <asm/kvm_coproc.h>
36#include <asm/kvm_emulate.h> 36#include <asm/kvm_emulate.h>
37#include <asm/kvm_host.h> 37#include <asm/kvm_host.h>
38#include <asm/kvm_hyp.h>
38#include <asm/kvm_mmu.h> 39#include <asm/kvm_mmu.h>
39#include <asm/perf_event.h> 40#include <asm/perf_event.h>
40#include <asm/sysreg.h> 41#include <asm/sysreg.h>
@@ -76,6 +77,93 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
76 return false; 77 return false;
77} 78}
78 79
80u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg)
81{
82 if (!vcpu->arch.sysregs_loaded_on_cpu)
83 goto immediate_read;
84
85 /*
86 * System registers listed in the switch are not saved on every
87 * exit from the guest but are only saved on vcpu_put.
88 *
89 * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
90 * should never be listed below, because the guest cannot modify its
91 * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
92 * thread when emulating cross-VCPU communication.
93 */
94 switch (reg) {
95 case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1);
96 case SCTLR_EL1: return read_sysreg_s(sctlr_EL12);
97 case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1);
98 case CPACR_EL1: return read_sysreg_s(cpacr_EL12);
99 case TTBR0_EL1: return read_sysreg_s(ttbr0_EL12);
100 case TTBR1_EL1: return read_sysreg_s(ttbr1_EL12);
101 case TCR_EL1: return read_sysreg_s(tcr_EL12);
102 case ESR_EL1: return read_sysreg_s(esr_EL12);
103 case AFSR0_EL1: return read_sysreg_s(afsr0_EL12);
104 case AFSR1_EL1: return read_sysreg_s(afsr1_EL12);
105 case FAR_EL1: return read_sysreg_s(far_EL12);
106 case MAIR_EL1: return read_sysreg_s(mair_EL12);
107 case VBAR_EL1: return read_sysreg_s(vbar_EL12);
108 case CONTEXTIDR_EL1: return read_sysreg_s(contextidr_EL12);
109 case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0);
110 case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0);
111 case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1);
112 case AMAIR_EL1: return read_sysreg_s(amair_EL12);
113 case CNTKCTL_EL1: return read_sysreg_s(cntkctl_EL12);
114 case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1);
115 case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2);
116 case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2);
117 case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2);
118 }
119
120immediate_read:
121 return __vcpu_sys_reg(vcpu, reg);
122}
123
124void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
125{
126 if (!vcpu->arch.sysregs_loaded_on_cpu)
127 goto immediate_write;
128
129 /*
130 * System registers listed in the switch are not restored on every
131 * entry to the guest but are only restored on vcpu_load.
132 *
133 * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
134 * should never be listed below, because the MPIDR should only be
135 * set once, before running the VCPU, and never changed later.
136 */
137 switch (reg) {
138 case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return;
139 case SCTLR_EL1: write_sysreg_s(val, sctlr_EL12); return;
140 case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return;
141 case CPACR_EL1: write_sysreg_s(val, cpacr_EL12); return;
142 case TTBR0_EL1: write_sysreg_s(val, ttbr0_EL12); return;
143 case TTBR1_EL1: write_sysreg_s(val, ttbr1_EL12); return;
144 case TCR_EL1: write_sysreg_s(val, tcr_EL12); return;
145 case ESR_EL1: write_sysreg_s(val, esr_EL12); return;
146 case AFSR0_EL1: write_sysreg_s(val, afsr0_EL12); return;
147 case AFSR1_EL1: write_sysreg_s(val, afsr1_EL12); return;
148 case FAR_EL1: write_sysreg_s(val, far_EL12); return;
149 case MAIR_EL1: write_sysreg_s(val, mair_EL12); return;
150 case VBAR_EL1: write_sysreg_s(val, vbar_EL12); return;
151 case CONTEXTIDR_EL1: write_sysreg_s(val, contextidr_EL12); return;
152 case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return;
153 case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return;
154 case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return;
155 case AMAIR_EL1: write_sysreg_s(val, amair_EL12); return;
156 case CNTKCTL_EL1: write_sysreg_s(val, cntkctl_EL12); return;
157 case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return;
158 case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return;
159 case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return;
160 case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return;
161 }
162
163immediate_write:
164 __vcpu_sys_reg(vcpu, reg) = val;
165}
166
79/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ 167/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
80static u32 cache_levels; 168static u32 cache_levels;
81 169
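
The new vcpu_read_sys_reg()/vcpu_write_sys_reg() accessors above choose between the in-memory copy and the live EL12 register according to sysregs_loaded_on_cpu. A stand-alone sketch of that dispatch shape, with toy names standing in for the kernel's register banks (illustrative only):

  #include <stdbool.h>
  #include <stdio.h>

  #define TOY_NR_REGS 4

  /* Toy model: one in-memory copy per register, plus a fake "hardware" bank. */
  static unsigned long mem_copy[TOY_NR_REGS];
  static unsigned long hw_reg[TOY_NR_REGS];
  static bool loaded_on_cpu;

  static unsigned long toy_read_sys_reg(int reg)
  {
          /* Registers deferred to vcpu_put live in hardware while loaded. */
          if (loaded_on_cpu)
                  return hw_reg[reg];
          return mem_copy[reg];
  }

  static void toy_write_sys_reg(unsigned long val, int reg)
  {
          if (loaded_on_cpu)
                  hw_reg[reg] = val;
          else
                  mem_copy[reg] = val;
  }

  int main(void)
  {
          loaded_on_cpu = true;
          toy_write_sys_reg(0xabc, 2);  /* lands in the "hardware" bank */
          printf("reg2 = %#lx\n", toy_read_sys_reg(2));
          return 0;
  }
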
@@ -121,16 +209,26 @@ static bool access_vm_reg(struct kvm_vcpu *vcpu,
121 const struct sys_reg_desc *r) 209 const struct sys_reg_desc *r)
122{ 210{
123 bool was_enabled = vcpu_has_cache_enabled(vcpu); 211 bool was_enabled = vcpu_has_cache_enabled(vcpu);
212 u64 val;
213 int reg = r->reg;
124 214
125 BUG_ON(!p->is_write); 215 BUG_ON(!p->is_write);
126 216
127 if (!p->is_aarch32) { 217 /* See the 32bit mapping in kvm_host.h */
128 vcpu_sys_reg(vcpu, r->reg) = p->regval; 218 if (p->is_aarch32)
219 reg = r->reg / 2;
220
221 if (!p->is_aarch32 || !p->is_32bit) {
222 val = p->regval;
129 } else { 223 } else {
130 if (!p->is_32bit) 224 val = vcpu_read_sys_reg(vcpu, reg);
131 vcpu_cp15_64_high(vcpu, r->reg) = upper_32_bits(p->regval); 225 if (r->reg % 2)
132 vcpu_cp15_64_low(vcpu, r->reg) = lower_32_bits(p->regval); 226 val = (p->regval << 32) | (u64)lower_32_bits(val);
227 else
228 val = ((u64)upper_32_bits(val) << 32) |
229 lower_32_bits(p->regval);
133 } 230 }
231 vcpu_write_sys_reg(vcpu, val, reg);
134 232
135 kvm_toggle_cache(vcpu, was_enabled); 233 kvm_toggle_cache(vcpu, was_enabled);
136 return true; 234 return true;
@@ -175,6 +273,14 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
175 return read_zero(vcpu, p); 273 return read_zero(vcpu, p);
176} 274}
177 275
276static bool trap_undef(struct kvm_vcpu *vcpu,
277 struct sys_reg_params *p,
278 const struct sys_reg_desc *r)
279{
280 kvm_inject_undefined(vcpu);
281 return false;
282}
283
178static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, 284static bool trap_oslsr_el1(struct kvm_vcpu *vcpu,
179 struct sys_reg_params *p, 285 struct sys_reg_params *p,
180 const struct sys_reg_desc *r) 286 const struct sys_reg_desc *r)
@@ -231,10 +337,10 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
231 const struct sys_reg_desc *r) 337 const struct sys_reg_desc *r)
232{ 338{
233 if (p->is_write) { 339 if (p->is_write) {
234 vcpu_sys_reg(vcpu, r->reg) = p->regval; 340 vcpu_write_sys_reg(vcpu, p->regval, r->reg);
235 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY; 341 vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
236 } else { 342 } else {
237 p->regval = vcpu_sys_reg(vcpu, r->reg); 343 p->regval = vcpu_read_sys_reg(vcpu, r->reg);
238 } 344 }
239 345
240 trace_trap_reg(__func__, r->reg, p->is_write, p->regval); 346 trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
@@ -447,7 +553,8 @@ static void reset_wcr(struct kvm_vcpu *vcpu,
447 553
448static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 554static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
449{ 555{
450 vcpu_sys_reg(vcpu, AMAIR_EL1) = read_sysreg(amair_el1); 556 u64 amair = read_sysreg(amair_el1);
557 vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1);
451} 558}
452 559
453static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 560static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
@@ -464,7 +571,7 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
464 mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); 571 mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
465 mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); 572 mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
466 mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); 573 mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
467 vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr; 574 vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1);
468} 575}
469 576
470static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 577static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
@@ -478,12 +585,12 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
478 */ 585 */
479 val = ((pmcr & ~ARMV8_PMU_PMCR_MASK) 586 val = ((pmcr & ~ARMV8_PMU_PMCR_MASK)
480 | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E); 587 | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E);
481 vcpu_sys_reg(vcpu, PMCR_EL0) = val; 588 __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
482} 589}
483 590
484static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) 591static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
485{ 592{
486 u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); 593 u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);
487 bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); 594 bool enabled = (reg & flags) || vcpu_mode_priv(vcpu);
488 595
489 if (!enabled) 596 if (!enabled)
@@ -525,14 +632,14 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
525 632
526 if (p->is_write) { 633 if (p->is_write) {
527 /* Only update writeable bits of PMCR */ 634 /* Only update writeable bits of PMCR */
528 val = vcpu_sys_reg(vcpu, PMCR_EL0); 635 val = __vcpu_sys_reg(vcpu, PMCR_EL0);
529 val &= ~ARMV8_PMU_PMCR_MASK; 636 val &= ~ARMV8_PMU_PMCR_MASK;
530 val |= p->regval & ARMV8_PMU_PMCR_MASK; 637 val |= p->regval & ARMV8_PMU_PMCR_MASK;
531 vcpu_sys_reg(vcpu, PMCR_EL0) = val; 638 __vcpu_sys_reg(vcpu, PMCR_EL0) = val;
532 kvm_pmu_handle_pmcr(vcpu, val); 639 kvm_pmu_handle_pmcr(vcpu, val);
533 } else { 640 } else {
534 /* PMCR.P & PMCR.C are RAZ */ 641 /* PMCR.P & PMCR.C are RAZ */
535 val = vcpu_sys_reg(vcpu, PMCR_EL0) 642 val = __vcpu_sys_reg(vcpu, PMCR_EL0)
536 & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); 643 & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
537 p->regval = val; 644 p->regval = val;
538 } 645 }
@@ -550,10 +657,10 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
550 return false; 657 return false;
551 658
552 if (p->is_write) 659 if (p->is_write)
553 vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; 660 __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
554 else 661 else
555 /* return PMSELR.SEL field */ 662 /* return PMSELR.SEL field */
556 p->regval = vcpu_sys_reg(vcpu, PMSELR_EL0) 663 p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
557 & ARMV8_PMU_COUNTER_MASK; 664 & ARMV8_PMU_COUNTER_MASK;
558 665
559 return true; 666 return true;
@@ -586,7 +693,7 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
586{ 693{
587 u64 pmcr, val; 694 u64 pmcr, val;
588 695
589 pmcr = vcpu_sys_reg(vcpu, PMCR_EL0); 696 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0);
590 val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; 697 val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
591 if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { 698 if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) {
592 kvm_inject_undefined(vcpu); 699 kvm_inject_undefined(vcpu);
@@ -611,7 +718,7 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
611 if (pmu_access_event_counter_el0_disabled(vcpu)) 718 if (pmu_access_event_counter_el0_disabled(vcpu))
612 return false; 719 return false;
613 720
614 idx = vcpu_sys_reg(vcpu, PMSELR_EL0) 721 idx = __vcpu_sys_reg(vcpu, PMSELR_EL0)
615 & ARMV8_PMU_COUNTER_MASK; 722 & ARMV8_PMU_COUNTER_MASK;
616 } else if (r->Op2 == 0) { 723 } else if (r->Op2 == 0) {
617 /* PMCCNTR_EL0 */ 724 /* PMCCNTR_EL0 */
@@ -666,7 +773,7 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
666 773
667 if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { 774 if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
668 /* PMXEVTYPER_EL0 */ 775 /* PMXEVTYPER_EL0 */
669 idx = vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK; 776 idx = __vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
670 reg = PMEVTYPER0_EL0 + idx; 777 reg = PMEVTYPER0_EL0 + idx;
671 } else if (r->CRn == 14 && (r->CRm & 12) == 12) { 778 } else if (r->CRn == 14 && (r->CRm & 12) == 12) {
672 idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); 779 idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
@@ -684,9 +791,9 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
684 791
685 if (p->is_write) { 792 if (p->is_write) {
686 kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); 793 kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
687 vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK; 794 __vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
688 } else { 795 } else {
689 p->regval = vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK; 796 p->regval = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
690 } 797 }
691 798
692 return true; 799 return true;
@@ -708,15 +815,15 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
708 val = p->regval & mask; 815 val = p->regval & mask;
709 if (r->Op2 & 0x1) { 816 if (r->Op2 & 0x1) {
710 /* accessing PMCNTENSET_EL0 */ 817 /* accessing PMCNTENSET_EL0 */
711 vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; 818 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
712 kvm_pmu_enable_counter(vcpu, val); 819 kvm_pmu_enable_counter(vcpu, val);
713 } else { 820 } else {
714 /* accessing PMCNTENCLR_EL0 */ 821 /* accessing PMCNTENCLR_EL0 */
715 vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; 822 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
716 kvm_pmu_disable_counter(vcpu, val); 823 kvm_pmu_disable_counter(vcpu, val);
717 } 824 }
718 } else { 825 } else {
719 p->regval = vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; 826 p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
720 } 827 }
721 828
722 return true; 829 return true;
@@ -740,12 +847,12 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
740 847
741 if (r->Op2 & 0x1) 848 if (r->Op2 & 0x1)
742 /* accessing PMINTENSET_EL1 */ 849 /* accessing PMINTENSET_EL1 */
743 vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; 850 __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
744 else 851 else
745 /* accessing PMINTENCLR_EL1 */ 852 /* accessing PMINTENCLR_EL1 */
746 vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; 853 __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
747 } else { 854 } else {
748 p->regval = vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask; 855 p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
749 } 856 }
750 857
751 return true; 858 return true;
@@ -765,12 +872,12 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
765 if (p->is_write) { 872 if (p->is_write) {
766 if (r->CRm & 0x2) 873 if (r->CRm & 0x2)
767 /* accessing PMOVSSET_EL0 */ 874 /* accessing PMOVSSET_EL0 */
768 vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); 875 __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
769 else 876 else
770 /* accessing PMOVSCLR_EL0 */ 877 /* accessing PMOVSCLR_EL0 */
771 vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); 878 __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
772 } else { 879 } else {
773 p->regval = vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask; 880 p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
774 } 881 }
775 882
776 return true; 883 return true;
@@ -807,10 +914,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
807 return false; 914 return false;
808 } 915 }
809 916
810 vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval 917 __vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
811 & ARMV8_PMU_USERENR_MASK; 918 p->regval & ARMV8_PMU_USERENR_MASK;
812 } else { 919 } else {
813 p->regval = vcpu_sys_reg(vcpu, PMUSERENR_EL0) 920 p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0)
814 & ARMV8_PMU_USERENR_MASK; 921 & ARMV8_PMU_USERENR_MASK;
815 } 922 }
816 923
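
The PMU hunks above change only the backing-store accessor (vcpu_sys_reg -> __vcpu_sys_reg); the masking rules they implement are untouched. As a rough, self-contained illustration of those rules -- a PMCR write keeps only the writable bits, and a read hides the write-only P and C bits -- here is a user-space sketch; the bit positions and mask are stand-ins, not the kernel's ARMV8_PMU_* definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout only -- see asm/perf_event.h for the real masks. */
#define PMCR_E             (1u << 0)	/* enable */
#define PMCR_P             (1u << 1)	/* event counter reset, write-only */
#define PMCR_C             (1u << 2)	/* cycle counter reset, write-only */
#define PMCR_WRITABLE_MASK 0xdfu

static uint64_t emulated_pmcr;

static void pmcr_write(uint64_t guest_val)
{
	/* keep the read-only upper bits, take the writable low bits from the guest */
	emulated_pmcr &= ~(uint64_t)PMCR_WRITABLE_MASK;
	emulated_pmcr |= guest_val & PMCR_WRITABLE_MASK;
}

static uint64_t pmcr_read(void)
{
	/* P and C read as zero */
	return emulated_pmcr & ~(uint64_t)(PMCR_P | PMCR_C);
}

int main(void)
{
	pmcr_write(PMCR_E | PMCR_P | PMCR_C);
	printf("PMCR reads back as 0x%llx\n", (unsigned long long)pmcr_read());
	return 0;
}
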
@@ -893,6 +1000,12 @@ static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
893 task_pid_nr(current)); 1000 task_pid_nr(current));
894 1001
895 val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); 1002 val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
1003 } else if (id == SYS_ID_AA64MMFR1_EL1) {
1004 if (val & (0xfUL << ID_AA64MMFR1_LOR_SHIFT))
1005 pr_err_once("kvm [%i]: LORegions unsupported for guests, suppressing\n",
1006 task_pid_nr(current));
1007
1008 val &= ~(0xfUL << ID_AA64MMFR1_LOR_SHIFT);
896 } 1009 }
897 1010
898 return val; 1011 return val;
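
The new ID_AA64MMFR1_EL1 branch mirrors the SVE case directly above it: warn once, then clear the 4-bit field so the guest never sees LORegions advertised. A minimal sketch of that hide-a-feature pattern, with an illustrative shift rather than the real ID_AA64MMFR1_LOR_SHIFT:

#include <stdint.h>
#include <stdio.h>

#define LOR_SHIFT 16	/* illustrative; the kernel uses ID_AA64MMFR1_LOR_SHIFT */

static uint64_t hide_lor(uint64_t idreg)
{
	if (idreg & (0xfULL << LOR_SHIFT))
		fprintf(stderr, "LORegions unsupported for guests, suppressing\n");
	return idreg & ~(0xfULL << LOR_SHIFT);
}

int main(void)
{
	printf("0x%llx\n", (unsigned long long)hide_lor(1ULL << LOR_SHIFT));
	return 0;
}
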
@@ -1178,6 +1291,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1178 { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, 1291 { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
1179 { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, 1292 { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
1180 1293
1294 { SYS_DESC(SYS_LORSA_EL1), trap_undef },
1295 { SYS_DESC(SYS_LOREA_EL1), trap_undef },
1296 { SYS_DESC(SYS_LORN_EL1), trap_undef },
1297 { SYS_DESC(SYS_LORC_EL1), trap_undef },
1298 { SYS_DESC(SYS_LORID_EL1), trap_undef },
1299
1181 { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, 1300 { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
1182 { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, 1301 { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
1183 1302
@@ -1545,6 +1664,11 @@ static const struct sys_reg_desc cp15_regs[] = {
1545 1664
1546 { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, 1665 { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
1547 1666
1667 /* CNTP_TVAL */
1668 { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval },
1669 /* CNTP_CTL */
1670 { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl },
1671
1548 /* PMEVCNTRn */ 1672 /* PMEVCNTRn */
1549 PMU_PMEVCNTR(0), 1673 PMU_PMEVCNTR(0),
1550 PMU_PMEVCNTR(1), 1674 PMU_PMEVCNTR(1),
@@ -1618,6 +1742,7 @@ static const struct sys_reg_desc cp15_64_regs[] = {
1618 { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr }, 1742 { Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr },
1619 { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, 1743 { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
1620 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, 1744 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
1745 { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval },
1621}; 1746};
1622 1747
1623/* Target specific emulation tables */ 1748/* Target specific emulation tables */
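
The cp15 and cp15_64 additions let 32-bit guests reach the EL1 physical timer handlers (access_cntp_tval/ctl/cval) through their coprocessor encodings, keyed on Op1/CRn/CRm/Op2 just like the AArch64 table. A simplified model of how such a descriptor table is matched -- not the kernel's actual sys_reg_desc layout or search, only the idea:

#include <stdbool.h>
#include <stdio.h>

struct trap_params { int Op1, CRn, CRm, Op2; };

struct trap_desc {
	struct trap_params key;
	bool (*handler)(const struct trap_params *p);
};

static bool access_timer_reg(const struct trap_params *p)
{
	(void)p;
	printf("emulating a CNTP_* access\n");
	return true;
}

/* one entry per trapped encoding, like the cp15_regs additions above */
static const struct trap_desc table[] = {
	{ { 0, 14, 2, 0 }, access_timer_reg },	/* CNTP_TVAL */
	{ { 0, 14, 2, 1 }, access_timer_reg },	/* CNTP_CTL  */
};

static bool dispatch(const struct trap_params *p)
{
	for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct trap_params *k = &table[i].key;

		if (k->Op1 == p->Op1 && k->CRn == p->CRn &&
		    k->CRm == p->CRm && k->Op2 == p->Op2)
			return table[i].handler(p);
	}
	return false;	/* the real code would inject an UNDEF */
}

int main(void)
{
	struct trap_params p = { 0, 14, 2, 0 };

	return dispatch(&p) ? 0 : 1;
}
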
@@ -2194,7 +2319,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
2194 if (r->get_user) 2319 if (r->get_user)
2195 return (r->get_user)(vcpu, r, reg, uaddr); 2320 return (r->get_user)(vcpu, r, reg, uaddr);
2196 2321
2197 return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id); 2322 return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id);
2198} 2323}
2199 2324
2200int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) 2325int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
@@ -2215,7 +2340,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
2215 if (r->set_user) 2340 if (r->set_user)
2216 return (r->set_user)(vcpu, r, reg, uaddr); 2341 return (r->set_user)(vcpu, r, reg, uaddr);
2217 2342
2218 return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); 2343 return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
2219} 2344}
2220 2345
2221static unsigned int num_demux_regs(void) 2346static unsigned int num_demux_regs(void)
@@ -2421,6 +2546,6 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
2421 reset_sys_reg_descs(vcpu, table, num); 2546 reset_sys_reg_descs(vcpu, table, num);
2422 2547
2423 for (num = 1; num < NR_SYS_REGS; num++) 2548 for (num = 1; num < NR_SYS_REGS; num++)
2424 if (vcpu_sys_reg(vcpu, num) == 0x4242424242424242) 2549 if (__vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
2425 panic("Didn't reset vcpu_sys_reg(%zi)", num); 2550 panic("Didn't reset __vcpu_sys_reg(%zi)", num);
2426} 2551}
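
The sweep from vcpu_sys_reg() to __vcpu_sys_reg() (and vcpu_read_sys_reg() where a live value is wanted) fits the VHE rework: the double-underscore form is the raw in-memory copy, while the read/write accessors may go to the CPU when the guest's sysreg context is currently loaded. A toy model of that indirection, with invented names and a stubbed hardware read:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_REGS 4

struct vcpu {
	uint64_t sys_regs[NR_REGS];	/* the in-memory copy */
	bool loaded_on_cpu;		/* is the guest context live in hardware? */
};

/* stand-in for reading the real system register on a VHE host */
static uint64_t hw_read(int reg) { return 0xdead0000ull + reg; }

/* raw accessor: always the memory copy (what __vcpu_sys_reg() gives you) */
#define raw_sys_reg(v, r) ((v)->sys_regs[(r)])

/* read accessor: may go to hardware when the context is resident */
static uint64_t vcpu_read_reg(struct vcpu *v, int reg)
{
	if (v->loaded_on_cpu)
		return hw_read(reg);
	return raw_sys_reg(v, reg);
}

int main(void)
{
	struct vcpu v = { .sys_regs = { 1, 2, 3, 4 }, .loaded_on_cpu = false };

	printf("memory:   %llu\n", (unsigned long long)vcpu_read_reg(&v, 2));
	v.loaded_on_cpu = true;
	printf("hardware: %llx\n", (unsigned long long)vcpu_read_reg(&v, 2));
	return 0;
}
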
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 060f5348ef25..cd710f8b63e0 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -89,14 +89,14 @@ static inline void reset_unknown(struct kvm_vcpu *vcpu,
89{ 89{
90 BUG_ON(!r->reg); 90 BUG_ON(!r->reg);
91 BUG_ON(r->reg >= NR_SYS_REGS); 91 BUG_ON(r->reg >= NR_SYS_REGS);
92 vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; 92 __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
93} 93}
94 94
95static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 95static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
96{ 96{
97 BUG_ON(!r->reg); 97 BUG_ON(!r->reg);
98 BUG_ON(r->reg >= NR_SYS_REGS); 98 BUG_ON(r->reg >= NR_SYS_REGS);
99 vcpu_sys_reg(vcpu, r->reg) = r->val; 99 __vcpu_sys_reg(vcpu, r->reg) = r->val;
100} 100}
101 101
102static inline int cmp_sys_reg(const struct sys_reg_desc *i1, 102static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c
index 969ade1d333d..ddb8497d18d6 100644
--- a/arch/arm64/kvm/sys_regs_generic_v8.c
+++ b/arch/arm64/kvm/sys_regs_generic_v8.c
@@ -38,13 +38,13 @@ static bool access_actlr(struct kvm_vcpu *vcpu,
38 if (p->is_write) 38 if (p->is_write)
39 return ignore_write(vcpu, p); 39 return ignore_write(vcpu, p);
40 40
41 p->regval = vcpu_sys_reg(vcpu, ACTLR_EL1); 41 p->regval = vcpu_read_sys_reg(vcpu, ACTLR_EL1);
42 return true; 42 return true;
43} 43}
44 44
45static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 45static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
46{ 46{
47 vcpu_sys_reg(vcpu, ACTLR_EL1) = read_sysreg(actlr_el1); 47 __vcpu_sys_reg(vcpu, ACTLR_EL1) = read_sysreg(actlr_el1);
48} 48}
49 49
50/* 50/*
diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c
new file mode 100644
index 000000000000..c712a7376bc1
--- /dev/null
+++ b/arch/arm64/kvm/va_layout.c
@@ -0,0 +1,227 @@
1/*
2 * Copyright (C) 2017 ARM Ltd.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/kvm_host.h>
19#include <linux/random.h>
20#include <linux/memblock.h>
21#include <asm/alternative.h>
22#include <asm/debug-monitors.h>
23#include <asm/insn.h>
24#include <asm/kvm_mmu.h>
25
26/*
27 * The LSB of the random hyp VA tag or 0 if no randomization is used.
28 */
29static u8 tag_lsb;
30/*
31 * The random hyp VA tag value with the region bit if hyp randomization is used
32 */
33static u64 tag_val;
34static u64 va_mask;
35
36static void compute_layout(void)
37{
38 phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
39 u64 hyp_va_msb;
40 int kva_msb;
41
42 /* Where is my RAM region? */
43 hyp_va_msb = idmap_addr & BIT(VA_BITS - 1);
44 hyp_va_msb ^= BIT(VA_BITS - 1);
45
46 kva_msb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^
47 (u64)(high_memory - 1));
48
49 if (kva_msb == (VA_BITS - 1)) {
50 /*
51 * No space in the address, let's compute the mask so
52 * that it covers (VA_BITS - 1) bits, and the region
53 * bit. The tag stays set to zero.
54 */
55 va_mask = BIT(VA_BITS - 1) - 1;
56 va_mask |= hyp_va_msb;
57 } else {
58 /*
59 * We do have some free bits to insert a random tag.
60 * Hyp VAs are now created from kernel linear map VAs
61 * using the following formula (with V == VA_BITS):
62 *
63 * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0
64 * ---------------------------------------------------------
65 * | 0000000 | hyp_va_msb | random tag | kern linear VA |
66 */
67 tag_lsb = kva_msb;
68 va_mask = GENMASK_ULL(tag_lsb - 1, 0);
69 tag_val = get_random_long() & GENMASK_ULL(VA_BITS - 2, tag_lsb);
70 tag_val |= hyp_va_msb;
71 tag_val >>= tag_lsb;
72 }
73}
74
75static u32 compute_instruction(int n, u32 rd, u32 rn)
76{
77 u32 insn = AARCH64_BREAK_FAULT;
78
79 switch (n) {
80 case 0:
81 insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_AND,
82 AARCH64_INSN_VARIANT_64BIT,
83 rn, rd, va_mask);
84 break;
85
86 case 1:
87 /* ROR is a variant of EXTR with Rm = Rn */
88 insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT,
89 rn, rn, rd,
90 tag_lsb);
91 break;
92
93 case 2:
94 insn = aarch64_insn_gen_add_sub_imm(rd, rn,
95 tag_val & GENMASK(11, 0),
96 AARCH64_INSN_VARIANT_64BIT,
97 AARCH64_INSN_ADSB_ADD);
98 break;
99
100 case 3:
101 insn = aarch64_insn_gen_add_sub_imm(rd, rn,
102 tag_val & GENMASK(23, 12),
103 AARCH64_INSN_VARIANT_64BIT,
104 AARCH64_INSN_ADSB_ADD);
105 break;
106
107 case 4:
108 /* ROR is a variant of EXTR with Rm = Rn */
109 insn = aarch64_insn_gen_extr(AARCH64_INSN_VARIANT_64BIT,
110 rn, rn, rd, 64 - tag_lsb);
111 break;
112 }
113
114 return insn;
115}
116
117void __init kvm_update_va_mask(struct alt_instr *alt,
118 __le32 *origptr, __le32 *updptr, int nr_inst)
119{
120 int i;
121
122 BUG_ON(nr_inst != 5);
123
124 if (!has_vhe() && !va_mask)
125 compute_layout();
126
127 for (i = 0; i < nr_inst; i++) {
128 u32 rd, rn, insn, oinsn;
129
130 /*
131 * VHE doesn't need any address translation, let's NOP
132 * everything.
133 *
134 * Alternatively, if we don't have any spare bits in
135 * the address, NOP everything after masking that
136 * kernel VA.
137 */
138 if (has_vhe() || (!tag_lsb && i > 0)) {
139 updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
140 continue;
141 }
142
143 oinsn = le32_to_cpu(origptr[i]);
144 rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
145 rn = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, oinsn);
146
147 insn = compute_instruction(i, rd, rn);
148 BUG_ON(insn == AARCH64_BREAK_FAULT);
149
150 updptr[i] = cpu_to_le32(insn);
151 }
152}
153
154void *__kvm_bp_vect_base;
155int __kvm_harden_el2_vector_slot;
156
157void kvm_patch_vector_branch(struct alt_instr *alt,
158 __le32 *origptr, __le32 *updptr, int nr_inst)
159{
160 u64 addr;
161 u32 insn;
162
163 BUG_ON(nr_inst != 5);
164
165 if (has_vhe() || !cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
166 WARN_ON_ONCE(cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS));
167 return;
168 }
169
170 if (!va_mask)
171 compute_layout();
172
173 /*
174 * Compute HYP VA by using the same computation as kern_hyp_va()
175 */
176 addr = (uintptr_t)kvm_ksym_ref(__kvm_hyp_vector);
177 addr &= va_mask;
178 addr |= tag_val << tag_lsb;
179
180 /* Use PC[10:7] to branch to the same vector in KVM */
181 addr |= ((u64)origptr & GENMASK_ULL(10, 7));
182
183 /*
184 * Branch to the second instruction in the vectors in order to
185 * avoid the initial store on the stack (which we already
186 * perform in the hardening vectors).
187 */
188 addr += AARCH64_INSN_SIZE;
189
190 /* stp x0, x1, [sp, #-16]! */
191 insn = aarch64_insn_gen_load_store_pair(AARCH64_INSN_REG_0,
192 AARCH64_INSN_REG_1,
193 AARCH64_INSN_REG_SP,
194 -16,
195 AARCH64_INSN_VARIANT_64BIT,
196 AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX);
197 *updptr++ = cpu_to_le32(insn);
198
199 /* movz x0, #(addr & 0xffff) */
200 insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
201 (u16)addr,
202 0,
203 AARCH64_INSN_VARIANT_64BIT,
204 AARCH64_INSN_MOVEWIDE_ZERO);
205 *updptr++ = cpu_to_le32(insn);
206
207 /* movk x0, #((addr >> 16) & 0xffff), lsl #16 */
208 insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
209 (u16)(addr >> 16),
210 16,
211 AARCH64_INSN_VARIANT_64BIT,
212 AARCH64_INSN_MOVEWIDE_KEEP);
213 *updptr++ = cpu_to_le32(insn);
214
215 /* movk x0, #((addr >> 32) & 0xffff), lsl #32 */
216 insn = aarch64_insn_gen_movewide(AARCH64_INSN_REG_0,
217 (u16)(addr >> 32),
218 32,
219 AARCH64_INSN_VARIANT_64BIT,
220 AARCH64_INSN_MOVEWIDE_KEEP);
221 *updptr++ = cpu_to_le32(insn);
222
223 /* br x0 */
224 insn = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_0,
225 AARCH64_INSN_BRANCH_NOLINK);
226 *updptr++ = cpu_to_le32(insn);
227}
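
compute_layout() and compute_instruction() turn a kernel linear-map VA into a hyp VA by masking the top bits and OR-ing in a tag that holds the flipped region bit plus, when spare bits exist, a random value; kvm_patch_vector_branch() then materialises such an address with movz/movk. A user-space model of the net address transform under a fixed VA_BITS, with made-up inputs (it reproduces the effect of the AND/ROR/ADD/ADD/ROR sequence, not the instructions themselves):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define VA_BITS 48	/* illustrative kernel configuration */

static uint64_t va_mask, tag_val;
static unsigned int tag_lsb;

/* model of compute_layout(); kva_msb and region_bit are made-up inputs */
static void compute_layout(unsigned int kva_msb, uint64_t region_bit)
{
	if (kva_msb == VA_BITS - 1) {
		/* no spare bits: keep the low bits plus the region bit, no tag */
		va_mask = (1ull << (VA_BITS - 1)) - 1;
		va_mask |= region_bit;
	} else {
		/* spare bits between kva_msb and VA_BITS-2 carry a random tag */
		tag_lsb = kva_msb;
		va_mask = (1ull << tag_lsb) - 1;
		tag_val = ((uint64_t)rand() << 32) | (uint64_t)rand();
		tag_val &= ((1ull << (VA_BITS - 1)) - 1) & ~va_mask;
		tag_val |= region_bit;
		tag_val >>= tag_lsb;
	}
}

/* net effect of the patched instruction sequence */
static uint64_t kern_hyp_va(uint64_t kva)
{
	return (kva & va_mask) | (tag_val << tag_lsb);
}

int main(void)
{
	compute_layout(40, 1ull << (VA_BITS - 1));
	printf("hyp VA: 0x%016llx\n",
	       (unsigned long long)kern_hyp_va(0xffff800012345678ull));
	return 0;
}
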
diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h
index 60b1aa0b7014..b57e978b0946 100644
--- a/arch/mips/include/asm/kvm_para.h
+++ b/arch/mips/include/asm/kvm_para.h
@@ -94,6 +94,11 @@ static inline unsigned int kvm_arch_para_features(void)
94 return 0; 94 return 0;
95} 95}
96 96
97static inline unsigned int kvm_arch_para_hints(void)
98{
99 return 0;
100}
101
97#ifdef CONFIG_MIPS_PARAVIRT 102#ifdef CONFIG_MIPS_PARAVIRT
98static inline bool kvm_para_available(void) 103static inline bool kvm_para_available(void)
99{ 104{
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index deb54293398c..17498e9a26e4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -60,7 +60,6 @@
60 60
61#define KVM_ARCH_WANT_MMU_NOTIFIER 61#define KVM_ARCH_WANT_MMU_NOTIFIER
62 62
63extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
64extern int kvm_unmap_hva_range(struct kvm *kvm, 63extern int kvm_unmap_hva_range(struct kvm *kvm,
65 unsigned long start, unsigned long end); 64 unsigned long start, unsigned long end);
66extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); 65extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 336a91acb8b1..5ceb4efca65f 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -61,6 +61,11 @@ static inline unsigned int kvm_arch_para_features(void)
61 return r; 61 return r;
62} 62}
63 63
64static inline unsigned int kvm_arch_para_hints(void)
65{
66 return 0;
67}
68
64static inline bool kvm_check_and_clear_guest_paused(void) 69static inline bool kvm_check_and_clear_guest_paused(void)
65{ 70{
66 return false; 71 return false;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b7d066b037da..abe7032cdb54 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -295,7 +295,6 @@ struct kvmppc_ops {
295 const struct kvm_userspace_memory_region *mem, 295 const struct kvm_userspace_memory_region *mem,
296 const struct kvm_memory_slot *old, 296 const struct kvm_memory_slot *old,
297 const struct kvm_memory_slot *new); 297 const struct kvm_memory_slot *new);
298 int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
299 int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, 298 int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
300 unsigned long end); 299 unsigned long end);
301 int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); 300 int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 234531d1bee1..97d4a112648f 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -819,12 +819,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
819 kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); 819 kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
820} 820}
821 821
822int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
823{
824 return kvm->arch.kvm_ops->unmap_hva(kvm, hva);
825}
826EXPORT_SYMBOL_GPL(kvm_unmap_hva);
827
828int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) 822int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
829{ 823{
830 return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); 824 return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index d2b3ec088b8c..4ad5e287b8bc 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -14,7 +14,6 @@
14 14
15extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, 15extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
16 struct kvm_memory_slot *memslot); 16 struct kvm_memory_slot *memslot);
17extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva);
18extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, 17extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
19 unsigned long end); 18 unsigned long end);
20extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, 19extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ef243fed2f2b..a670fa5fbe50 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -877,15 +877,6 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
877 return 0; 877 return 0;
878} 878}
879 879
880int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
881{
882 hva_handler_fn handler;
883
884 handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
885 kvm_handle_hva(kvm, hva, handler);
886 return 0;
887}
888
889int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) 880int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
890{ 881{
891 hva_handler_fn handler; 882 hva_handler_fn handler;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 5d9bafe9a371..a57eafec4dc2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -150,7 +150,9 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
150{ 150{
151 int psize = MMU_BASE_PSIZE; 151 int psize = MMU_BASE_PSIZE;
152 152
153 if (pshift >= PMD_SHIFT) 153 if (pshift >= PUD_SHIFT)
154 psize = MMU_PAGE_1G;
155 else if (pshift >= PMD_SHIFT)
154 psize = MMU_PAGE_2M; 156 psize = MMU_PAGE_2M;
155 addr &= ~0xfffUL; 157 addr &= ~0xfffUL;
156 addr |= mmu_psize_defs[psize].ap << 5; 158 addr |= mmu_psize_defs[psize].ap << 5;
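
With 1GB mappings now possible in the partition-scoped tree, the tlbie helper must derive the actual page size from the PTE shift. The decision is a simple cascade; a trivial stand-alone rendering with the usual radix shift values hard-coded:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PUD_SHIFT  30

enum psize { PSIZE_BASE, PSIZE_2M, PSIZE_1G };

static enum psize tlbie_psize(unsigned int pshift)
{
	if (pshift >= PUD_SHIFT)
		return PSIZE_1G;
	if (pshift >= PMD_SHIFT)
		return PSIZE_2M;
	return PSIZE_BASE;
}

int main(void)
{
	printf("%d %d %d\n", tlbie_psize(PAGE_SHIFT),
	       tlbie_psize(PMD_SHIFT), tlbie_psize(PUD_SHIFT));
	return 0;
}
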
@@ -163,6 +165,17 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
163 asm volatile("ptesync": : :"memory"); 165 asm volatile("ptesync": : :"memory");
164} 166}
165 167
168static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
169{
170 unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */
171
172 asm volatile("ptesync": : :"memory");
173 /* RIC=1 PRS=0 R=1 IS=2 */
174 asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1)
175 : : "r" (rb), "r" (kvm->arch.lpid) : "memory");
176 asm volatile("ptesync": : :"memory");
177}
178
166unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, 179unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
167 unsigned long clr, unsigned long set, 180 unsigned long clr, unsigned long set,
168 unsigned long addr, unsigned int shift) 181 unsigned long addr, unsigned int shift)
@@ -223,9 +236,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
223 new_pud = pud_alloc_one(kvm->mm, gpa); 236 new_pud = pud_alloc_one(kvm->mm, gpa);
224 237
225 pmd = NULL; 238 pmd = NULL;
226 if (pud && pud_present(*pud)) 239 if (pud && pud_present(*pud) && !pud_huge(*pud))
227 pmd = pmd_offset(pud, gpa); 240 pmd = pmd_offset(pud, gpa);
228 else 241 else if (level <= 1)
229 new_pmd = pmd_alloc_one(kvm->mm, gpa); 242 new_pmd = pmd_alloc_one(kvm->mm, gpa);
230 243
231 if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) 244 if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
@@ -246,6 +259,50 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
246 new_pud = NULL; 259 new_pud = NULL;
247 } 260 }
248 pud = pud_offset(pgd, gpa); 261 pud = pud_offset(pgd, gpa);
262 if (pud_huge(*pud)) {
263 unsigned long hgpa = gpa & PUD_MASK;
264
265 /*
266 * If we raced with another CPU which has just put
267 * a 1GB pte in after we saw a pmd page, try again.
268 */
269 if (level <= 1 && !new_pmd) {
270 ret = -EAGAIN;
271 goto out_unlock;
272 }
273 /* Check if we raced and someone else has set the same thing */
274 if (level == 2 && pud_raw(*pud) == pte_raw(pte)) {
275 ret = 0;
276 goto out_unlock;
277 }
278 /* Valid 1GB page here already, remove it */
279 old = kvmppc_radix_update_pte(kvm, (pte_t *)pud,
280 ~0UL, 0, hgpa, PUD_SHIFT);
281 kvmppc_radix_tlbie_page(kvm, hgpa, PUD_SHIFT);
282 if (old & _PAGE_DIRTY) {
283 unsigned long gfn = hgpa >> PAGE_SHIFT;
284 struct kvm_memory_slot *memslot;
285 memslot = gfn_to_memslot(kvm, gfn);
286 if (memslot && memslot->dirty_bitmap)
287 kvmppc_update_dirty_map(memslot,
288 gfn, PUD_SIZE);
289 }
290 }
291 if (level == 2) {
292 if (!pud_none(*pud)) {
293 /*
294 * There's a page table page here, but we wanted to
295 * install a large page, so remove and free the page
296 * table page. new_pmd will be NULL since level == 2.
297 */
298 new_pmd = pmd_offset(pud, 0);
299 pud_clear(pud);
300 kvmppc_radix_flush_pwc(kvm, gpa);
301 }
302 kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
303 ret = 0;
304 goto out_unlock;
305 }
249 if (pud_none(*pud)) { 306 if (pud_none(*pud)) {
250 if (!new_pmd) 307 if (!new_pmd)
251 goto out_unlock; 308 goto out_unlock;
@@ -264,6 +321,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
264 ret = -EAGAIN; 321 ret = -EAGAIN;
265 goto out_unlock; 322 goto out_unlock;
266 } 323 }
324 /* Check if we raced and someone else has set the same thing */
325 if (level == 1 && pmd_raw(*pmd) == pte_raw(pte)) {
326 ret = 0;
327 goto out_unlock;
328 }
267 /* Valid 2MB page here already, remove it */ 329 /* Valid 2MB page here already, remove it */
268 old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), 330 old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
269 ~0UL, 0, lgpa, PMD_SHIFT); 331 ~0UL, 0, lgpa, PMD_SHIFT);
@@ -276,35 +338,43 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
276 kvmppc_update_dirty_map(memslot, 338 kvmppc_update_dirty_map(memslot,
277 gfn, PMD_SIZE); 339 gfn, PMD_SIZE);
278 } 340 }
279 } else if (level == 1 && !pmd_none(*pmd)) {
280 /*
281 * There's a page table page here, but we wanted
282 * to install a large page. Tell the caller and let
283 * it try installing a normal page if it wants.
284 */
285 ret = -EBUSY;
286 goto out_unlock;
287 } 341 }
288 if (level == 0) { 342 if (level == 1) {
289 if (pmd_none(*pmd)) { 343 if (!pmd_none(*pmd)) {
290 if (!new_ptep) 344 /*
291 goto out_unlock; 345 * There's a page table page here, but we wanted to
292 pmd_populate(kvm->mm, pmd, new_ptep); 346 * install a large page, so remove and free the page
293 new_ptep = NULL; 347 * table page. new_ptep will be NULL since level == 1.
294 } 348 */
295 ptep = pte_offset_kernel(pmd, gpa); 349 new_ptep = pte_offset_kernel(pmd, 0);
296 if (pte_present(*ptep)) { 350 pmd_clear(pmd);
297 /* PTE was previously valid, so invalidate it */ 351 kvmppc_radix_flush_pwc(kvm, gpa);
298 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
299 0, gpa, 0);
300 kvmppc_radix_tlbie_page(kvm, gpa, 0);
301 if (old & _PAGE_DIRTY)
302 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
303 } 352 }
304 kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
305 } else {
306 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); 353 kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
354 ret = 0;
355 goto out_unlock;
307 } 356 }
357 if (pmd_none(*pmd)) {
358 if (!new_ptep)
359 goto out_unlock;
360 pmd_populate(kvm->mm, pmd, new_ptep);
361 new_ptep = NULL;
362 }
363 ptep = pte_offset_kernel(pmd, gpa);
364 if (pte_present(*ptep)) {
365 /* Check if someone else set the same thing */
366 if (pte_raw(*ptep) == pte_raw(pte)) {
367 ret = 0;
368 goto out_unlock;
369 }
370 /* PTE was previously valid, so invalidate it */
371 old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
372 0, gpa, 0);
373 kvmppc_radix_tlbie_page(kvm, gpa, 0);
374 if (old & _PAGE_DIRTY)
375 mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
376 }
377 kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
308 ret = 0; 378 ret = 0;
309 379
310 out_unlock: 380 out_unlock:
@@ -325,11 +395,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
325 unsigned long mmu_seq, pte_size; 395 unsigned long mmu_seq, pte_size;
326 unsigned long gpa, gfn, hva, pfn; 396 unsigned long gpa, gfn, hva, pfn;
327 struct kvm_memory_slot *memslot; 397 struct kvm_memory_slot *memslot;
328 struct page *page = NULL, *pages[1]; 398 struct page *page = NULL;
329 long ret, npages, ok; 399 long ret;
330 unsigned int writing; 400 bool writing;
331 struct vm_area_struct *vma; 401 bool upgrade_write = false;
332 unsigned long flags; 402 bool *upgrade_p = &upgrade_write;
333 pte_t pte, *ptep; 403 pte_t pte, *ptep;
334 unsigned long pgflags; 404 unsigned long pgflags;
335 unsigned int shift, level; 405 unsigned int shift, level;
@@ -369,122 +439,131 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
369 dsisr & DSISR_ISSTORE); 439 dsisr & DSISR_ISSTORE);
370 } 440 }
371 441
372 /* used to check for invalidations in progress */
373 mmu_seq = kvm->mmu_notifier_seq;
374 smp_rmb();
375
376 writing = (dsisr & DSISR_ISSTORE) != 0; 442 writing = (dsisr & DSISR_ISSTORE) != 0;
377 hva = gfn_to_hva_memslot(memslot, gfn); 443 if (memslot->flags & KVM_MEM_READONLY) {
444 if (writing) {
445 /* give the guest a DSI */
446 dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
447 kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
448 return RESUME_GUEST;
449 }
450 upgrade_p = NULL;
451 }
452
378 if (dsisr & DSISR_SET_RC) { 453 if (dsisr & DSISR_SET_RC) {
379 /* 454 /*
380 * Need to set an R or C bit in the 2nd-level tables; 455 * Need to set an R or C bit in the 2nd-level tables;
381 * if the relevant bits aren't already set in the linux 456 * since we are just helping out the hardware here,
382 * page tables, fall through to do the gup_fast to 457 * it is sufficient to do what the hardware does.
383 * set them in the linux page tables too.
384 */ 458 */
385 ok = 0;
386 pgflags = _PAGE_ACCESSED; 459 pgflags = _PAGE_ACCESSED;
387 if (writing) 460 if (writing)
388 pgflags |= _PAGE_DIRTY; 461 pgflags |= _PAGE_DIRTY;
389 local_irq_save(flags); 462 /*
390 ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); 463 * We are walking the secondary page table here. We can do this
391 if (ptep) { 464 * without disabling irq.
392 pte = READ_ONCE(*ptep); 465 */
393 if (pte_present(pte) && 466 spin_lock(&kvm->mmu_lock);
394 (pte_val(pte) & pgflags) == pgflags) 467 ptep = __find_linux_pte(kvm->arch.pgtable,
395 ok = 1; 468 gpa, NULL, &shift);
396 } 469 if (ptep && pte_present(*ptep) &&
397 local_irq_restore(flags); 470 (!writing || pte_write(*ptep))) {
398 if (ok) { 471 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
399 spin_lock(&kvm->mmu_lock); 472 gpa, shift);
400 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 473 dsisr &= ~DSISR_SET_RC;
401 spin_unlock(&kvm->mmu_lock);
402 return RESUME_GUEST;
403 }
404 /*
405 * We are walking the secondary page table here. We can do this
406 * without disabling irq.
407 */
408 ptep = __find_linux_pte(kvm->arch.pgtable,
409 gpa, NULL, &shift);
410 if (ptep && pte_present(*ptep)) {
411 kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
412 gpa, shift);
413 spin_unlock(&kvm->mmu_lock);
414 return RESUME_GUEST;
415 }
416 spin_unlock(&kvm->mmu_lock);
417 } 474 }
475 spin_unlock(&kvm->mmu_lock);
476 if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
477 DSISR_PROTFAULT | DSISR_SET_RC)))
478 return RESUME_GUEST;
418 } 479 }
419 480
420 ret = -EFAULT; 481 /* used to check for invalidations in progress */
421 pfn = 0; 482 mmu_seq = kvm->mmu_notifier_seq;
422 pte_size = PAGE_SIZE; 483 smp_rmb();
423 pgflags = _PAGE_READ | _PAGE_EXEC; 484
424 level = 0; 485 /*
425 npages = get_user_pages_fast(hva, 1, writing, pages); 486 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
426 if (npages < 1) { 487 * do it with !atomic && !async, which is how we call it.
427 /* Check if it's an I/O mapping */ 488 * We always ask for write permission since the common case
428 down_read(&current->mm->mmap_sem); 489 * is that the page is writable.
429 vma = find_vma(current->mm, hva); 490 */
430 if (vma && vma->vm_start <= hva && hva < vma->vm_end && 491 hva = gfn_to_hva_memslot(memslot, gfn);
431 (vma->vm_flags & VM_PFNMAP)) { 492 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
432 pfn = vma->vm_pgoff +
433 ((hva - vma->vm_start) >> PAGE_SHIFT);
434 pgflags = pgprot_val(vma->vm_page_prot);
435 }
436 up_read(&current->mm->mmap_sem);
437 if (!pfn)
438 return -EFAULT;
439 } else {
440 page = pages[0];
441 pfn = page_to_pfn(page); 493 pfn = page_to_pfn(page);
442 if (PageCompound(page)) { 494 upgrade_write = true;
443 pte_size <<= compound_order(compound_head(page)); 495 } else {
444 /* See if we can insert a 2MB large-page PTE here */ 496 /* Call KVM generic code to do the slow-path check */
445 if (pte_size >= PMD_SIZE && 497 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
446 (gpa & (PMD_SIZE - PAGE_SIZE)) == 498 writing, upgrade_p);
447 (hva & (PMD_SIZE - PAGE_SIZE))) { 499 if (is_error_noslot_pfn(pfn))
448 level = 1; 500 return -EFAULT;
449 pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); 501 page = NULL;
450 } 502 if (pfn_valid(pfn)) {
503 page = pfn_to_page(pfn);
504 if (PageReserved(page))
505 page = NULL;
451 } 506 }
452 /* See if we can provide write access */ 507 }
453 if (writing) { 508
454 pgflags |= _PAGE_WRITE; 509 /* See if we can insert a 1GB or 2MB large PTE here */
455 } else { 510 level = 0;
456 local_irq_save(flags); 511 if (page && PageCompound(page)) {
457 ptep = find_current_mm_pte(current->mm->pgd, 512 pte_size = PAGE_SIZE << compound_order(compound_head(page));
458 hva, NULL, NULL); 513 if (pte_size >= PUD_SIZE &&
459 if (ptep && pte_write(*ptep)) 514 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
460 pgflags |= _PAGE_WRITE; 515 (hva & (PUD_SIZE - PAGE_SIZE))) {
461 local_irq_restore(flags); 516 level = 2;
517 pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
518 } else if (pte_size >= PMD_SIZE &&
519 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
520 (hva & (PMD_SIZE - PAGE_SIZE))) {
521 level = 1;
522 pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
462 } 523 }
463 } 524 }
464 525
465 /* 526 /*
466 * Compute the PTE value that we need to insert. 527 * Compute the PTE value that we need to insert.
467 */ 528 */
468 pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; 529 if (page) {
469 if (pgflags & _PAGE_WRITE) 530 pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
470 pgflags |= _PAGE_DIRTY; 531 _PAGE_ACCESSED;
471 pte = pfn_pte(pfn, __pgprot(pgflags)); 532 if (writing || upgrade_write)
472 533 pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
473 /* Allocate space in the tree and write the PTE */ 534 pte = pfn_pte(pfn, __pgprot(pgflags));
474 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); 535 } else {
475 if (ret == -EBUSY) {
476 /* 536 /*
477 * There's already a PMD where wanted to install a large page; 537 * Read the PTE from the process' radix tree and use that
478 * for now, fall back to installing a small page. 538 * so we get the attribute bits.
479 */ 539 */
480 level = 0; 540 local_irq_disable();
481 pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); 541 ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
482 pte = pfn_pte(pfn, __pgprot(pgflags)); 542 pte = *ptep;
483 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); 543 local_irq_enable();
544 if (shift == PUD_SHIFT &&
545 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
546 (hva & (PUD_SIZE - PAGE_SIZE))) {
547 level = 2;
548 } else if (shift == PMD_SHIFT &&
549 (gpa & (PMD_SIZE - PAGE_SIZE)) ==
550 (hva & (PMD_SIZE - PAGE_SIZE))) {
551 level = 1;
552 } else if (shift && shift != PAGE_SHIFT) {
553 /* Adjust PFN */
554 unsigned long mask = (1ul << shift) - PAGE_SIZE;
555 pte = __pte(pte_val(pte) | (hva & mask));
556 }
557 if (!(writing || upgrade_write))
558 pte = __pte(pte_val(pte) & ~ _PAGE_WRITE);
559 pte = __pte(pte_val(pte) | _PAGE_EXEC);
484 } 560 }
485 561
562 /* Allocate space in the tree and write the PTE */
563 ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
564
486 if (page) { 565 if (page) {
487 if (!ret && (pgflags & _PAGE_WRITE)) 566 if (!ret && (pte_val(pte) & _PAGE_WRITE))
488 set_page_dirty_lock(page); 567 set_page_dirty_lock(page);
489 put_page(page); 568 put_page(page);
490 } 569 }
@@ -662,6 +741,10 @@ void kvmppc_free_radix(struct kvm *kvm)
662 for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { 741 for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
663 if (!pud_present(*pud)) 742 if (!pud_present(*pud))
664 continue; 743 continue;
744 if (pud_huge(*pud)) {
745 pud_clear(pud);
746 continue;
747 }
665 pmd = pmd_offset(pud, 0); 748 pmd = pmd_offset(pud, 0);
666 for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { 749 for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
667 if (pmd_is_leaf(*pmd)) { 750 if (pmd_is_leaf(*pmd)) {
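
The rewritten fault path picks a 1GB, 2MB or base-page mapping by checking that the backing page is large enough and that the guest physical and host virtual addresses are congruent modulo the candidate size. A compact user-space sketch of that test, with PUD/PMD sizes hard-coded to the usual radix values:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE (1ull << 12)
#define PMD_SIZE  (1ull << 21)
#define PUD_SIZE  (1ull << 30)

/* 0 = base page, 1 = 2MB, 2 = 1GB, mirroring the 'level' variable above */
static int mapping_level(uint64_t pte_size, uint64_t gpa, uint64_t hva)
{
	if (pte_size >= PUD_SIZE &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) == (hva & (PUD_SIZE - PAGE_SIZE)))
		return 2;
	if (pte_size >= PMD_SIZE &&
	    (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE)))
		return 1;
	return 0;
}

int main(void)
{
	/* same offset within a 1GB naturally aligned region -> level 2 */
	printf("%d\n", mapping_level(PUD_SIZE, 0x40000000, 0x7f80000000ull));
	/* only 2MB congruence -> level 1 */
	printf("%d\n", mapping_level(PUD_SIZE, 0x40200000, 0x7f80000000ull));
	return 0;
}
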
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index c32e9bfe75b1..6651f736a0b1 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -450,7 +450,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
450 450
451 /* 451 /*
452 * Synchronize with the MMU notifier callbacks in 452 * Synchronize with the MMU notifier callbacks in
453 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). 453 * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.).
454 * While we have the rmap lock, code running on other CPUs 454 * While we have the rmap lock, code running on other CPUs
455 * cannot finish unmapping the host real page that backs 455 * cannot finish unmapping the host real page that backs
456 * this guest real page, so we are OK to access the host 456 * this guest real page, so we are OK to access the host
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 81e2ea882d97..4d07fca5121c 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4375,7 +4375,6 @@ static struct kvmppc_ops kvm_ops_hv = {
4375 .flush_memslot = kvmppc_core_flush_memslot_hv, 4375 .flush_memslot = kvmppc_core_flush_memslot_hv,
4376 .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, 4376 .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
4377 .commit_memory_region = kvmppc_core_commit_memory_region_hv, 4377 .commit_memory_region = kvmppc_core_commit_memory_region_hv,
4378 .unmap_hva = kvm_unmap_hva_hv,
4379 .unmap_hva_range = kvm_unmap_hva_range_hv, 4378 .unmap_hva_range = kvm_unmap_hva_range_hv,
4380 .age_hva = kvm_age_hva_hv, 4379 .age_hva = kvm_age_hva_hv,
4381 .test_age_hva = kvm_test_age_hva_hv, 4380 .test_age_hva = kvm_test_age_hva_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 3ae752314b34..d3f304d06adf 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -277,15 +277,6 @@ static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
277 } 277 }
278} 278}
279 279
280static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva)
281{
282 trace_kvm_unmap_hva(hva);
283
284 do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
285
286 return 0;
287}
288
289static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, 280static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
290 unsigned long end) 281 unsigned long end)
291{ 282{
@@ -1773,7 +1764,6 @@ static struct kvmppc_ops kvm_ops_pr = {
1773 .flush_memslot = kvmppc_core_flush_memslot_pr, 1764 .flush_memslot = kvmppc_core_flush_memslot_pr,
1774 .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, 1765 .prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
1775 .commit_memory_region = kvmppc_core_commit_memory_region_pr, 1766 .commit_memory_region = kvmppc_core_commit_memory_region_pr,
1776 .unmap_hva = kvm_unmap_hva_pr,
1777 .unmap_hva_range = kvm_unmap_hva_range_pr, 1767 .unmap_hva_range = kvm_unmap_hva_range_pr,
1778 .age_hva = kvm_age_hva_pr, 1768 .age_hva = kvm_age_hva_pr,
1779 .test_age_hva = kvm_test_age_hva_pr, 1769 .test_age_hva = kvm_test_age_hva_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 423b21393bc9..c878b4ffb86f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -724,7 +724,7 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
724 724
725/************* MMU Notifiers *************/ 725/************* MMU Notifiers *************/
726 726
727int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 727static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
728{ 728{
729 trace_kvm_unmap_hva(hva); 729 trace_kvm_unmap_hva(hva);
730 730
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
index 85785a370c0e..2f9a8829552b 100644
--- a/arch/powerpc/kvm/trace_pr.h
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -254,21 +254,6 @@ TRACE_EVENT(kvm_exit,
254 ) 254 )
255); 255);
256 256
257TRACE_EVENT(kvm_unmap_hva,
258 TP_PROTO(unsigned long hva),
259 TP_ARGS(hva),
260
261 TP_STRUCT__entry(
262 __field( unsigned long, hva )
263 ),
264
265 TP_fast_assign(
266 __entry->hva = hva;
267 ),
268
269 TP_printk("unmap hva 0x%lx\n", __entry->hva)
270);
271
272#endif /* _TRACE_KVM_H */ 257#endif /* _TRACE_KVM_H */
273 258
274/* This part must be outside protection */ 259/* This part must be outside protection */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index afb0f08b8021..81cdb6b55118 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -294,6 +294,7 @@ struct kvm_vcpu_stat {
294 u64 exit_userspace; 294 u64 exit_userspace;
295 u64 exit_null; 295 u64 exit_null;
296 u64 exit_external_request; 296 u64 exit_external_request;
297 u64 exit_io_request;
297 u64 exit_external_interrupt; 298 u64 exit_external_interrupt;
298 u64 exit_stop_request; 299 u64 exit_stop_request;
299 u64 exit_validity; 300 u64 exit_validity;
@@ -310,16 +311,29 @@ struct kvm_vcpu_stat {
310 u64 exit_program_interruption; 311 u64 exit_program_interruption;
311 u64 exit_instr_and_program; 312 u64 exit_instr_and_program;
312 u64 exit_operation_exception; 313 u64 exit_operation_exception;
314 u64 deliver_ckc;
315 u64 deliver_cputm;
313 u64 deliver_external_call; 316 u64 deliver_external_call;
314 u64 deliver_emergency_signal; 317 u64 deliver_emergency_signal;
315 u64 deliver_service_signal; 318 u64 deliver_service_signal;
316 u64 deliver_virtio_interrupt; 319 u64 deliver_virtio;
317 u64 deliver_stop_signal; 320 u64 deliver_stop_signal;
318 u64 deliver_prefix_signal; 321 u64 deliver_prefix_signal;
319 u64 deliver_restart_signal; 322 u64 deliver_restart_signal;
320 u64 deliver_program_int; 323 u64 deliver_program;
321 u64 deliver_io_int; 324 u64 deliver_io;
325 u64 deliver_machine_check;
322 u64 exit_wait_state; 326 u64 exit_wait_state;
327 u64 inject_ckc;
328 u64 inject_cputm;
329 u64 inject_external_call;
330 u64 inject_emergency_signal;
331 u64 inject_mchk;
332 u64 inject_pfault_init;
333 u64 inject_program;
334 u64 inject_restart;
335 u64 inject_set_prefix;
336 u64 inject_stop_signal;
323 u64 instruction_epsw; 337 u64 instruction_epsw;
324 u64 instruction_gs; 338 u64 instruction_gs;
325 u64 instruction_io_other; 339 u64 instruction_io_other;
@@ -644,7 +658,12 @@ struct kvm_vcpu_arch {
644}; 658};
645 659
646struct kvm_vm_stat { 660struct kvm_vm_stat {
647 ulong remote_tlb_flush; 661 u64 inject_io;
662 u64 inject_float_mchk;
663 u64 inject_pfault_done;
664 u64 inject_service_signal;
665 u64 inject_virtio;
666 u64 remote_tlb_flush;
648}; 667};
649 668
650struct kvm_arch_memory_slot { 669struct kvm_arch_memory_slot {
@@ -792,6 +811,7 @@ struct kvm_arch{
792 int css_support; 811 int css_support;
793 int use_irqchip; 812 int use_irqchip;
794 int use_cmma; 813 int use_cmma;
814 int use_pfmfi;
795 int user_cpu_state_ctrl; 815 int user_cpu_state_ctrl;
796 int user_sigp; 816 int user_sigp;
797 int user_stsi; 817 int user_stsi;
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 74eeec9c0a80..cbc7c3a68e4d 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -193,6 +193,11 @@ static inline unsigned int kvm_arch_para_features(void)
193 return 0; 193 return 0;
194} 194}
195 195
196static inline unsigned int kvm_arch_para_hints(void)
197{
198 return 0;
199}
200
196static inline bool kvm_check_and_clear_guest_paused(void) 201static inline bool kvm_check_and_clear_guest_paused(void)
197{ 202{
198 return false; 203 return false;
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index db35c41a59d5..c639c95850e4 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -22,8 +22,8 @@ typedef struct {
22 unsigned int has_pgste:1; 22 unsigned int has_pgste:1;
23 /* The mmu context uses storage keys. */ 23 /* The mmu context uses storage keys. */
24 unsigned int use_skey:1; 24 unsigned int use_skey:1;
25 /* The mmu context uses CMMA. */ 25 /* The mmu context uses CMM. */
26 unsigned int use_cmma:1; 26 unsigned int uses_cmm:1;
27} mm_context_t; 27} mm_context_t;
28 28
29#define INIT_MM_CONTEXT(name) \ 29#define INIT_MM_CONTEXT(name) \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6c8ce15cde7b..324f6f452982 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -31,7 +31,7 @@ static inline int init_new_context(struct task_struct *tsk,
31 (current->mm && current->mm->context.alloc_pgste); 31 (current->mm && current->mm->context.alloc_pgste);
32 mm->context.has_pgste = 0; 32 mm->context.has_pgste = 0;
33 mm->context.use_skey = 0; 33 mm->context.use_skey = 0;
34 mm->context.use_cmma = 0; 34 mm->context.uses_cmm = 0;
35#endif 35#endif
36 switch (mm->context.asce_limit) { 36 switch (mm->context.asce_limit) {
37 case _REGION2_SIZE: 37 case _REGION2_SIZE:
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index c24bfa72baf7..8e2b8647ee12 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1050,8 +1050,7 @@ shadow_r2t:
1050 rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); 1050 rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
1051 if (rc) 1051 if (rc)
1052 return rc; 1052 return rc;
1053 /* fallthrough */ 1053 } /* fallthrough */
1054 }
1055 case ASCE_TYPE_REGION2: { 1054 case ASCE_TYPE_REGION2: {
1056 union region2_table_entry rste; 1055 union region2_table_entry rste;
1057 1056
@@ -1077,8 +1076,7 @@ shadow_r3t:
1077 rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); 1076 rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
1078 if (rc) 1077 if (rc)
1079 return rc; 1078 return rc;
1080 /* fallthrough */ 1079 } /* fallthrough */
1081 }
1082 case ASCE_TYPE_REGION3: { 1080 case ASCE_TYPE_REGION3: {
1083 union region3_table_entry rtte; 1081 union region3_table_entry rtte;
1084 1082
@@ -1113,8 +1111,7 @@ shadow_sgt:
1113 rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); 1111 rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
1114 if (rc) 1112 if (rc)
1115 return rc; 1113 return rc;
1116 /* fallthrough */ 1114 } /* fallthrough */
1117 }
1118 case ASCE_TYPE_SEGMENT: { 1115 case ASCE_TYPE_SEGMENT: {
1119 union segment_table_entry ste; 1116 union segment_table_entry ste;
1120 1117
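
The gaccess.c hunks only move the closing brace so that the fall-through comment directly precedes the next case label, which is where GCC's -Wimplicit-fallthrough reliably recognises it. A minimal illustration of the placement (the warning mechanics, not the KVM logic):

#include <stdio.h>

static int classify(int type)
{
	int level = 0;

	switch (type) {
	case 2: {
		int extra = 1;	/* some work inside a braced case block */

		level += extra;
		/*
		 * A comment left here, inside the braces, is not treated as
		 * a fall-through marker by -Wimplicit-fallthrough.
		 */
	} /* fallthrough */
	case 1:
		level++;
		break;
	default:
		break;
	}
	return level;
}

int main(void)
{
	printf("%d %d\n", classify(2), classify(1));
	return 0;
}
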
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 07c6e81163bf..a389fa85cca2 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -50,18 +50,6 @@ u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
50 return ilen; 50 return ilen;
51} 51}
52 52
53static int handle_noop(struct kvm_vcpu *vcpu)
54{
55 switch (vcpu->arch.sie_block->icptcode) {
56 case 0x10:
57 vcpu->stat.exit_external_request++;
58 break;
59 default:
60 break; /* nothing */
61 }
62 return 0;
63}
64
65static int handle_stop(struct kvm_vcpu *vcpu) 53static int handle_stop(struct kvm_vcpu *vcpu)
66{ 54{
67 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 55 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -465,8 +453,11 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
465 453
466 switch (vcpu->arch.sie_block->icptcode) { 454 switch (vcpu->arch.sie_block->icptcode) {
467 case ICPT_EXTREQ: 455 case ICPT_EXTREQ:
456 vcpu->stat.exit_external_request++;
457 return 0;
468 case ICPT_IOREQ: 458 case ICPT_IOREQ:
469 return handle_noop(vcpu); 459 vcpu->stat.exit_io_request++;
460 return 0;
470 case ICPT_INST: 461 case ICPT_INST:
471 rc = handle_instruction(vcpu); 462 rc = handle_instruction(vcpu);
472 break; 463 break;
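
Dropping handle_noop() folds the two count-only intercepts into the main switch: bump the per-reason counter, return 0. A toy, self-contained version of that accounting pattern, reusing the field names from the hunk above outside their kernel context:

#include <stdio.h>

enum icpt { ICPT_EXTREQ, ICPT_IOREQ, ICPT_INST };

struct vcpu_stat { unsigned long exit_external_request, exit_io_request; };

static int handle_intercept(struct vcpu_stat *stat, enum icpt code)
{
	switch (code) {
	case ICPT_EXTREQ:
		stat->exit_external_request++;
		return 0;	/* nothing to emulate, just account it */
	case ICPT_IOREQ:
		stat->exit_io_request++;
		return 0;
	default:
		return -1;	/* would fall through to instruction handling */
	}
}

int main(void)
{
	struct vcpu_stat s = { 0, 0 };

	handle_intercept(&s, ICPT_EXTREQ);
	handle_intercept(&s, ICPT_IOREQ);
	printf("%lu %lu\n", s.exit_external_request, s.exit_io_request);
	return 0;
}
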
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b04616b57a94..37d06e022238 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -391,6 +391,7 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
391 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 391 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
392 int rc; 392 int rc;
393 393
394 vcpu->stat.deliver_cputm++;
394 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 395 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
395 0, 0); 396 0, 0);
396 397
@@ -410,6 +411,7 @@ static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu)
410 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 411 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
411 int rc; 412 int rc;
412 413
414 vcpu->stat.deliver_ckc++;
413 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 415 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
414 0, 0); 416 0, 0);
415 417
@@ -595,6 +597,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
595 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, 597 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
596 KVM_S390_MCHK, 598 KVM_S390_MCHK,
597 mchk.cr14, mchk.mcic); 599 mchk.cr14, mchk.mcic);
600 vcpu->stat.deliver_machine_check++;
598 rc = __write_machine_check(vcpu, &mchk); 601 rc = __write_machine_check(vcpu, &mchk);
599 } 602 }
600 return rc; 603 return rc;
@@ -710,7 +713,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
710 ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK; 713 ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
711 VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d", 714 VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
712 pgm_info.code, ilen); 715 pgm_info.code, ilen);
713 vcpu->stat.deliver_program_int++; 716 vcpu->stat.deliver_program++;
714 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, 717 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
715 pgm_info.code, 0); 718 pgm_info.code, 0);
716 719
@@ -899,7 +902,7 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu)
899 VCPU_EVENT(vcpu, 4, 902 VCPU_EVENT(vcpu, 4,
900 "deliver: virtio parm: 0x%x,parm64: 0x%llx", 903 "deliver: virtio parm: 0x%x,parm64: 0x%llx",
901 inti->ext.ext_params, inti->ext.ext_params2); 904 inti->ext.ext_params, inti->ext.ext_params2);
902 vcpu->stat.deliver_virtio_interrupt++; 905 vcpu->stat.deliver_virtio++;
903 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, 906 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
904 inti->type, 907 inti->type,
905 inti->ext.ext_params, 908 inti->ext.ext_params,
@@ -975,7 +978,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
975 inti->io.subchannel_id >> 1 & 0x3, 978 inti->io.subchannel_id >> 1 & 0x3,
976 inti->io.subchannel_nr); 979 inti->io.subchannel_nr);
977 980
978 vcpu->stat.deliver_io_int++; 981 vcpu->stat.deliver_io++;
979 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, 982 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
980 inti->type, 983 inti->type,
981 ((__u32)inti->io.subchannel_id << 16) | 984 ((__u32)inti->io.subchannel_id << 16) |
@@ -1004,7 +1007,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
1004 VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc); 1007 VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc);
1005 memset(&io, 0, sizeof(io)); 1008 memset(&io, 0, sizeof(io));
1006 io.io_int_word = isc_to_int_word(isc); 1009 io.io_int_word = isc_to_int_word(isc);
1007 vcpu->stat.deliver_io_int++; 1010 vcpu->stat.deliver_io++;
1008 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, 1011 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
1009 KVM_S390_INT_IO(1, 0, 0, 0), 1012 KVM_S390_INT_IO(1, 0, 0, 0),
1010 ((__u32)io.subchannel_id << 16) | 1013 ((__u32)io.subchannel_id << 16) |
@@ -1268,6 +1271,7 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1268{ 1271{
1269 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1272 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1270 1273
1274 vcpu->stat.inject_program++;
1271 VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code); 1275 VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code);
1272 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, 1276 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
1273 irq->u.pgm.code, 0); 1277 irq->u.pgm.code, 0);
@@ -1309,6 +1313,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1309{ 1313{
1310 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1314 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1311 1315
1316 vcpu->stat.inject_pfault_init++;
1312 VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx", 1317 VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx",
1313 irq->u.ext.ext_params2); 1318 irq->u.ext.ext_params2);
1314 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, 1319 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT,
@@ -1327,6 +1332,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1327 struct kvm_s390_extcall_info *extcall = &li->irq.extcall; 1332 struct kvm_s390_extcall_info *extcall = &li->irq.extcall;
1328 uint16_t src_id = irq->u.extcall.code; 1333 uint16_t src_id = irq->u.extcall.code;
1329 1334
1335 vcpu->stat.inject_external_call++;
1330 VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u", 1336 VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u",
1331 src_id); 1337 src_id);
1332 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, 1338 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL,
@@ -1351,6 +1357,7 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1351 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1357 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1352 struct kvm_s390_prefix_info *prefix = &li->irq.prefix; 1358 struct kvm_s390_prefix_info *prefix = &li->irq.prefix;
1353 1359
1360 vcpu->stat.inject_set_prefix++;
1354 VCPU_EVENT(vcpu, 3, "inject: set prefix to %x", 1361 VCPU_EVENT(vcpu, 3, "inject: set prefix to %x",
1355 irq->u.prefix.address); 1362 irq->u.prefix.address);
1356 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, 1363 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX,
@@ -1371,6 +1378,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1371 struct kvm_s390_stop_info *stop = &li->irq.stop; 1378 struct kvm_s390_stop_info *stop = &li->irq.stop;
1372 int rc = 0; 1379 int rc = 0;
1373 1380
1381 vcpu->stat.inject_stop_signal++;
1374 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0); 1382 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0);
1375 1383
1376 if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS) 1384 if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS)
@@ -1395,6 +1403,7 @@ static int __inject_sigp_restart(struct kvm_vcpu *vcpu,
1395{ 1403{
1396 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1404 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1397 1405
1406 vcpu->stat.inject_restart++;
1398 VCPU_EVENT(vcpu, 3, "%s", "inject: restart int"); 1407 VCPU_EVENT(vcpu, 3, "%s", "inject: restart int");
1399 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); 1408 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0);
1400 1409
@@ -1407,6 +1416,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
1407{ 1416{
1408 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1417 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1409 1418
1419 vcpu->stat.inject_emergency_signal++;
1410 VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u", 1420 VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u",
1411 irq->u.emerg.code); 1421 irq->u.emerg.code);
1412 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, 1422 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY,
@@ -1427,6 +1437,7 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
1427 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1437 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1428 struct kvm_s390_mchk_info *mchk = &li->irq.mchk; 1438 struct kvm_s390_mchk_info *mchk = &li->irq.mchk;
1429 1439
1440 vcpu->stat.inject_mchk++;
1430 VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx", 1441 VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx",
1431 irq->u.mchk.mcic); 1442 irq->u.mchk.mcic);
1432 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, 1443 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0,
@@ -1457,6 +1468,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu)
1457{ 1468{
1458 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1469 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1459 1470
1471 vcpu->stat.inject_ckc++;
1460 VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external"); 1472 VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external");
1461 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 1473 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP,
1462 0, 0); 1474 0, 0);
@@ -1470,6 +1482,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu)
1470{ 1482{
1471 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 1483 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
1472 1484
1485 vcpu->stat.inject_cputm++;
1473 VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external"); 1486 VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external");
1474 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 1487 trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER,
1475 0, 0); 1488 0, 0);
@@ -1596,6 +1609,7 @@ static int __inject_service(struct kvm *kvm,
1596{ 1609{
1597 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; 1610 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1598 1611
1612 kvm->stat.inject_service_signal++;
1599 spin_lock(&fi->lock); 1613 spin_lock(&fi->lock);
1600 fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; 1614 fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING;
1601 /* 1615 /*
@@ -1621,6 +1635,7 @@ static int __inject_virtio(struct kvm *kvm,
1621{ 1635{
1622 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; 1636 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1623 1637
1638 kvm->stat.inject_virtio++;
1624 spin_lock(&fi->lock); 1639 spin_lock(&fi->lock);
1625 if (fi->counters[FIRQ_CNTR_VIRTIO] >= KVM_S390_MAX_VIRTIO_IRQS) { 1640 if (fi->counters[FIRQ_CNTR_VIRTIO] >= KVM_S390_MAX_VIRTIO_IRQS) {
1626 spin_unlock(&fi->lock); 1641 spin_unlock(&fi->lock);
@@ -1638,6 +1653,7 @@ static int __inject_pfault_done(struct kvm *kvm,
1638{ 1653{
1639 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; 1654 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1640 1655
1656 kvm->stat.inject_pfault_done++;
1641 spin_lock(&fi->lock); 1657 spin_lock(&fi->lock);
1642 if (fi->counters[FIRQ_CNTR_PFAULT] >= 1658 if (fi->counters[FIRQ_CNTR_PFAULT] >=
1643 (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)) { 1659 (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)) {
@@ -1657,6 +1673,7 @@ static int __inject_float_mchk(struct kvm *kvm,
1657{ 1673{
1658 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; 1674 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1659 1675
1676 kvm->stat.inject_float_mchk++;
1660 spin_lock(&fi->lock); 1677 spin_lock(&fi->lock);
1661 fi->mchk.cr14 |= inti->mchk.cr14 & (1UL << CR_PENDING_SUBCLASS); 1678 fi->mchk.cr14 |= inti->mchk.cr14 & (1UL << CR_PENDING_SUBCLASS);
1662 fi->mchk.mcic |= inti->mchk.mcic; 1679 fi->mchk.mcic |= inti->mchk.mcic;
@@ -1672,6 +1689,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1672 struct list_head *list; 1689 struct list_head *list;
1673 int isc; 1690 int isc;
1674 1691
1692 kvm->stat.inject_io++;
1675 isc = int_word_to_isc(inti->io.io_int_word); 1693 isc = int_word_to_isc(inti->io.io_int_word);
1676 1694
1677 if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { 1695 if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) {
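The hunks above thread new inject_* counters through every injection path; the debugfs table in kvm-s390.c further down exposes them through the offsetof()-based VCPU_STAT()/VM_STAT() macros. A minimal standalone C sketch of that pattern, with hypothetical struct and field names (illustrative only, not kernel code):

/*
 * Standalone illustration: a stat table that records each counter as an
 * offset into a stats struct, the same trick VCPU_STAT()/VM_STAT() use.
 */
#include <stddef.h>
#include <stdio.h>

struct vcpu_stat {
	unsigned long inject_ckc;
	unsigned long inject_set_prefix;
	unsigned long inject_stop_signal;
};

struct stat_item {
	const char *name;
	size_t offset;		/* offsetof() into struct vcpu_stat */
};

static const struct stat_item items[] = {
	{ "inject_ckc",         offsetof(struct vcpu_stat, inject_ckc) },
	{ "inject_set_prefix",  offsetof(struct vcpu_stat, inject_set_prefix) },
	{ "inject_stop_signal", offsetof(struct vcpu_stat, inject_stop_signal) },
};

int main(void)
{
	struct vcpu_stat stat = { 0 };
	size_t i;

	/* Injection paths bump their counter, as the hunks above do. */
	stat.inject_set_prefix++;
	stat.inject_stop_signal += 2;

	/* Generic code walks the table and reads each counter by offset. */
	for (i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
		unsigned long *p =
			(unsigned long *)((char *)&stat + items[i].offset);
		printf("%-20s %lu\n", items[i].name, *p);
	}
	return 0;
}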
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 339ac0964590..64c986243018 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -57,6 +57,7 @@
57 (KVM_MAX_VCPUS + LOCAL_IRQS)) 57 (KVM_MAX_VCPUS + LOCAL_IRQS))
58 58
59#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 59#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
60#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
60 61
61struct kvm_stats_debugfs_item debugfs_entries[] = { 62struct kvm_stats_debugfs_item debugfs_entries[] = {
62 { "userspace_handled", VCPU_STAT(exit_userspace) }, 63 { "userspace_handled", VCPU_STAT(exit_userspace) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
64 { "exit_validity", VCPU_STAT(exit_validity) }, 65 { "exit_validity", VCPU_STAT(exit_validity) },
65 { "exit_stop_request", VCPU_STAT(exit_stop_request) }, 66 { "exit_stop_request", VCPU_STAT(exit_stop_request) },
66 { "exit_external_request", VCPU_STAT(exit_external_request) }, 67 { "exit_external_request", VCPU_STAT(exit_external_request) },
68 { "exit_io_request", VCPU_STAT(exit_io_request) },
67 { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, 69 { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) },
68 { "exit_instruction", VCPU_STAT(exit_instruction) }, 70 { "exit_instruction", VCPU_STAT(exit_instruction) },
69 { "exit_pei", VCPU_STAT(exit_pei) }, 71 { "exit_pei", VCPU_STAT(exit_pei) },
@@ -78,16 +80,34 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
78 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 80 { "instruction_lctl", VCPU_STAT(instruction_lctl) },
79 { "instruction_stctl", VCPU_STAT(instruction_stctl) }, 81 { "instruction_stctl", VCPU_STAT(instruction_stctl) },
80 { "instruction_stctg", VCPU_STAT(instruction_stctg) }, 82 { "instruction_stctg", VCPU_STAT(instruction_stctg) },
83 { "deliver_ckc", VCPU_STAT(deliver_ckc) },
84 { "deliver_cputm", VCPU_STAT(deliver_cputm) },
81 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, 85 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
82 { "deliver_external_call", VCPU_STAT(deliver_external_call) }, 86 { "deliver_external_call", VCPU_STAT(deliver_external_call) },
83 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, 87 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
84 { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) }, 88 { "deliver_virtio", VCPU_STAT(deliver_virtio) },
85 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, 89 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
86 { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, 90 { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) },
87 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, 91 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
88 { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, 92 { "deliver_program", VCPU_STAT(deliver_program) },
89 { "deliver_io_interrupt", VCPU_STAT(deliver_io_int) }, 93 { "deliver_io", VCPU_STAT(deliver_io) },
94 { "deliver_machine_check", VCPU_STAT(deliver_machine_check) },
90 { "exit_wait_state", VCPU_STAT(exit_wait_state) }, 95 { "exit_wait_state", VCPU_STAT(exit_wait_state) },
96 { "inject_ckc", VCPU_STAT(inject_ckc) },
97 { "inject_cputm", VCPU_STAT(inject_cputm) },
98 { "inject_external_call", VCPU_STAT(inject_external_call) },
99 { "inject_float_mchk", VM_STAT(inject_float_mchk) },
100 { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) },
101 { "inject_io", VM_STAT(inject_io) },
102 { "inject_mchk", VCPU_STAT(inject_mchk) },
103 { "inject_pfault_done", VM_STAT(inject_pfault_done) },
104 { "inject_program", VCPU_STAT(inject_program) },
105 { "inject_restart", VCPU_STAT(inject_restart) },
106 { "inject_service_signal", VM_STAT(inject_service_signal) },
107 { "inject_set_prefix", VCPU_STAT(inject_set_prefix) },
108 { "inject_stop_signal", VCPU_STAT(inject_stop_signal) },
109 { "inject_pfault_init", VCPU_STAT(inject_pfault_init) },
110 { "inject_virtio", VM_STAT(inject_virtio) },
91 { "instruction_epsw", VCPU_STAT(instruction_epsw) }, 111 { "instruction_epsw", VCPU_STAT(instruction_epsw) },
92 { "instruction_gs", VCPU_STAT(instruction_gs) }, 112 { "instruction_gs", VCPU_STAT(instruction_gs) },
93 { "instruction_io_other", VCPU_STAT(instruction_io_other) }, 113 { "instruction_io_other", VCPU_STAT(instruction_io_other) },
@@ -152,13 +172,33 @@ static int nested;
152module_param(nested, int, S_IRUGO); 172module_param(nested, int, S_IRUGO);
153MODULE_PARM_DESC(nested, "Nested virtualization support"); 173MODULE_PARM_DESC(nested, "Nested virtualization support");
154 174
155/* upper facilities limit for kvm */
156unsigned long kvm_s390_fac_list_mask[16] = { FACILITIES_KVM };
157 175
158unsigned long kvm_s390_fac_list_mask_size(void) 176/*
177 * For now we handle at most 16 double words as this is what the s390 base
178 * kernel handles and stores in the prefix page. If we ever need to go beyond
179 * this, this requires changes to code, but the external uapi can stay.
180 */
181#define SIZE_INTERNAL 16
182
183/*
184 * Base feature mask that defines default mask for facilities. Consists of the
185 * defines in FACILITIES_KVM and the non-hypervisor managed bits.
186 */
187static unsigned long kvm_s390_fac_base[SIZE_INTERNAL] = { FACILITIES_KVM };
188/*
189 * Extended feature mask. Consists of the defines in FACILITIES_KVM_CPUMODEL
190 * and defines the facilities that can be enabled via a cpu model.
191 */
192static unsigned long kvm_s390_fac_ext[SIZE_INTERNAL] = { FACILITIES_KVM_CPUMODEL };
193
194static unsigned long kvm_s390_fac_size(void)
159{ 195{
160 BUILD_BUG_ON(ARRAY_SIZE(kvm_s390_fac_list_mask) > S390_ARCH_FAC_MASK_SIZE_U64); 196 BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64);
161 return ARRAY_SIZE(kvm_s390_fac_list_mask); 197 BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64);
198 BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) >
199 sizeof(S390_lowcore.stfle_fac_list));
200
201 return SIZE_INTERNAL;
162} 202}
163 203
164/* available cpu features supported by kvm */ 204/* available cpu features supported by kvm */
@@ -679,6 +719,8 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
679 mutex_lock(&kvm->lock); 719 mutex_lock(&kvm->lock);
680 if (!kvm->created_vcpus) { 720 if (!kvm->created_vcpus) {
681 kvm->arch.use_cmma = 1; 721 kvm->arch.use_cmma = 1;
722 /* Not compatible with cmma. */
723 kvm->arch.use_pfmfi = 0;
682 ret = 0; 724 ret = 0;
683 } 725 }
684 mutex_unlock(&kvm->lock); 726 mutex_unlock(&kvm->lock);
@@ -1583,7 +1625,7 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
1583 return -EINVAL; 1625 return -EINVAL;
1584 /* CMMA is disabled or was not used, or the buffer has length zero */ 1626 /* CMMA is disabled or was not used, or the buffer has length zero */
1585 bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); 1627 bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
1586 if (!bufsize || !kvm->mm->context.use_cmma) { 1628 if (!bufsize || !kvm->mm->context.uses_cmm) {
1587 memset(args, 0, sizeof(*args)); 1629 memset(args, 0, sizeof(*args));
1588 return 0; 1630 return 0;
1589 } 1631 }
@@ -1660,7 +1702,7 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
1660/* 1702/*
1661 * This function sets the CMMA attributes for the given pages. If the input 1703 * This function sets the CMMA attributes for the given pages. If the input
1662 * buffer has zero length, no action is taken, otherwise the attributes are 1704 * buffer has zero length, no action is taken, otherwise the attributes are
1663 * set and the mm->context.use_cmma flag is set. 1705 * set and the mm->context.uses_cmm flag is set.
1664 */ 1706 */
1665static int kvm_s390_set_cmma_bits(struct kvm *kvm, 1707static int kvm_s390_set_cmma_bits(struct kvm *kvm,
1666 const struct kvm_s390_cmma_log *args) 1708 const struct kvm_s390_cmma_log *args)
@@ -1710,9 +1752,9 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
1710 srcu_read_unlock(&kvm->srcu, srcu_idx); 1752 srcu_read_unlock(&kvm->srcu, srcu_idx);
1711 up_read(&kvm->mm->mmap_sem); 1753 up_read(&kvm->mm->mmap_sem);
1712 1754
1713 if (!kvm->mm->context.use_cmma) { 1755 if (!kvm->mm->context.uses_cmm) {
1714 down_write(&kvm->mm->mmap_sem); 1756 down_write(&kvm->mm->mmap_sem);
1715 kvm->mm->context.use_cmma = 1; 1757 kvm->mm->context.uses_cmm = 1;
1716 up_write(&kvm->mm->mmap_sem); 1758 up_write(&kvm->mm->mmap_sem);
1717 } 1759 }
1718out: 1760out:
@@ -1967,20 +2009,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
1967 if (!kvm->arch.sie_page2) 2009 if (!kvm->arch.sie_page2)
1968 goto out_err; 2010 goto out_err;
1969 2011
1970 /* Populate the facility mask initially. */
1971 memcpy(kvm->arch.model.fac_mask, S390_lowcore.stfle_fac_list,
1972 sizeof(S390_lowcore.stfle_fac_list));
1973 for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
1974 if (i < kvm_s390_fac_list_mask_size())
1975 kvm->arch.model.fac_mask[i] &= kvm_s390_fac_list_mask[i];
1976 else
1977 kvm->arch.model.fac_mask[i] = 0UL;
1978 }
1979
1980 /* Populate the facility list initially. */
1981 kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; 2012 kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
1982 memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, 2013
1983 S390_ARCH_FAC_LIST_SIZE_BYTE); 2014 for (i = 0; i < kvm_s390_fac_size(); i++) {
2015 kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] &
2016 (kvm_s390_fac_base[i] |
2017 kvm_s390_fac_ext[i]);
2018 kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
2019 kvm_s390_fac_base[i];
2020 }
1984 2021
1985 /* we are always in czam mode - even on pre z14 machines */ 2022 /* we are always in czam mode - even on pre z14 machines */
1986 set_kvm_facility(kvm->arch.model.fac_mask, 138); 2023 set_kvm_facility(kvm->arch.model.fac_mask, 138);
@@ -2028,6 +2065,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2028 2065
2029 kvm->arch.css_support = 0; 2066 kvm->arch.css_support = 0;
2030 kvm->arch.use_irqchip = 0; 2067 kvm->arch.use_irqchip = 0;
2068 kvm->arch.use_pfmfi = sclp.has_pfmfi;
2031 kvm->arch.epoch = 0; 2069 kvm->arch.epoch = 0;
2032 2070
2033 spin_lock_init(&kvm->arch.start_stop_lock); 2071 spin_lock_init(&kvm->arch.start_stop_lock);
@@ -2454,8 +2492,6 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu)
2454 vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL); 2492 vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL);
2455 if (!vcpu->arch.sie_block->cbrlo) 2493 if (!vcpu->arch.sie_block->cbrlo)
2456 return -ENOMEM; 2494 return -ENOMEM;
2457
2458 vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI;
2459 return 0; 2495 return 0;
2460} 2496}
2461 2497
@@ -2491,7 +2527,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2491 if (test_kvm_facility(vcpu->kvm, 73)) 2527 if (test_kvm_facility(vcpu->kvm, 73))
2492 vcpu->arch.sie_block->ecb |= ECB_TE; 2528 vcpu->arch.sie_block->ecb |= ECB_TE;
2493 2529
2494 if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) 2530 if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi)
2495 vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; 2531 vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI;
2496 if (test_kvm_facility(vcpu->kvm, 130)) 2532 if (test_kvm_facility(vcpu->kvm, 130))
2497 vcpu->arch.sie_block->ecb2 |= ECB2_IEP; 2533 vcpu->arch.sie_block->ecb2 |= ECB2_IEP;
@@ -3023,7 +3059,7 @@ retry:
3023 3059
3024 if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) { 3060 if (kvm_check_request(KVM_REQ_START_MIGRATION, vcpu)) {
3025 /* 3061 /*
3026 * Disable CMMA virtualization; we will emulate the ESSA 3062 * Disable CMM virtualization; we will emulate the ESSA
3027 * instruction manually, in order to provide additional 3063 * instruction manually, in order to provide additional
3028 * functionalities needed for live migration. 3064 * functionalities needed for live migration.
3029 */ 3065 */
@@ -3033,11 +3069,11 @@ retry:
3033 3069
3034 if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) { 3070 if (kvm_check_request(KVM_REQ_STOP_MIGRATION, vcpu)) {
3035 /* 3071 /*
3036 * Re-enable CMMA virtualization if CMMA is available and 3072 * Re-enable CMM virtualization if CMMA is available and
3037 * was used. 3073 * CMM has been used.
3038 */ 3074 */
3039 if ((vcpu->kvm->arch.use_cmma) && 3075 if ((vcpu->kvm->arch.use_cmma) &&
3040 (vcpu->kvm->mm->context.use_cmma)) 3076 (vcpu->kvm->mm->context.uses_cmm))
3041 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; 3077 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
3042 goto retry; 3078 goto retry;
3043 } 3079 }
@@ -4044,7 +4080,7 @@ static int __init kvm_s390_init(void)
4044 } 4080 }
4045 4081
4046 for (i = 0; i < 16; i++) 4082 for (i = 0; i < 16; i++)
4047 kvm_s390_fac_list_mask[i] |= 4083 kvm_s390_fac_base[i] |=
4048 S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); 4084 S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i);
4049 4085
4050 return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); 4086 return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
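The rewritten loop in kvm_arch_init_vm() above combines the host STFLE bits with the two masks: fac_mask receives the base plus cpu-model facilities, fac_list only the base set. A toy standalone example with made-up 8-bit values (not kernel code) showing the effect:

#include <stdio.h>

int main(void)
{
	unsigned int stfle = 0xFF;  /* what the host CPU reports */
	unsigned int base  = 0x0F;  /* FACILITIES_KVM: default guest facilities */
	unsigned int ext   = 0x30;  /* FACILITIES_KVM_CPUMODEL: opt-in via CPU model */

	unsigned int fac_mask = stfle & (base | ext); /* maximum a CPU model may enable */
	unsigned int fac_list = stfle & base;         /* what a guest gets by default */

	printf("fac_mask = 0x%02x\n", fac_mask);      /* 0x3f */
	printf("fac_list = 0x%02x\n", fac_list);      /* 0x0f */
	return 0;
}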
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f55ac0ef99ea..1b5621f4fe5b 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -294,8 +294,6 @@ void exit_sie(struct kvm_vcpu *vcpu);
294void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); 294void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
295int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); 295int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
296void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); 296void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
297unsigned long kvm_s390_fac_list_mask_size(void);
298extern unsigned long kvm_s390_fac_list_mask[];
299void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); 297void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
300__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); 298__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
301 299
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f0b4185158af..ebfa0442e569 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1078,9 +1078,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
1078 * value really needs to be written to; if the value is 1078 * value really needs to be written to; if the value is
1079 * already correct, we do nothing and avoid the lock. 1079 * already correct, we do nothing and avoid the lock.
1080 */ 1080 */
1081 if (vcpu->kvm->mm->context.use_cmma == 0) { 1081 if (vcpu->kvm->mm->context.uses_cmm == 0) {
1082 down_write(&vcpu->kvm->mm->mmap_sem); 1082 down_write(&vcpu->kvm->mm->mmap_sem);
1083 vcpu->kvm->mm->context.use_cmma = 1; 1083 vcpu->kvm->mm->context.uses_cmm = 1;
1084 up_write(&vcpu->kvm->mm->mmap_sem); 1084 up_write(&vcpu->kvm->mm->mmap_sem);
1085 } 1085 }
1086 /* 1086 /*
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 424a1ba4f874..90a8c9e84ca6 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -62,6 +62,13 @@ static struct facility_def facility_defs[] = {
62 } 62 }
63 }, 63 },
64 { 64 {
65 /*
66 * FACILITIES_KVM contains the list of facilities that are part
67 * of the default facility mask and list that are passed to the
68 * initial CPU model. If no CPU model is used, this, together
69 * with the non-hypervisor managed bits, is the maximum list of
70 * guest facilities supported by KVM.
71 */
65 .name = "FACILITIES_KVM", 72 .name = "FACILITIES_KVM",
66 .bits = (int[]){ 73 .bits = (int[]){
67 0, /* N3 instructions */ 74 0, /* N3 instructions */
@@ -89,6 +96,19 @@ static struct facility_def facility_defs[] = {
89 -1 /* END */ 96 -1 /* END */
90 } 97 }
91 }, 98 },
99 {
100 /*
101 * FACILITIES_KVM_CPUMODEL contains the list of facilities
102 * that can be enabled by CPU model code if the host supports
103 * it. These facilities are not passed to the guest without
104 * CPU model support.
105 */
106
107 .name = "FACILITIES_KVM_CPUMODEL",
108 .bits = (int[]){
109 -1 /* END */
110 }
111 },
92}; 112};
93 113
94static void print_facility_list(struct facility_def *def) 114static void print_facility_list(struct facility_def *def)
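For reference, the generator turns each .bits array above into doubleword masks such as FACILITIES_KVM_CPUMODEL. A rough standalone sketch of that mapping, assuming the usual s390 convention that facility 0 is the most significant bit of the first doubleword (illustrative only, with a hypothetical facility list):

#include <stdio.h>

#define NR_DWORDS 16

static void facilities_to_mask(const int *bits, unsigned long long *mask)
{
	int i;

	for (i = 0; bits[i] != -1; i++) {
		int dword = bits[i] / 64;
		int bit   = 63 - (bits[i] & 63);	/* MSB-first numbering */

		mask[dword] |= 1ULL << bit;
	}
}

int main(void)
{
	/* hypothetical facility numbers, in the style of the .bits arrays above */
	int bits[] = { 0, 8, 76, -1 };
	unsigned long long mask[NR_DWORDS] = { 0 };
	int i;

	facilities_to_mask(bits, mask);
	for (i = 0; i < 2; i++)
		printf("dword %d: 0x%016llx\n", i, mask[i]);
	return 0;
}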
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 2edc49e7409b..cfecc2272f2d 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -21,7 +21,7 @@
21#include <asm/apic.h> 21#include <asm/apic.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/hypervisor.h> 23#include <asm/hypervisor.h>
24#include <asm/hyperv.h> 24#include <asm/hyperv-tlfs.h>
25#include <asm/mshyperv.h> 25#include <asm/mshyperv.h>
26#include <linux/version.h> 26#include <linux/version.h>
27#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
@@ -88,11 +88,15 @@ EXPORT_SYMBOL_GPL(hyperv_cs);
88u32 *hv_vp_index; 88u32 *hv_vp_index;
89EXPORT_SYMBOL_GPL(hv_vp_index); 89EXPORT_SYMBOL_GPL(hv_vp_index);
90 90
91struct hv_vp_assist_page **hv_vp_assist_page;
92EXPORT_SYMBOL_GPL(hv_vp_assist_page);
93
91u32 hv_max_vp_index; 94u32 hv_max_vp_index;
92 95
93static int hv_cpu_init(unsigned int cpu) 96static int hv_cpu_init(unsigned int cpu)
94{ 97{
95 u64 msr_vp_index; 98 u64 msr_vp_index;
99 struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
96 100
97 hv_get_vp_index(msr_vp_index); 101 hv_get_vp_index(msr_vp_index);
98 102
@@ -101,6 +105,22 @@ static int hv_cpu_init(unsigned int cpu)
101 if (msr_vp_index > hv_max_vp_index) 105 if (msr_vp_index > hv_max_vp_index)
102 hv_max_vp_index = msr_vp_index; 106 hv_max_vp_index = msr_vp_index;
103 107
108 if (!hv_vp_assist_page)
109 return 0;
110
111 if (!*hvp)
112 *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
113
114 if (*hvp) {
115 u64 val;
116
117 val = vmalloc_to_pfn(*hvp);
118 val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
119 HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
120
121 wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
122 }
123
104 return 0; 124 return 0;
105} 125}
106 126
@@ -198,6 +218,9 @@ static int hv_cpu_die(unsigned int cpu)
198 struct hv_reenlightenment_control re_ctrl; 218 struct hv_reenlightenment_control re_ctrl;
199 unsigned int new_cpu; 219 unsigned int new_cpu;
200 220
221 if (hv_vp_assist_page && hv_vp_assist_page[cpu])
222 wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
223
201 if (hv_reenlightenment_cb == NULL) 224 if (hv_reenlightenment_cb == NULL)
202 return 0; 225 return 0;
203 226
@@ -224,6 +247,7 @@ void hyperv_init(void)
224{ 247{
225 u64 guest_id, required_msrs; 248 u64 guest_id, required_msrs;
226 union hv_x64_msr_hypercall_contents hypercall_msr; 249 union hv_x64_msr_hypercall_contents hypercall_msr;
250 int cpuhp;
227 251
228 if (x86_hyper_type != X86_HYPER_MS_HYPERV) 252 if (x86_hyper_type != X86_HYPER_MS_HYPERV)
229 return; 253 return;
@@ -241,9 +265,17 @@ void hyperv_init(void)
241 if (!hv_vp_index) 265 if (!hv_vp_index)
242 return; 266 return;
243 267
244 if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", 268 hv_vp_assist_page = kcalloc(num_possible_cpus(),
245 hv_cpu_init, hv_cpu_die) < 0) 269 sizeof(*hv_vp_assist_page), GFP_KERNEL);
270 if (!hv_vp_assist_page) {
271 ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
246 goto free_vp_index; 272 goto free_vp_index;
273 }
274
275 cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
276 hv_cpu_init, hv_cpu_die);
277 if (cpuhp < 0)
278 goto free_vp_assist_page;
247 279
248 /* 280 /*
249 * Setup the hypercall page and enable hypercalls. 281 * Setup the hypercall page and enable hypercalls.
@@ -256,7 +288,7 @@ void hyperv_init(void)
256 hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); 288 hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
257 if (hv_hypercall_pg == NULL) { 289 if (hv_hypercall_pg == NULL) {
258 wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); 290 wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
259 goto free_vp_index; 291 goto remove_cpuhp_state;
260 } 292 }
261 293
262 rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); 294 rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -304,6 +336,11 @@ register_msr_cs:
304 336
305 return; 337 return;
306 338
339remove_cpuhp_state:
340 cpuhp_remove_state(cpuhp);
341free_vp_assist_page:
342 kfree(hv_vp_assist_page);
343 hv_vp_assist_page = NULL;
307free_vp_index: 344free_vp_index:
308 kfree(hv_vp_index); 345 kfree(hv_vp_index);
309 hv_vp_index = NULL; 346 hv_vp_index = NULL;
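The new hv_cpu_init() code above enables the VP assist page by writing its page frame number, shifted into the address field, together with the enable bit into HV_X64_MSR_VP_ASSIST_PAGE. A small worked example of that encoding (userspace sketch with a hypothetical PFN):

#include <stdio.h>

#define VP_ASSIST_PAGE_ADDRESS_SHIFT 12
#define VP_ASSIST_PAGE_ENABLE        0x00000001ULL

int main(void)
{
	unsigned long long pfn = 0x12345;	/* hypothetical page frame number */
	unsigned long long msr_val;

	msr_val = (pfn << VP_ASSIST_PAGE_ADDRESS_SHIFT) | VP_ASSIST_PAGE_ENABLE;
	printf("HV_X64_MSR_VP_ASSIST_PAGE <- 0x%llx\n", msr_val);	/* 0x12345001 */
	return 0;
}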
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/asm/hyperv-tlfs.h
index 6c0c3a3b631c..416cb0e0c496 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -1,6 +1,13 @@
1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 1/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2#ifndef _ASM_X86_HYPERV_H 2
3#define _ASM_X86_HYPERV_H 3/*
4 * This file contains definitions from Hyper-V Hypervisor Top-Level Functional
5 * Specification (TLFS):
6 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
7 */
8
9#ifndef _ASM_X86_HYPERV_TLFS_H
10#define _ASM_X86_HYPERV_TLFS_H
4 11
5#include <linux/types.h> 12#include <linux/types.h>
6 13
@@ -14,6 +21,7 @@
14#define HYPERV_CPUID_FEATURES 0x40000003 21#define HYPERV_CPUID_FEATURES 0x40000003
15#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 22#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
16#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 23#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
24#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
17 25
18#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 26#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
19#define HYPERV_CPUID_MIN 0x40000005 27#define HYPERV_CPUID_MIN 0x40000005
@@ -159,6 +167,9 @@
159/* Recommend using the newer ExProcessorMasks interface */ 167/* Recommend using the newer ExProcessorMasks interface */
160#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) 168#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
161 169
170/* Recommend using enlightened VMCS */
171#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14)
172
162/* 173/*
163 * Crash notification flag. 174 * Crash notification flag.
164 */ 175 */
@@ -192,7 +203,7 @@
192#define HV_X64_MSR_EOI 0x40000070 203#define HV_X64_MSR_EOI 0x40000070
193#define HV_X64_MSR_ICR 0x40000071 204#define HV_X64_MSR_ICR 0x40000071
194#define HV_X64_MSR_TPR 0x40000072 205#define HV_X64_MSR_TPR 0x40000072
195#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 206#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
196 207
197/* Define synthetic interrupt controller model specific registers. */ 208/* Define synthetic interrupt controller model specific registers. */
198#define HV_X64_MSR_SCONTROL 0x40000080 209#define HV_X64_MSR_SCONTROL 0x40000080
@@ -240,6 +251,55 @@
240#define HV_X64_MSR_CRASH_PARAMS \ 251#define HV_X64_MSR_CRASH_PARAMS \
241 (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) 252 (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
242 253
254/*
255 * Declare the MSR used to setup pages used to communicate with the hypervisor.
256 */
257union hv_x64_msr_hypercall_contents {
258 u64 as_uint64;
259 struct {
260 u64 enable:1;
261 u64 reserved:11;
262 u64 guest_physical_address:52;
263 };
264};
265
266/*
267 * TSC page layout.
268 */
269struct ms_hyperv_tsc_page {
270 volatile u32 tsc_sequence;
271 u32 reserved1;
272 volatile u64 tsc_scale;
273 volatile s64 tsc_offset;
274 u64 reserved2[509];
275};
276
277/*
278 * The guest OS needs to register the guest ID with the hypervisor.
279 * The guest ID is a 64 bit entity and the structure of this ID is
280 * specified in the Hyper-V specification:
281 *
282 * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
283 *
284 * While the current guideline does not specify how Linux guest ID(s)
285 * need to be generated, our plan is to publish the guidelines for
286 * Linux and other guest operating systems that currently are hosted
287 * on Hyper-V. The implementation here conforms to this yet
288 * unpublished guidelines.
289 *
290 *
291 * Bit(s)
292 * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
293 * 62:56 - Os Type; Linux is 0x100
294 * 55:48 - Distro specific identification
295 * 47:16 - Linux kernel version number
296 * 15:0 - Distro specific identification
297 *
298 *
299 */
300
301#define HV_LINUX_VENDOR_ID 0x8100
302
243/* TSC emulation after migration */ 303/* TSC emulation after migration */
244#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 304#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
245 305
@@ -278,10 +338,13 @@ struct hv_tsc_emulation_status {
278#define HVCALL_POST_MESSAGE 0x005c 338#define HVCALL_POST_MESSAGE 0x005c
279#define HVCALL_SIGNAL_EVENT 0x005d 339#define HVCALL_SIGNAL_EVENT 0x005d
280 340
281#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 341#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
282#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 342#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
283#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ 343#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
284 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) 344 (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
345
346/* Hyper-V Enlightened VMCS version mask in nested features CPUID */
347#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff
285 348
286#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 349#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
287#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 350#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
@@ -301,12 +364,22 @@ enum HV_GENERIC_SET_FORMAT {
301 HV_GENERIC_SET_ALL, 364 HV_GENERIC_SET_ALL,
302}; 365};
303 366
367#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
368#define HV_HYPERCALL_FAST_BIT BIT(16)
369#define HV_HYPERCALL_VARHEAD_OFFSET 17
370#define HV_HYPERCALL_REP_COMP_OFFSET 32
371#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
372#define HV_HYPERCALL_REP_START_OFFSET 48
373#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
374
304/* hypercall status code */ 375/* hypercall status code */
305#define HV_STATUS_SUCCESS 0 376#define HV_STATUS_SUCCESS 0
306#define HV_STATUS_INVALID_HYPERCALL_CODE 2 377#define HV_STATUS_INVALID_HYPERCALL_CODE 2
307#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 378#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
308#define HV_STATUS_INVALID_ALIGNMENT 4 379#define HV_STATUS_INVALID_ALIGNMENT 4
380#define HV_STATUS_INVALID_PARAMETER 5
309#define HV_STATUS_INSUFFICIENT_MEMORY 11 381#define HV_STATUS_INSUFFICIENT_MEMORY 11
382#define HV_STATUS_INVALID_PORT_ID 17
310#define HV_STATUS_INVALID_CONNECTION_ID 18 383#define HV_STATUS_INVALID_CONNECTION_ID 18
311#define HV_STATUS_INSUFFICIENT_BUFFERS 19 384#define HV_STATUS_INSUFFICIENT_BUFFERS 19
312 385
@@ -321,6 +394,8 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
321#define HV_SYNIC_SINT_COUNT (16) 394#define HV_SYNIC_SINT_COUNT (16)
322/* Define the expected SynIC version. */ 395/* Define the expected SynIC version. */
323#define HV_SYNIC_VERSION_1 (0x1) 396#define HV_SYNIC_VERSION_1 (0x1)
397/* Valid SynIC vectors are 16-255. */
398#define HV_SYNIC_FIRST_VALID_VECTOR (16)
324 399
325#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) 400#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
326#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) 401#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
@@ -415,6 +490,216 @@ struct hv_timer_message_payload {
415 __u64 delivery_time; /* When the message was delivered */ 490 __u64 delivery_time; /* When the message was delivered */
416}; 491};
417 492
493/* Define virtual processor assist page structure. */
494struct hv_vp_assist_page {
495 __u32 apic_assist;
496 __u32 reserved;
497 __u64 vtl_control[2];
498 __u64 nested_enlightenments_control[2];
499 __u32 enlighten_vmentry;
500 __u64 current_nested_vmcs;
501};
502
503struct hv_enlightened_vmcs {
504 u32 revision_id;
505 u32 abort;
506
507 u16 host_es_selector;
508 u16 host_cs_selector;
509 u16 host_ss_selector;
510 u16 host_ds_selector;
511 u16 host_fs_selector;
512 u16 host_gs_selector;
513 u16 host_tr_selector;
514
515 u64 host_ia32_pat;
516 u64 host_ia32_efer;
517
518 u64 host_cr0;
519 u64 host_cr3;
520 u64 host_cr4;
521
522 u64 host_ia32_sysenter_esp;
523 u64 host_ia32_sysenter_eip;
524 u64 host_rip;
525 u32 host_ia32_sysenter_cs;
526
527 u32 pin_based_vm_exec_control;
528 u32 vm_exit_controls;
529 u32 secondary_vm_exec_control;
530
531 u64 io_bitmap_a;
532 u64 io_bitmap_b;
533 u64 msr_bitmap;
534
535 u16 guest_es_selector;
536 u16 guest_cs_selector;
537 u16 guest_ss_selector;
538 u16 guest_ds_selector;
539 u16 guest_fs_selector;
540 u16 guest_gs_selector;
541 u16 guest_ldtr_selector;
542 u16 guest_tr_selector;
543
544 u32 guest_es_limit;
545 u32 guest_cs_limit;
546 u32 guest_ss_limit;
547 u32 guest_ds_limit;
548 u32 guest_fs_limit;
549 u32 guest_gs_limit;
550 u32 guest_ldtr_limit;
551 u32 guest_tr_limit;
552 u32 guest_gdtr_limit;
553 u32 guest_idtr_limit;
554
555 u32 guest_es_ar_bytes;
556 u32 guest_cs_ar_bytes;
557 u32 guest_ss_ar_bytes;
558 u32 guest_ds_ar_bytes;
559 u32 guest_fs_ar_bytes;
560 u32 guest_gs_ar_bytes;
561 u32 guest_ldtr_ar_bytes;
562 u32 guest_tr_ar_bytes;
563
564 u64 guest_es_base;
565 u64 guest_cs_base;
566 u64 guest_ss_base;
567 u64 guest_ds_base;
568 u64 guest_fs_base;
569 u64 guest_gs_base;
570 u64 guest_ldtr_base;
571 u64 guest_tr_base;
572 u64 guest_gdtr_base;
573 u64 guest_idtr_base;
574
575 u64 padding64_1[3];
576
577 u64 vm_exit_msr_store_addr;
578 u64 vm_exit_msr_load_addr;
579 u64 vm_entry_msr_load_addr;
580
581 u64 cr3_target_value0;
582 u64 cr3_target_value1;
583 u64 cr3_target_value2;
584 u64 cr3_target_value3;
585
586 u32 page_fault_error_code_mask;
587 u32 page_fault_error_code_match;
588
589 u32 cr3_target_count;
590 u32 vm_exit_msr_store_count;
591 u32 vm_exit_msr_load_count;
592 u32 vm_entry_msr_load_count;
593
594 u64 tsc_offset;
595 u64 virtual_apic_page_addr;
596 u64 vmcs_link_pointer;
597
598 u64 guest_ia32_debugctl;
599 u64 guest_ia32_pat;
600 u64 guest_ia32_efer;
601
602 u64 guest_pdptr0;
603 u64 guest_pdptr1;
604 u64 guest_pdptr2;
605 u64 guest_pdptr3;
606
607 u64 guest_pending_dbg_exceptions;
608 u64 guest_sysenter_esp;
609 u64 guest_sysenter_eip;
610
611 u32 guest_activity_state;
612 u32 guest_sysenter_cs;
613
614 u64 cr0_guest_host_mask;
615 u64 cr4_guest_host_mask;
616 u64 cr0_read_shadow;
617 u64 cr4_read_shadow;
618 u64 guest_cr0;
619 u64 guest_cr3;
620 u64 guest_cr4;
621 u64 guest_dr7;
622
623 u64 host_fs_base;
624 u64 host_gs_base;
625 u64 host_tr_base;
626 u64 host_gdtr_base;
627 u64 host_idtr_base;
628 u64 host_rsp;
629
630 u64 ept_pointer;
631
632 u16 virtual_processor_id;
633 u16 padding16[3];
634
635 u64 padding64_2[5];
636 u64 guest_physical_address;
637
638 u32 vm_instruction_error;
639 u32 vm_exit_reason;
640 u32 vm_exit_intr_info;
641 u32 vm_exit_intr_error_code;
642 u32 idt_vectoring_info_field;
643 u32 idt_vectoring_error_code;
644 u32 vm_exit_instruction_len;
645 u32 vmx_instruction_info;
646
647 u64 exit_qualification;
648 u64 exit_io_instruction_ecx;
649 u64 exit_io_instruction_esi;
650 u64 exit_io_instruction_edi;
651 u64 exit_io_instruction_eip;
652
653 u64 guest_linear_address;
654 u64 guest_rsp;
655 u64 guest_rflags;
656
657 u32 guest_interruptibility_info;
658 u32 cpu_based_vm_exec_control;
659 u32 exception_bitmap;
660 u32 vm_entry_controls;
661 u32 vm_entry_intr_info_field;
662 u32 vm_entry_exception_error_code;
663 u32 vm_entry_instruction_len;
664 u32 tpr_threshold;
665
666 u64 guest_rip;
667
668 u32 hv_clean_fields;
669 u32 hv_padding_32;
670 u32 hv_synthetic_controls;
671 u32 hv_enlightenments_control;
672 u32 hv_vp_id;
673
674 u64 hv_vm_id;
675 u64 partition_assist_page;
676 u64 padding64_4[4];
677 u64 guest_bndcfgs;
678 u64 padding64_5[7];
679 u64 xss_exit_bitmap;
680 u64 padding64_6[7];
681};
682
683#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
684#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
685#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
686#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
687#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
688#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
689#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
690#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
691#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
692#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
693#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
694#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
695#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
696#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
697#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
698#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
699#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
700
701#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
702
418#define HV_STIMER_ENABLE (1ULL << 0) 703#define HV_STIMER_ENABLE (1ULL << 0)
419#define HV_STIMER_PERIODIC (1ULL << 1) 704#define HV_STIMER_PERIODIC (1ULL << 1)
420#define HV_STIMER_LAZY (1ULL << 2) 705#define HV_STIMER_LAZY (1ULL << 2)
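The guest ID block comment moved into this header documents a simple bit layout (open-source flag, OS type, distro IDs, kernel version). A standalone sketch that packs those fields per that comment, using made-up values; the real generate_guest_id() lives in mshyperv.h further down:

#include <stdio.h>

static unsigned long long make_guest_id(unsigned long long os_open_source,
					unsigned long long os_type,
					unsigned long long distro_hi,
					unsigned long long kernel_version,
					unsigned long long distro_lo)
{
	return (os_open_source << 63) |	/* bit 63: open source OS */
	       (os_type        << 56) |	/* bits 62:56 */
	       (distro_hi      << 48) |	/* bits 55:48 */
	       (kernel_version << 16) |	/* bits 47:16 */
	        distro_lo;		/* bits 15:0 */
}

int main(void)
{
	/* e.g. an open-source Linux guest, hypothetical kernel version 0x041000 */
	unsigned long long id = make_guest_id(1, 1, 0, 0x041000, 0);

	printf("guest id = 0x%016llx\n", id);
	return 0;
}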
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b605a5b6a30c..949c977bc4c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,6 +34,7 @@
34#include <asm/msr-index.h> 34#include <asm/msr-index.h>
35#include <asm/asm.h> 35#include <asm/asm.h>
36#include <asm/kvm_page_track.h> 36#include <asm/kvm_page_track.h>
37#include <asm/hyperv-tlfs.h>
37 38
38#define KVM_MAX_VCPUS 288 39#define KVM_MAX_VCPUS 288
39#define KVM_SOFT_MAX_VCPUS 240 40#define KVM_SOFT_MAX_VCPUS 240
@@ -73,6 +74,7 @@
73#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) 74#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
74#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) 75#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
75#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) 76#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
77#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
76 78
77#define CR0_RESERVED_BITS \ 79#define CR0_RESERVED_BITS \
78 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 80 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -498,6 +500,7 @@ struct kvm_vcpu_arch {
498 u64 apic_base; 500 u64 apic_base;
499 struct kvm_lapic *apic; /* kernel irqchip context */ 501 struct kvm_lapic *apic; /* kernel irqchip context */
500 bool apicv_active; 502 bool apicv_active;
503 bool load_eoi_exitmap_pending;
501 DECLARE_BITMAP(ioapic_handled_vectors, 256); 504 DECLARE_BITMAP(ioapic_handled_vectors, 256);
502 unsigned long apic_attention; 505 unsigned long apic_attention;
503 int32_t apic_arb_prio; 506 int32_t apic_arb_prio;
@@ -571,7 +574,7 @@ struct kvm_vcpu_arch {
571 } exception; 574 } exception;
572 575
573 struct kvm_queued_interrupt { 576 struct kvm_queued_interrupt {
574 bool pending; 577 bool injected;
575 bool soft; 578 bool soft;
576 u8 nr; 579 u8 nr;
577 } interrupt; 580 } interrupt;
@@ -754,6 +757,12 @@ struct kvm_hv {
754 u64 hv_crash_ctl; 757 u64 hv_crash_ctl;
755 758
756 HV_REFERENCE_TSC_PAGE tsc_ref; 759 HV_REFERENCE_TSC_PAGE tsc_ref;
760
761 struct idr conn_to_evt;
762
763 u64 hv_reenlightenment_control;
764 u64 hv_tsc_emulation_control;
765 u64 hv_tsc_emulation_status;
757}; 766};
758 767
759enum kvm_irqchip_mode { 768enum kvm_irqchip_mode {
@@ -762,15 +771,6 @@ enum kvm_irqchip_mode {
762 KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ 771 KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
763}; 772};
764 773
765struct kvm_sev_info {
766 bool active; /* SEV enabled guest */
767 unsigned int asid; /* ASID used for this guest */
768 unsigned int handle; /* SEV firmware handle */
769 int fd; /* SEV device fd */
770 unsigned long pages_locked; /* Number of pages locked */
771 struct list_head regions_list; /* List of registered regions */
772};
773
774struct kvm_arch { 774struct kvm_arch {
775 unsigned int n_used_mmu_pages; 775 unsigned int n_used_mmu_pages;
776 unsigned int n_requested_mmu_pages; 776 unsigned int n_requested_mmu_pages;
@@ -800,13 +800,13 @@ struct kvm_arch {
800 struct mutex apic_map_lock; 800 struct mutex apic_map_lock;
801 struct kvm_apic_map *apic_map; 801 struct kvm_apic_map *apic_map;
802 802
803 unsigned int tss_addr;
804 bool apic_access_page_done; 803 bool apic_access_page_done;
805 804
806 gpa_t wall_clock; 805 gpa_t wall_clock;
807 806
808 bool ept_identity_pagetable_done; 807 bool mwait_in_guest;
809 gpa_t ept_identity_map_addr; 808 bool hlt_in_guest;
809 bool pause_in_guest;
810 810
811 unsigned long irq_sources_bitmap; 811 unsigned long irq_sources_bitmap;
812 s64 kvmclock_offset; 812 s64 kvmclock_offset;
@@ -849,17 +849,8 @@ struct kvm_arch {
849 849
850 bool disabled_lapic_found; 850 bool disabled_lapic_found;
851 851
852 /* Struct members for AVIC */
853 u32 avic_vm_id;
854 u32 ldr_mode;
855 struct page *avic_logical_id_table_page;
856 struct page *avic_physical_id_table_page;
857 struct hlist_node hnode;
858
859 bool x2apic_format; 852 bool x2apic_format;
860 bool x2apic_broadcast_quirk_disabled; 853 bool x2apic_broadcast_quirk_disabled;
861
862 struct kvm_sev_info sev_info;
863}; 854};
864 855
865struct kvm_vm_stat { 856struct kvm_vm_stat {
@@ -936,6 +927,8 @@ struct kvm_x86_ops {
936 bool (*cpu_has_high_real_mode_segbase)(void); 927 bool (*cpu_has_high_real_mode_segbase)(void);
937 void (*cpuid_update)(struct kvm_vcpu *vcpu); 928 void (*cpuid_update)(struct kvm_vcpu *vcpu);
938 929
930 struct kvm *(*vm_alloc)(void);
931 void (*vm_free)(struct kvm *);
939 int (*vm_init)(struct kvm *kvm); 932 int (*vm_init)(struct kvm *kvm);
940 void (*vm_destroy)(struct kvm *kvm); 933 void (*vm_destroy)(struct kvm *kvm);
941 934
@@ -1007,6 +1000,7 @@ struct kvm_x86_ops {
1007 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 1000 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
1008 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); 1001 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
1009 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 1002 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
1003 int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
1010 int (*get_tdp_level)(struct kvm_vcpu *vcpu); 1004 int (*get_tdp_level)(struct kvm_vcpu *vcpu);
1011 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 1005 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
1012 int (*get_lpage_level)(void); 1006 int (*get_lpage_level)(void);
@@ -1109,6 +1103,17 @@ struct kvm_arch_async_pf {
1109 1103
1110extern struct kvm_x86_ops *kvm_x86_ops; 1104extern struct kvm_x86_ops *kvm_x86_ops;
1111 1105
1106#define __KVM_HAVE_ARCH_VM_ALLOC
1107static inline struct kvm *kvm_arch_alloc_vm(void)
1108{
1109 return kvm_x86_ops->vm_alloc();
1110}
1111
1112static inline void kvm_arch_free_vm(struct kvm *kvm)
1113{
1114 return kvm_x86_ops->vm_free(kvm);
1115}
1116
1112int kvm_mmu_module_init(void); 1117int kvm_mmu_module_init(void);
1113void kvm_mmu_module_exit(void); 1118void kvm_mmu_module_exit(void);
1114 1119
@@ -1187,6 +1192,8 @@ enum emulation_result {
1187#define EMULTYPE_SKIP (1 << 2) 1192#define EMULTYPE_SKIP (1 << 2)
1188#define EMULTYPE_RETRY (1 << 3) 1193#define EMULTYPE_RETRY (1 << 3)
1189#define EMULTYPE_NO_REEXECUTE (1 << 4) 1194#define EMULTYPE_NO_REEXECUTE (1 << 4)
1195#define EMULTYPE_NO_UD_ON_FAIL (1 << 5)
1196#define EMULTYPE_VMWARE (1 << 6)
1190int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, 1197int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
1191 int emulation_type, void *insn, int insn_len); 1198 int emulation_type, void *insn, int insn_len);
1192 1199
@@ -1204,8 +1211,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
1204 1211
1205struct x86_emulate_ctxt; 1212struct x86_emulate_ctxt;
1206 1213
1207int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 1214int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
1208int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
1209int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 1215int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
1210int kvm_emulate_halt(struct kvm_vcpu *vcpu); 1216int kvm_emulate_halt(struct kvm_vcpu *vcpu);
1211int kvm_vcpu_halt(struct kvm_vcpu *vcpu); 1217int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
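The new vm_alloc()/vm_free() hooks let kvm_arch_alloc_vm()/kvm_arch_free_vm() defer the struct kvm allocation to the vendor module, so it can embed struct kvm in a larger per-VM container. A minimal standalone sketch of that idea, using simplified stand-in types rather than the real SVM/VMX code:

#include <stdlib.h>

struct kvm { int dummy; };		/* stand-in for the real struct kvm */

struct vendor_vm {
	struct kvm kvm;			/* first member, so the cast below is valid */
	unsigned long vendor_data;	/* vendor-private per-VM state */
};

static struct kvm *vendor_vm_alloc(void)
{
	struct vendor_vm *vm = calloc(1, sizeof(*vm));

	return vm ? &vm->kvm : NULL;
}

static void vendor_vm_free(struct kvm *kvm)
{
	/* the kernel would use container_of(); here kvm sits at offset 0 */
	free((struct vendor_vm *)kvm);
}

int main(void)
{
	struct kvm *kvm = vendor_vm_alloc();

	vendor_vm_free(kvm);
	return 0;
}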
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b407dda2bd7..3aea2658323a 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -88,6 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
88#ifdef CONFIG_KVM_GUEST 88#ifdef CONFIG_KVM_GUEST
89bool kvm_para_available(void); 89bool kvm_para_available(void);
90unsigned int kvm_arch_para_features(void); 90unsigned int kvm_arch_para_features(void);
91unsigned int kvm_arch_para_hints(void);
91void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); 92void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
92void kvm_async_pf_task_wake(u32 token); 93void kvm_async_pf_task_wake(u32 token);
93u32 kvm_read_and_reset_pf_reason(void); 94u32 kvm_read_and_reset_pf_reason(void);
@@ -115,6 +116,11 @@ static inline unsigned int kvm_arch_para_features(void)
115 return 0; 116 return 0;
116} 117}
117 118
119static inline unsigned int kvm_arch_para_hints(void)
120{
121 return 0;
122}
123
118static inline u32 kvm_read_and_reset_pf_reason(void) 124static inline u32 kvm_read_and_reset_pf_reason(void)
119{ 125{
120 return 0; 126 return 0;
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index e73c4d0c06ad..b90e79610cf7 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -6,90 +6,23 @@
6#include <linux/atomic.h> 6#include <linux/atomic.h>
7#include <linux/nmi.h> 7#include <linux/nmi.h>
8#include <asm/io.h> 8#include <asm/io.h>
9#include <asm/hyperv.h> 9#include <asm/hyperv-tlfs.h>
10#include <asm/nospec-branch.h> 10#include <asm/nospec-branch.h>
11 11
12/*
13 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
14 * is set by CPUID(HVCPUID_VERSION_FEATURES).
15 */
16enum hv_cpuid_function {
17 HVCPUID_VERSION_FEATURES = 0x00000001,
18 HVCPUID_VENDOR_MAXFUNCTION = 0x40000000,
19 HVCPUID_INTERFACE = 0x40000001,
20
21 /*
22 * The remaining functions depend on the value of
23 * HVCPUID_INTERFACE
24 */
25 HVCPUID_VERSION = 0x40000002,
26 HVCPUID_FEATURES = 0x40000003,
27 HVCPUID_ENLIGHTENMENT_INFO = 0x40000004,
28 HVCPUID_IMPLEMENTATION_LIMITS = 0x40000005,
29};
30
31struct ms_hyperv_info { 12struct ms_hyperv_info {
32 u32 features; 13 u32 features;
33 u32 misc_features; 14 u32 misc_features;
34 u32 hints; 15 u32 hints;
16 u32 nested_features;
35 u32 max_vp_index; 17 u32 max_vp_index;
36 u32 max_lp_index; 18 u32 max_lp_index;
37}; 19};
38 20
39extern struct ms_hyperv_info ms_hyperv; 21extern struct ms_hyperv_info ms_hyperv;
40 22
41/*
42 * Declare the MSR used to setup pages used to communicate with the hypervisor.
43 */
44union hv_x64_msr_hypercall_contents {
45 u64 as_uint64;
46 struct {
47 u64 enable:1;
48 u64 reserved:11;
49 u64 guest_physical_address:52;
50 };
51};
52 23
53/* 24/*
54 * TSC page layout. 25 * Generate the guest ID.
55 */
56
57struct ms_hyperv_tsc_page {
58 volatile u32 tsc_sequence;
59 u32 reserved1;
60 volatile u64 tsc_scale;
61 volatile s64 tsc_offset;
62 u64 reserved2[509];
63};
64
65/*
66 * The guest OS needs to register the guest ID with the hypervisor.
67 * The guest ID is a 64 bit entity and the structure of this ID is
68 * specified in the Hyper-V specification:
69 *
70 * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
71 *
72 * While the current guideline does not specify how Linux guest ID(s)
73 * need to be generated, our plan is to publish the guidelines for
74 * Linux and other guest operating systems that currently are hosted
75 * on Hyper-V. The implementation here conforms to this yet
76 * unpublished guidelines.
77 *
78 *
79 * Bit(s)
80 * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
81 * 62:56 - Os Type; Linux is 0x100
82 * 55:48 - Distro specific identification
83 * 47:16 - Linux kernel version number
84 * 15:0 - Distro specific identification
85 *
86 *
87 */
88
89#define HV_LINUX_VENDOR_ID 0x8100
90
91/*
92 * Generate the guest ID based on the guideline described above.
93 */ 26 */
94 27
95static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version, 28static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
@@ -228,14 +161,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
228 return hv_status; 161 return hv_status;
229} 162}
230 163
231#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
232#define HV_HYPERCALL_FAST_BIT BIT(16)
233#define HV_HYPERCALL_VARHEAD_OFFSET 17
234#define HV_HYPERCALL_REP_COMP_OFFSET 32
235#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
236#define HV_HYPERCALL_REP_START_OFFSET 48
237#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
238
239/* Fast hypercall with 8 bytes of input and no output */ 164/* Fast hypercall with 8 bytes of input and no output */
240static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) 165static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
241{ 166{
@@ -307,6 +232,15 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
307 */ 232 */
308extern u32 *hv_vp_index; 233extern u32 *hv_vp_index;
309extern u32 hv_max_vp_index; 234extern u32 hv_max_vp_index;
235extern struct hv_vp_assist_page **hv_vp_assist_page;
236
237static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
238{
239 if (!hv_vp_assist_page)
240 return NULL;
241
242 return hv_vp_assist_page[cpu];
243}
310 244
311/** 245/**
312 * hv_cpu_number_to_vp_number() - Map CPU to VP. 246 * hv_cpu_number_to_vp_number() - Map CPU to VP.
@@ -343,6 +277,10 @@ static inline void hyperv_setup_mmu_ops(void) {}
343static inline void set_hv_tscchange_cb(void (*cb)(void)) {} 277static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
344static inline void clear_hv_tscchange_cb(void) {} 278static inline void clear_hv_tscchange_cb(void) {}
345static inline void hyperv_stop_tsc_emulation(void) {}; 279static inline void hyperv_stop_tsc_emulation(void) {};
280static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
281{
282 return NULL;
283}
346#endif /* CONFIG_HYPERV */ 284#endif /* CONFIG_HYPERV */
347 285
348#ifdef CONFIG_HYPERV_TSCPAGE 286#ifdef CONFIG_HYPERV_TSCPAGE
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c9084dedfcfa..53d5b1b9255e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -353,7 +353,21 @@
353 353
354/* Fam 15h MSRs */ 354/* Fam 15h MSRs */
355#define MSR_F15H_PERF_CTL 0xc0010200 355#define MSR_F15H_PERF_CTL 0xc0010200
356#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
357#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
358#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4)
359#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6)
360#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8)
361#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10)
362
356#define MSR_F15H_PERF_CTR 0xc0010201 363#define MSR_F15H_PERF_CTR 0xc0010201
364#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR
365#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2)
366#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4)
367#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6)
368#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8)
369#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10)
370
357#define MSR_F15H_NB_PERF_CTL 0xc0010240 371#define MSR_F15H_NB_PERF_CTL 0xc0010240
358#define MSR_F15H_NB_PERF_CTR 0xc0010241 372#define MSR_F15H_NB_PERF_CTR 0xc0010241
359#define MSR_F15H_PTSC 0xc0010280 373#define MSR_F15H_PTSC 0xc0010280
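The new MSR_F15H_PERF_CTLn/CTRn defines encode the Fam15h scheme where control and counter MSRs interleave with a stride of 2. A quick standalone check of that arithmetic, with hypothetical helper names:

#include <assert.h>

#define MSR_F15H_PERF_CTL 0xc0010200
#define MSR_F15H_PERF_CTR 0xc0010201

static unsigned int f15h_perf_ctl(unsigned int i) { return MSR_F15H_PERF_CTL + 2 * i; }
static unsigned int f15h_perf_ctr(unsigned int i) { return MSR_F15H_PERF_CTR + 2 * i; }

int main(void)
{
	assert(f15h_perf_ctl(0) == 0xc0010200);	/* MSR_F15H_PERF_CTL0 */
	assert(f15h_perf_ctr(0) == 0xc0010201);	/* MSR_F15H_PERF_CTR0 */
	assert(f15h_perf_ctl(5) == 0xc001020a);	/* MSR_F15H_PERF_CTL5 */
	assert(f15h_perf_ctr(5) == 0xc001020b);	/* MSR_F15H_PERF_CTR5 */
	return 0;
}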
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b0ccd4847a58..4fa4206029e3 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -407,9 +407,19 @@ union irq_stack_union {
407DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; 407DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
408DECLARE_INIT_PER_CPU(irq_stack_union); 408DECLARE_INIT_PER_CPU(irq_stack_union);
409 409
410static inline unsigned long cpu_kernelmode_gs_base(int cpu)
411{
412 return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu);
413}
414
410DECLARE_PER_CPU(char *, irq_stack_ptr); 415DECLARE_PER_CPU(char *, irq_stack_ptr);
411DECLARE_PER_CPU(unsigned int, irq_count); 416DECLARE_PER_CPU(unsigned int, irq_count);
412extern asmlinkage void ignore_sysret(void); 417extern asmlinkage void ignore_sysret(void);
418
419#if IS_ENABLED(CONFIG_KVM)
420/* Save actual FS/GS selectors and bases to current->thread */
421void save_fsgs_for_kvm(void);
422#endif
413#else /* X86_64 */ 423#else /* X86_64 */
414#ifdef CONFIG_CC_STACKPROTECTOR 424#ifdef CONFIG_CC_STACKPROTECTOR
415/* 425/*
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0487ac054870..93b462e48067 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -60,7 +60,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
60 u32 intercept_dr; 60 u32 intercept_dr;
61 u32 intercept_exceptions; 61 u32 intercept_exceptions;
62 u64 intercept; 62 u64 intercept;
63 u8 reserved_1[42]; 63 u8 reserved_1[40];
64 u16 pause_filter_thresh;
64 u16 pause_filter_count; 65 u16 pause_filter_count;
65 u64 iopm_base_pa; 66 u64 iopm_base_pa;
66 u64 msrpm_base_pa; 67 u64 msrpm_base_pa;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a960488eae..c535c2fdea13 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,8 +354,25 @@ struct kvm_xcrs {
354 __u64 padding[16]; 354 __u64 padding[16];
355}; 355};
356 356
357/* definition of registers in kvm_run */ 357#define KVM_SYNC_X86_REGS (1UL << 0)
358#define KVM_SYNC_X86_SREGS (1UL << 1)
359#define KVM_SYNC_X86_EVENTS (1UL << 2)
360
361#define KVM_SYNC_X86_VALID_FIELDS \
362 (KVM_SYNC_X86_REGS| \
363 KVM_SYNC_X86_SREGS| \
364 KVM_SYNC_X86_EVENTS)
365
366/* kvm_sync_regs struct included by kvm_run struct */
358struct kvm_sync_regs { 367struct kvm_sync_regs {
368 /* Members of this structure are potentially malicious.
369 * Care must be taken by code reading, esp. interpreting,
370 * data fields from them inside KVM to prevent TOCTOU and
371 * double-fetch types of vulnerabilities.
372 */
373 struct kvm_regs regs;
374 struct kvm_sregs sregs;
375 struct kvm_vcpu_events events;
359}; 376};
360 377
361#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) 378#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
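With the sync_regs members above, userspace can have KVM copy registers into the mmap()ed kvm_run area on exit and write selected sets back on the next KVM_RUN, avoiding separate GET/SET ioctls. A hedged userspace sketch, assuming a vcpu fd with kvm_run already mapped, headers from this series, and KVM_CAP_SYNC_REGS reporting the x86 fields; error handling omitted:

#include <linux/kvm.h>
#include <sys/ioctl.h>

void run_with_sync_regs(int vcpu_fd, struct kvm_run *run)
{
	/* Ask KVM to copy these register sets into run->s.regs on exit. */
	run->kvm_valid_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;

	ioctl(vcpu_fd, KVM_RUN, 0);

	/* Inspect or tweak the shared copy without extra ioctls... */
	run->s.regs.regs.rbx += 1;

	/* ...and tell KVM which parts to write back on the next KVM_RUN. */
	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;
}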
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6cfa9c8cb7d6..4c851ebb3ceb 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -3,15 +3,16 @@
3#define _UAPI_ASM_X86_KVM_PARA_H 3#define _UAPI_ASM_X86_KVM_PARA_H
4 4
5#include <linux/types.h> 5#include <linux/types.h>
6#include <asm/hyperv.h>
7 6
8/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 7/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
9 * should be used to determine that a VM is running under KVM. 8 * should be used to determine that a VM is running under KVM.
10 */ 9 */
11#define KVM_CPUID_SIGNATURE 0x40000000 10#define KVM_CPUID_SIGNATURE 0x40000000
12 11
13/* This CPUID returns a feature bitmap in eax. Before enabling a particular 12/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
14 * paravirtualization, the appropriate feature bit should be checked. 13 * a particular paravirtualization, the appropriate feature bit should
14 * be checked in eax. The performance hint feature bit should be checked
15 * in edx.
15 */ 16 */
16#define KVM_CPUID_FEATURES 0x40000001 17#define KVM_CPUID_FEATURES 0x40000001
17#define KVM_FEATURE_CLOCKSOURCE 0 18#define KVM_FEATURE_CLOCKSOURCE 0
@@ -28,6 +29,8 @@
28#define KVM_FEATURE_PV_TLB_FLUSH 9 29#define KVM_FEATURE_PV_TLB_FLUSH 9
29#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 30#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
30 31
32#define KVM_HINTS_DEDICATED 0
33
31/* The last 8 bits are used to indicate how to interpret the flags field 34/* The last 8 bits are used to indicate how to interpret the flags field
32 * in pvclock structure. If no bits are set, all flags are ignored. 35 * in pvclock structure. If no bits are set, all flags are ignored.
33 */ 36 */
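Per the updated comment, CPUID leaf 0x40000001 now returns two bitmaps: feature bits in EAX and performance hints such as KVM_HINTS_DEDICATED in EDX. An illustrative guest-side check (not from the patch), assuming the code already knows it is running under KVM so the signature check is omitted:

#include <stdio.h>
#include <cpuid.h>

#define KVM_CPUID_FEATURES   0x40000001
#define KVM_HINTS_DEDICATED  0

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Hypervisor leaves sit above the basic range, so use __cpuid directly. */
	__cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);

	printf("feature bitmap (eax): 0x%x\n", eax);
	printf("hint bitmap    (edx): 0x%x\n", edx);
	printf("dedicated pCPU hint : %s\n",
	       (edx & (1u << KVM_HINTS_DEDICATED)) ? "yes" : "no");
	return 0;
}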
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf4821240..4702fbd98f92 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -487,7 +487,7 @@ void load_percpu_segment(int cpu)
487 loadsegment(fs, __KERNEL_PERCPU); 487 loadsegment(fs, __KERNEL_PERCPU);
488#else 488#else
489 __loadsegment_simple(gs, 0); 489 __loadsegment_simple(gs, 0);
490 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); 490 wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
491#endif 491#endif
492 load_stack_canary_segment(); 492 load_stack_canary_segment();
493} 493}
@@ -1398,6 +1398,7 @@ __setup("clearcpuid=", setup_clearcpuid);
1398#ifdef CONFIG_X86_64 1398#ifdef CONFIG_X86_64
1399DEFINE_PER_CPU_FIRST(union irq_stack_union, 1399DEFINE_PER_CPU_FIRST(union irq_stack_union,
1400 irq_stack_union) __aligned(PAGE_SIZE) __visible; 1400 irq_stack_union) __aligned(PAGE_SIZE) __visible;
1401EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
1401 1402
1402/* 1403/*
1403 * The following percpu variables are hot. Align current_task to 1404 * The following percpu variables are hot. Align current_task to
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 4488cf0dd499..031082c96db8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -22,7 +22,7 @@
22#include <linux/kexec.h> 22#include <linux/kexec.h>
23#include <asm/processor.h> 23#include <asm/processor.h>
24#include <asm/hypervisor.h> 24#include <asm/hypervisor.h>
25#include <asm/hyperv.h> 25#include <asm/hyperv-tlfs.h>
26#include <asm/mshyperv.h> 26#include <asm/mshyperv.h>
27#include <asm/desc.h> 27#include <asm/desc.h>
28#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
@@ -216,8 +216,8 @@ static void __init ms_hyperv_init_platform(void)
216 pr_info("Hyper-V: features 0x%x, hints 0x%x\n", 216 pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
217 ms_hyperv.features, ms_hyperv.hints); 217 ms_hyperv.features, ms_hyperv.hints);
218 218
219 ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); 219 ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
220 ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); 220 ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
221 221
222 pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", 222 pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
223 ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); 223 ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
@@ -225,11 +225,12 @@ static void __init ms_hyperv_init_platform(void)
225 /* 225 /*
226 * Extract host information. 226 * Extract host information.
227 */ 227 */
228 if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) { 228 if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
229 hv_host_info_eax = cpuid_eax(HVCPUID_VERSION); 229 HYPERV_CPUID_VERSION) {
230 hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION); 230 hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
231 hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION); 231 hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
232 hv_host_info_edx = cpuid_edx(HVCPUID_VERSION); 232 hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
233 hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
233 234
234 pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", 235 pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
235 hv_host_info_eax, hv_host_info_ebx >> 16, 236 hv_host_info_eax, hv_host_info_ebx >> 16,
@@ -243,6 +244,11 @@ static void __init ms_hyperv_init_platform(void)
243 x86_platform.calibrate_cpu = hv_get_tsc_khz; 244 x86_platform.calibrate_cpu = hv_get_tsc_khz;
244 } 245 }
245 246
247 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
248 ms_hyperv.nested_features =
249 cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
250 }
251
246#ifdef CONFIG_X86_LOCAL_APIC 252#ifdef CONFIG_X86_LOCAL_APIC
247 if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && 253 if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
248 ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { 254 ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fae86e36e399..7867417cfaff 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -454,6 +454,13 @@ static void __init sev_map_percpu_data(void)
454} 454}
455 455
456#ifdef CONFIG_SMP 456#ifdef CONFIG_SMP
457static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
458{
459 native_smp_prepare_cpus(max_cpus);
460 if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
461 static_branch_disable(&virt_spin_lock_key);
462}
463
457static void __init kvm_smp_prepare_boot_cpu(void) 464static void __init kvm_smp_prepare_boot_cpu(void)
458{ 465{
459 /* 466 /*
@@ -546,6 +553,7 @@ static void __init kvm_guest_init(void)
546 } 553 }
547 554
548 if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && 555 if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
556 !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
549 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) 557 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
550 pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; 558 pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
551 559
@@ -556,6 +564,7 @@ static void __init kvm_guest_init(void)
556 kvm_setup_vsyscall_timeinfo(); 564 kvm_setup_vsyscall_timeinfo();
557 565
558#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
567 smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
559 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 568 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
560 if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", 569 if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
561 kvm_cpu_online, kvm_cpu_down_prepare) < 0) 570 kvm_cpu_online, kvm_cpu_down_prepare) < 0)
@@ -605,6 +614,11 @@ unsigned int kvm_arch_para_features(void)
605 return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); 614 return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
606} 615}
607 616
617unsigned int kvm_arch_para_hints(void)
618{
619 return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
620}
621
608static uint32_t __init kvm_detect(void) 622static uint32_t __init kvm_detect(void)
609{ 623{
610 return kvm_cpuid_base(); 624 return kvm_cpuid_base();
@@ -635,6 +649,7 @@ static __init int kvm_setup_pv_tlb_flush(void)
635 int cpu; 649 int cpu;
636 650
637 if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && 651 if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
652 !kvm_para_has_hint(KVM_HINTS_DEDICATED) &&
638 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { 653 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
639 for_each_possible_cpu(cpu) { 654 for_each_possible_cpu(cpu) {
640 zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), 655 zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
@@ -730,6 +745,9 @@ void __init kvm_spinlock_init(void)
730 if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) 745 if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
731 return; 746 return;
732 747
748 if (kvm_para_has_hint(KVM_HINTS_DEDICATED))
749 return;
750
733 __pv_init_lock_hash(); 751 __pv_init_lock_hash();
734 pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; 752 pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
735 pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); 753 pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9eb448c7859d..4b100fe0f508 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -205,6 +205,20 @@ static __always_inline void save_fsgs(struct task_struct *task)
205 save_base_legacy(task, task->thread.gsindex, GS); 205 save_base_legacy(task, task->thread.gsindex, GS);
206} 206}
207 207
208#if IS_ENABLED(CONFIG_KVM)
209/*
210 * While a process is running, current->thread.fsbase and current->thread.gsbase
211 * may not match the corresponding CPU registers (see save_base_legacy()). KVM
212 * wants an efficient way to save and restore FSBASE and GSBASE.
213 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
214 */
215void save_fsgs_for_kvm(void)
216{
217 save_fsgs(current);
218}
219EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
220#endif
221
208static __always_inline void loadseg(enum which_selector which, 222static __always_inline void loadseg(enum which_selector which,
209 unsigned short sel) 223 unsigned short sel)
210{ 224{
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b671fc2d0422..82055b90a8b3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -135,6 +135,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
135 return -EINVAL; 135 return -EINVAL;
136 } 136 }
137 137
138 best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
139 if (kvm_hlt_in_guest(vcpu->kvm) && best &&
140 (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
141 best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
142
138 /* Update physical-address width */ 143 /* Update physical-address width */
139 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); 144 vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
140 kvm_mmu_reset_context(vcpu); 145 kvm_mmu_reset_context(vcpu);
@@ -370,7 +375,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
370 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 375 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
371 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 376 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
372 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | 377 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
373 F(TOPOEXT); 378 F(TOPOEXT) | F(PERFCTR_CORE);
374 379
375 /* cpuid 0x80000008.ebx */ 380 /* cpuid 0x80000008.ebx */
376 const u32 kvm_cpuid_8000_0008_ebx_x86_features = 381 const u32 kvm_cpuid_8000_0008_ebx_x86_features =
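
The first cpuid.c hunk hides KVM_FEATURE_PV_UNHALT once HLT exits are disabled, since paravirtual unhalt is pointless when HLT runs without a vmexit. For reference, a hedged sketch of how a VMM might opt into that mode using the KVM_CAP_X86_DISABLE_EXITS capability added elsewhere in this series (capability and flag names are taken from the series description, not from the hunks shown here):

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Let the guest execute HLT natively, e.g. on dedicated physical CPUs. */
static int disable_hlt_exits(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_DISABLE_EXITS;
	cap.args[0] = KVM_X86_DISABLE_EXITS_HLT;

	/* Intended to be issued on the VM fd before any vCPU is created. */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
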
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d91eaeb01034..b3705ae52824 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -30,6 +30,7 @@
30#include "x86.h" 30#include "x86.h"
31#include "tss.h" 31#include "tss.h"
32#include "mmu.h" 32#include "mmu.h"
33#include "pmu.h"
33 34
34/* 35/*
35 * Operand types 36 * Operand types
@@ -2887,6 +2888,9 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2887 return ctxt->ops->cpl(ctxt) > iopl; 2888 return ctxt->ops->cpl(ctxt) > iopl;
2888} 2889}
2889 2890
2891#define VMWARE_PORT_VMPORT (0x5658)
2892#define VMWARE_PORT_VMRPC (0x5659)
2893
2890static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2894static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2891 u16 port, u16 len) 2895 u16 port, u16 len)
2892{ 2896{
@@ -2898,6 +2902,14 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2898 unsigned mask = (1 << len) - 1; 2902 unsigned mask = (1 << len) - 1;
2899 unsigned long base; 2903 unsigned long base;
2900 2904
2905 /*
2906 * VMware allows access to these ports even if denied
2907 * by TSS I/O permission bitmap. Mimic behavior.
2908 */
2909 if (enable_vmware_backdoor &&
2910 ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
2911 return true;
2912
2901 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); 2913 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
2902 if (!tr_seg.p) 2914 if (!tr_seg.p)
2903 return false; 2915 return false;
@@ -4282,6 +4294,13 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
4282 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 4294 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
4283 u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); 4295 u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
4284 4296
4297 /*
4298 * VMware allows access to these Pseudo-PMCs even when read via RDPMC
4299 * in Ring3 when CR4.PCE=0.
4300 */
4301 if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
4302 return X86EMUL_CONTINUE;
4303
4285 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 4304 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
4286 ctxt->ops->check_pmc(ctxt, rcx)) 4305 ctxt->ops->check_pmc(ctxt, rcx))
4287 return emulate_gp(ctxt, 0); 4306 return emulate_gp(ctxt, 0);
@@ -4498,6 +4517,10 @@ static const struct gprefix pfx_0f_2b = {
4498 ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N, 4517 ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
4499}; 4518};
4500 4519
4520static const struct gprefix pfx_0f_10_0f_11 = {
4521 I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
4522};
4523
4501static const struct gprefix pfx_0f_28_0f_29 = { 4524static const struct gprefix pfx_0f_28_0f_29 = {
4502 I(Aligned, em_mov), I(Aligned, em_mov), N, N, 4525 I(Aligned, em_mov), I(Aligned, em_mov), N, N,
4503}; 4526};
@@ -4709,7 +4732,9 @@ static const struct opcode twobyte_table[256] = {
4709 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 4732 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
4710 N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, 4733 N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
4711 /* 0x10 - 0x1F */ 4734 /* 0x10 - 0x1F */
4712 N, N, N, N, N, N, N, N, 4735 GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
4736 GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
4737 N, N, N, N, N, N,
4713 D(ImplicitOps | ModRM | SrcMem | NoAccess), 4738 D(ImplicitOps | ModRM | SrcMem | NoAccess),
4714 N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), 4739 N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
4715 /* 0x20 - 0x2F */ 4740 /* 0x20 - 0x2F */
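
The emulator hunks above make the TSS I/O-bitmap check pass for the two VMware backdoor ports, so a ring-3 in/out on them no longer injects #GP but is forwarded to the VMM like any other PIO exit (only when KVM is loaded with enable_vmware_backdoor). A guest-side illustration; the magic constant and the "get version" command follow the publicly known VMware backdoor convention and are assumptions here, and the value read back is whatever the VMM chooses to return:

#include <stdint.h>
#include <stdio.h>

#define VMWARE_MAGIC 0x564d5868u   /* 'VMXh', by convention */
#define VMWARE_PORT  0x5658u       /* VMWARE_PORT_VMPORT in the hunk above */

int main(void)
{
	uint32_t cmd = 10;             /* "get version", by convention */
	uint32_t eax = VMWARE_MAGIC;

	/* Ring-3 port read: with enable_vmware_backdoor the #GP from the
	 * TSS I/O bitmap is intercepted and the access is emulated. */
	asm volatile("inl %%dx, %%eax"
		     : "+a"(eax)
		     : "c"(cmd), "d"((uint16_t)VMWARE_PORT));

	printf("backdoor reply in eax: 0x%x\n", eax);
	return 0;
}
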
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index dc97f2544b6f..98618e397342 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -29,6 +29,7 @@
29#include <linux/kvm_host.h> 29#include <linux/kvm_host.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/sched/cputime.h> 31#include <linux/sched/cputime.h>
32#include <linux/eventfd.h>
32 33
33#include <asm/apicdef.h> 34#include <asm/apicdef.h>
34#include <trace/events/kvm.h> 35#include <trace/events/kvm.h>
@@ -74,13 +75,38 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
74 return false; 75 return false;
75} 76}
76 77
78static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
79 int vector)
80{
81 if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
82 return;
83
84 if (synic_has_vector_connected(synic, vector))
85 __set_bit(vector, synic->vec_bitmap);
86 else
87 __clear_bit(vector, synic->vec_bitmap);
88
89 if (synic_has_vector_auto_eoi(synic, vector))
90 __set_bit(vector, synic->auto_eoi_bitmap);
91 else
92 __clear_bit(vector, synic->auto_eoi_bitmap);
93}
94
77static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, 95static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
78 u64 data, bool host) 96 u64 data, bool host)
79{ 97{
80 int vector; 98 int vector, old_vector;
99 bool masked;
81 100
82 vector = data & HV_SYNIC_SINT_VECTOR_MASK; 101 vector = data & HV_SYNIC_SINT_VECTOR_MASK;
83 if (vector < 16 && !host) 102 masked = data & HV_SYNIC_SINT_MASKED;
103
104 /*
105 * Valid vectors are 16-255; however, nested Hyper-V attempts to write
106 * the default '0x10000' value on boot, and this should not #GP. We also
107 * need to allow zero-initializing the register from the host.
108 */
109 if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
84 return 1; 110 return 1;
85 /* 111 /*
86 * Guest may configure multiple SINTs to use the same vector, so 112 * Guest may configure multiple SINTs to use the same vector, so
@@ -88,18 +114,13 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
88 * bitmap of vectors with auto-eoi behavior. The bitmaps are 114 * bitmap of vectors with auto-eoi behavior. The bitmaps are
89 * updated here, and atomically queried on fast paths. 115 * updated here, and atomically queried on fast paths.
90 */ 116 */
117 old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
91 118
92 atomic64_set(&synic->sint[sint], data); 119 atomic64_set(&synic->sint[sint], data);
93 120
94 if (synic_has_vector_connected(synic, vector)) 121 synic_update_vector(synic, old_vector);
95 __set_bit(vector, synic->vec_bitmap);
96 else
97 __clear_bit(vector, synic->vec_bitmap);
98 122
99 if (synic_has_vector_auto_eoi(synic, vector)) 123 synic_update_vector(synic, vector);
100 __set_bit(vector, synic->auto_eoi_bitmap);
101 else
102 __clear_bit(vector, synic->auto_eoi_bitmap);
103 124
104 /* Load SynIC vectors into EOI exit bitmap */ 125 /* Load SynIC vectors into EOI exit bitmap */
105 kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic)); 126 kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
@@ -736,6 +757,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
736 case HV_X64_MSR_CRASH_CTL: 757 case HV_X64_MSR_CRASH_CTL:
737 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 758 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
738 case HV_X64_MSR_RESET: 759 case HV_X64_MSR_RESET:
760 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
761 case HV_X64_MSR_TSC_EMULATION_CONTROL:
762 case HV_X64_MSR_TSC_EMULATION_STATUS:
739 r = true; 763 r = true;
740 break; 764 break;
741 } 765 }
@@ -981,6 +1005,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
981 kvm_make_request(KVM_REQ_HV_RESET, vcpu); 1005 kvm_make_request(KVM_REQ_HV_RESET, vcpu);
982 } 1006 }
983 break; 1007 break;
1008 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
1009 hv->hv_reenlightenment_control = data;
1010 break;
1011 case HV_X64_MSR_TSC_EMULATION_CONTROL:
1012 hv->hv_tsc_emulation_control = data;
1013 break;
1014 case HV_X64_MSR_TSC_EMULATION_STATUS:
1015 hv->hv_tsc_emulation_status = data;
1016 break;
984 default: 1017 default:
985 vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 1018
986 msr, data); 1019 msr, data);
@@ -1009,17 +1042,17 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
1009 return 1; 1042 return 1;
1010 hv->vp_index = (u32)data; 1043 hv->vp_index = (u32)data;
1011 break; 1044 break;
1012 case HV_X64_MSR_APIC_ASSIST_PAGE: { 1045 case HV_X64_MSR_VP_ASSIST_PAGE: {
1013 u64 gfn; 1046 u64 gfn;
1014 unsigned long addr; 1047 unsigned long addr;
1015 1048
1016 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { 1049 if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
1017 hv->hv_vapic = data; 1050 hv->hv_vapic = data;
1018 if (kvm_lapic_enable_pv_eoi(vcpu, 0)) 1051 if (kvm_lapic_enable_pv_eoi(vcpu, 0))
1019 return 1; 1052 return 1;
1020 break; 1053 break;
1021 } 1054 }
1022 gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; 1055 gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
1023 addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); 1056 addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
1024 if (kvm_is_error_hva(addr)) 1057 if (kvm_is_error_hva(addr))
1025 return 1; 1058 return 1;
@@ -1105,6 +1138,15 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1105 case HV_X64_MSR_RESET: 1138 case HV_X64_MSR_RESET:
1106 data = 0; 1139 data = 0;
1107 break; 1140 break;
1141 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
1142 data = hv->hv_reenlightenment_control;
1143 break;
1144 case HV_X64_MSR_TSC_EMULATION_CONTROL:
1145 data = hv->hv_tsc_emulation_control;
1146 break;
1147 case HV_X64_MSR_TSC_EMULATION_STATUS:
1148 data = hv->hv_tsc_emulation_status;
1149 break;
1108 default: 1150 default:
1109 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1151 vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1110 return 1; 1152 return 1;
@@ -1129,7 +1171,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1129 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 1171 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1130 case HV_X64_MSR_TPR: 1172 case HV_X64_MSR_TPR:
1131 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 1173 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1132 case HV_X64_MSR_APIC_ASSIST_PAGE: 1174 case HV_X64_MSR_VP_ASSIST_PAGE:
1133 data = hv->hv_vapic; 1175 data = hv->hv_vapic;
1134 break; 1176 break;
1135 case HV_X64_MSR_VP_RUNTIME: 1177 case HV_X64_MSR_VP_RUNTIME:
@@ -1226,10 +1268,47 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
1226 return 1; 1268 return 1;
1227} 1269}
1228 1270
1271static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
1272{
1273 struct eventfd_ctx *eventfd;
1274
1275 if (unlikely(!fast)) {
1276 int ret;
1277 gpa_t gpa = param;
1278
1279 if ((gpa & (__alignof__(param) - 1)) ||
1280 offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
1281 return HV_STATUS_INVALID_ALIGNMENT;
1282
1283 ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
1284 if (ret < 0)
1285 return HV_STATUS_INVALID_ALIGNMENT;
1286 }
1287
1288 /*
1289 * Per spec, bits 32-47 contain the extra "flag number". However, we
1290 * have no use for it, and in all known usecases it is zero, so just
1291 * report lookup failure if it isn't.
1292 */
1293 if (param & 0xffff00000000ULL)
1294 return HV_STATUS_INVALID_PORT_ID;
1295 /* remaining bits are reserved-zero */
1296 if (param & ~KVM_HYPERV_CONN_ID_MASK)
1297 return HV_STATUS_INVALID_HYPERCALL_INPUT;
1298
1299 /* conn_to_evt is protected by vcpu->kvm->srcu */
1300 eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
1301 if (!eventfd)
1302 return HV_STATUS_INVALID_PORT_ID;
1303
1304 eventfd_signal(eventfd, 1);
1305 return HV_STATUS_SUCCESS;
1306}
1307
1229int kvm_hv_hypercall(struct kvm_vcpu *vcpu) 1308int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1230{ 1309{
1231 u64 param, ingpa, outgpa, ret; 1310 u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
1232 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; 1311 uint16_t code, rep_idx, rep_cnt;
1233 bool fast, longmode; 1312 bool fast, longmode;
1234 1313
1235 /* 1314 /*
@@ -1268,7 +1347,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1268 1347
1269 /* Hypercall continuation is not supported yet */ 1348 /* Hypercall continuation is not supported yet */
1270 if (rep_cnt || rep_idx) { 1349 if (rep_cnt || rep_idx) {
1271 res = HV_STATUS_INVALID_HYPERCALL_CODE; 1350 ret = HV_STATUS_INVALID_HYPERCALL_CODE;
1272 goto set_result; 1351 goto set_result;
1273 } 1352 }
1274 1353
@@ -1276,11 +1355,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1276 case HVCALL_NOTIFY_LONG_SPIN_WAIT: 1355 case HVCALL_NOTIFY_LONG_SPIN_WAIT:
1277 kvm_vcpu_on_spin(vcpu, true); 1356 kvm_vcpu_on_spin(vcpu, true);
1278 break; 1357 break;
1279 case HVCALL_POST_MESSAGE:
1280 case HVCALL_SIGNAL_EVENT: 1358 case HVCALL_SIGNAL_EVENT:
1359 ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
1360 if (ret != HV_STATUS_INVALID_PORT_ID)
1361 break;
1362 /* maybe userspace knows this conn_id: fall through */
1363 case HVCALL_POST_MESSAGE:
1281 /* don't bother userspace if it has no way to handle it */ 1364 /* don't bother userspace if it has no way to handle it */
1282 if (!vcpu_to_synic(vcpu)->active) { 1365 if (!vcpu_to_synic(vcpu)->active) {
1283 res = HV_STATUS_INVALID_HYPERCALL_CODE; 1366 ret = HV_STATUS_INVALID_HYPERCALL_CODE;
1284 break; 1367 break;
1285 } 1368 }
1286 vcpu->run->exit_reason = KVM_EXIT_HYPERV; 1369 vcpu->run->exit_reason = KVM_EXIT_HYPERV;
@@ -1292,12 +1375,79 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
1292 kvm_hv_hypercall_complete_userspace; 1375 kvm_hv_hypercall_complete_userspace;
1293 return 0; 1376 return 0;
1294 default: 1377 default:
1295 res = HV_STATUS_INVALID_HYPERCALL_CODE; 1378 ret = HV_STATUS_INVALID_HYPERCALL_CODE;
1296 break; 1379 break;
1297 } 1380 }
1298 1381
1299set_result: 1382set_result:
1300 ret = res | (((u64)rep_done & 0xfff) << 32);
1301 kvm_hv_hypercall_set_result(vcpu, ret); 1383 kvm_hv_hypercall_set_result(vcpu, ret);
1302 return 1; 1384 return 1;
1303} 1385}
1386
1387void kvm_hv_init_vm(struct kvm *kvm)
1388{
1389 mutex_init(&kvm->arch.hyperv.hv_lock);
1390 idr_init(&kvm->arch.hyperv.conn_to_evt);
1391}
1392
1393void kvm_hv_destroy_vm(struct kvm *kvm)
1394{
1395 struct eventfd_ctx *eventfd;
1396 int i;
1397
1398 idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
1399 eventfd_ctx_put(eventfd);
1400 idr_destroy(&kvm->arch.hyperv.conn_to_evt);
1401}
1402
1403static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
1404{
1405 struct kvm_hv *hv = &kvm->arch.hyperv;
1406 struct eventfd_ctx *eventfd;
1407 int ret;
1408
1409 eventfd = eventfd_ctx_fdget(fd);
1410 if (IS_ERR(eventfd))
1411 return PTR_ERR(eventfd);
1412
1413 mutex_lock(&hv->hv_lock);
1414 ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
1415 GFP_KERNEL);
1416 mutex_unlock(&hv->hv_lock);
1417
1418 if (ret >= 0)
1419 return 0;
1420
1421 if (ret == -ENOSPC)
1422 ret = -EEXIST;
1423 eventfd_ctx_put(eventfd);
1424 return ret;
1425}
1426
1427static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
1428{
1429 struct kvm_hv *hv = &kvm->arch.hyperv;
1430 struct eventfd_ctx *eventfd;
1431
1432 mutex_lock(&hv->hv_lock);
1433 eventfd = idr_remove(&hv->conn_to_evt, conn_id);
1434 mutex_unlock(&hv->hv_lock);
1435
1436 if (!eventfd)
1437 return -ENOENT;
1438
1439 synchronize_srcu(&kvm->srcu);
1440 eventfd_ctx_put(eventfd);
1441 return 0;
1442}
1443
1444int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
1445{
1446 if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
1447 (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
1448 return -EINVAL;
1449
1450 if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
1451 return kvm_hv_eventfd_deassign(kvm, args->conn_id);
1452 return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
1453}
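
kvm_vm_ioctl_hv_eventfd() backs the new KVM_HYPERV_EVENTFD VM ioctl, which binds a Hyper-V connection id to an eventfd so HVCALL_SIGNAL_EVENT can complete in the kernel instead of exiting to userspace. A hedged userspace sketch, with struct and ioctl names as introduced by this series and error handling trimmed:

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Route HVCALL_SIGNAL_EVENT for conn_id to an eventfd we can poll on. */
static int hv_bind_event(int vm_fd, uint32_t conn_id)
{
	struct kvm_hyperv_eventfd args = {
		.conn_id = conn_id,
		.fd	 = eventfd(0, EFD_CLOEXEC),
	};

	if (args.fd < 0 || ioctl(vm_fd, KVM_HYPERV_EVENTFD, &args) < 0)
		return -1;
	return args.fd;		/* read()/poll() this to observe signals */
}

/* Undo the binding; the fd field is ignored on deassign. */
static int hv_unbind_event(int vm_fd, uint32_t conn_id)
{
	struct kvm_hyperv_eventfd args = {
		.conn_id = conn_id,
		.flags	 = KVM_HYPERV_EVENTFD_DEASSIGN,
	};

	return ioctl(vm_fd, KVM_HYPERV_EVENTFD, &args);
}
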
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e637631a9574..837465d69c6d 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -88,4 +88,8 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
88void kvm_hv_setup_tsc_page(struct kvm *kvm, 88void kvm_hv_setup_tsc_page(struct kvm *kvm,
89 struct pvclock_vcpu_time_info *hv_clock); 89 struct pvclock_vcpu_time_info *hv_clock);
90 90
91void kvm_hv_init_vm(struct kvm *kvm);
92void kvm_hv_destroy_vm(struct kvm *kvm);
93int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
94
91#endif 95#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f171051eecf3..faa264822cee 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -73,8 +73,19 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
73 */ 73 */
74int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) 74int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
75{ 75{
76 /*
77 * FIXME: interrupt.injected represents an interrupt whose
78 * side-effects have already been applied (e.g. the bit from IRR
79 * has already been moved to ISR). Therefore, it is incorrect to rely
80 * on interrupt.injected to know if there is a pending
81 * interrupt in the user-mode LAPIC.
82 * This leads to nVMX/nSVM not being able to distinguish
83 * whether it should exit from L2 to L1 on EXTERNAL_INTERRUPT for a
84 * pending interrupt or should re-inject an injected
85 * interrupt.
86 */
76 if (!lapic_in_kernel(v)) 87 if (!lapic_in_kernel(v))
77 return v->arch.interrupt.pending; 88 return v->arch.interrupt.injected;
78 89
79 if (kvm_cpu_has_extint(v)) 90 if (kvm_cpu_has_extint(v))
80 return 1; 91 return 1;
@@ -91,8 +102,19 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
91 */ 102 */
92int kvm_cpu_has_interrupt(struct kvm_vcpu *v) 103int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
93{ 104{
105 /*
106 * FIXME: interrupt.injected represents an interrupt whose
107 * side-effects have already been applied (e.g. the bit from IRR
108 * has already been moved to ISR). Therefore, it is incorrect to rely
109 * on interrupt.injected to know if there is a pending
110 * interrupt in the user-mode LAPIC.
111 * This leads to nVMX/nSVM not being able to distinguish
112 * whether it should exit from L2 to L1 on EXTERNAL_INTERRUPT for a
113 * pending interrupt or should re-inject an injected
114 * interrupt.
115 */
94 if (!lapic_in_kernel(v)) 116 if (!lapic_in_kernel(v))
95 return v->arch.interrupt.pending; 117 return v->arch.interrupt.injected;
96 118
97 if (kvm_cpu_has_extint(v)) 119 if (kvm_cpu_has_extint(v))
98 return 1; 120 return 1;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index f500293dad8d..9619dcc2b325 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -41,7 +41,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
41 41
42 if (!test_bit(VCPU_EXREG_PDPTR, 42 if (!test_bit(VCPU_EXREG_PDPTR,
43 (unsigned long *)&vcpu->arch.regs_avail)) 43 (unsigned long *)&vcpu->arch.regs_avail))
44 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 44 kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
45 45
46 return vcpu->arch.walk_mmu->pdptrs[index]; 46 return vcpu->arch.walk_mmu->pdptrs[index];
47} 47}
@@ -93,6 +93,11 @@ static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
93static inline void leave_guest_mode(struct kvm_vcpu *vcpu) 93static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
94{ 94{
95 vcpu->arch.hflags &= ~HF_GUEST_MASK; 95 vcpu->arch.hflags &= ~HF_GUEST_MASK;
96
97 if (vcpu->arch.load_eoi_exitmap_pending) {
98 vcpu->arch.load_eoi_exitmap_pending = false;
99 kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
100 }
96} 101}
97 102
98static inline bool is_guest_mode(struct kvm_vcpu *vcpu) 103static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 391dda8d43b7..70dcb5548022 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -321,8 +321,16 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
321 if (!lapic_in_kernel(vcpu)) 321 if (!lapic_in_kernel(vcpu))
322 return; 322 return;
323 323
324 /*
325 * KVM's in-kernel IOAPIC emulates the 82093AA datasheet, which has no
326 * EOI register. Some buggy OSes (e.g. Windows with the Hyper-V role)
327 * disable EOI broadcast in the LAPIC without checking the IOAPIC
328 * version first, so level-triggered interrupts would never get EOIed
329 * in the IOAPIC.
330 */
324 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); 331 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
325 if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) 332 if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
333 !ioapic_in_kernel(vcpu->kvm))
326 v |= APIC_LVR_DIRECTED_EOI; 334 v |= APIC_LVR_DIRECTED_EOI;
327 kvm_lapic_set_reg(apic, APIC_LVR, v); 335 kvm_lapic_set_reg(apic, APIC_LVR, v);
328} 336}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 56c36014f7b7..edce055e9fd7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -109,7 +109,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
109 109
110static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) 110static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
111{ 111{
112 return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; 112 return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
113} 113}
114 114
115int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); 115int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 763bb3bade63..8494dbae41b9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3031,7 +3031,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3031 return RET_PF_RETRY; 3031 return RET_PF_RETRY;
3032 } 3032 }
3033 3033
3034 return RET_PF_EMULATE; 3034 return -EFAULT;
3035} 3035}
3036 3036
3037static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 3037static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5abae72266b7..6288e9d7068e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -452,14 +452,21 @@ error:
452 * done by is_rsvd_bits_set() above. 452 * done by is_rsvd_bits_set() above.
453 * 453 *
454 * We set up the value of exit_qualification to inject: 454 * We set up the value of exit_qualification to inject:
455 * [2:0] - Derive from [2:0] of real exit_qualification at EPT violation 455 * [2:0] - Derive from the access bits. The exit_qualification might be
456 * out of date if it is serving an EPT misconfiguration.
456 * [5:3] - Calculated by the page walk of the guest EPT page tables 457 * [5:3] - Calculated by the page walk of the guest EPT page tables
457 * [7:8] - Derived from [7:8] of real exit_qualification 458 * [7:8] - Derived from [7:8] of real exit_qualification
458 * 459 *
459 * The other bits are set to 0. 460 * The other bits are set to 0.
460 */ 461 */
461 if (!(errcode & PFERR_RSVD_MASK)) { 462 if (!(errcode & PFERR_RSVD_MASK)) {
462 vcpu->arch.exit_qualification &= 0x187; 463 vcpu->arch.exit_qualification &= 0x180;
464 if (write_fault)
465 vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
466 if (user_fault)
467 vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
468 if (fetch_fault)
469 vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
463 vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3; 470 vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
464 } 471 }
465#endif 472#endif
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 026db42a86c3..58ead7db71a3 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -244,12 +244,49 @@ int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
244 return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); 244 return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
245} 245}
246 246
247bool is_vmware_backdoor_pmc(u32 pmc_idx)
248{
249 switch (pmc_idx) {
250 case VMWARE_BACKDOOR_PMC_HOST_TSC:
251 case VMWARE_BACKDOOR_PMC_REAL_TIME:
252 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
253 return true;
254 }
255 return false;
256}
257
258static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
259{
260 u64 ctr_val;
261
262 switch (idx) {
263 case VMWARE_BACKDOOR_PMC_HOST_TSC:
264 ctr_val = rdtsc();
265 break;
266 case VMWARE_BACKDOOR_PMC_REAL_TIME:
267 ctr_val = ktime_get_boot_ns();
268 break;
269 case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
270 ctr_val = ktime_get_boot_ns() +
271 vcpu->kvm->arch.kvmclock_offset;
272 break;
273 default:
274 return 1;
275 }
276
277 *data = ctr_val;
278 return 0;
279}
280
247int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) 281int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
248{ 282{
249 bool fast_mode = idx & (1u << 31); 283 bool fast_mode = idx & (1u << 31);
250 struct kvm_pmc *pmc; 284 struct kvm_pmc *pmc;
251 u64 ctr_val; 285 u64 ctr_val;
252 286
287 if (is_vmware_backdoor_pmc(idx))
288 return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
289
253 pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); 290 pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
254 if (!pmc) 291 if (!pmc)
255 return 1; 292 return 1;
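
With the hunk above, RDPMC on the three VMware pseudo-counter indices skips the normal PMC lookup and returns host TSC and boot-time values. A guest-side illustration; it only yields these values when KVM is loaded with enable_vmware_backdoor, and the ring-3/CR4.PCE=0 case relies on the check_rdpmc relaxation shown in the emulate.c hunk earlier:

#include <stdint.h>
#include <stdio.h>

#define VMWARE_BACKDOOR_PMC_HOST_TSC      0x10000u
#define VMWARE_BACKDOOR_PMC_REAL_TIME     0x10001u
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002u

static uint64_t rdpmc64(uint32_t idx)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	printf("host tsc:     %llu\n",
	       (unsigned long long)rdpmc64(VMWARE_BACKDOOR_PMC_HOST_TSC));
	printf("boot time ns: %llu\n",
	       (unsigned long long)rdpmc64(VMWARE_BACKDOOR_PMC_REAL_TIME));
	printf("apparent ns:  %llu\n",
	       (unsigned long long)rdpmc64(VMWARE_BACKDOOR_PMC_APPARENT_TIME));
	return 0;
}
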
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a9a62b9a73e2..ba8898e1a854 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -9,6 +9,10 @@
9/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ 9/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
10#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) 10#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
11 11
12#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
13#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
14#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
15
12struct kvm_event_hw_type_mapping { 16struct kvm_event_hw_type_mapping {
13 u8 eventsel; 17 u8 eventsel;
14 u8 unit_mask; 18 u8 unit_mask;
@@ -114,6 +118,8 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu);
114void kvm_pmu_init(struct kvm_vcpu *vcpu); 118void kvm_pmu_init(struct kvm_vcpu *vcpu);
115void kvm_pmu_destroy(struct kvm_vcpu *vcpu); 119void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
116 120
121bool is_vmware_backdoor_pmc(u32 pmc_idx);
122
117extern struct kvm_pmu_ops intel_pmu_ops; 123extern struct kvm_pmu_ops intel_pmu_ops;
118extern struct kvm_pmu_ops amd_pmu_ops; 124extern struct kvm_pmu_ops amd_pmu_ops;
119#endif /* __KVM_X86_PMU_H */ 125#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index cd944435dfbd..1495a735b38e 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -19,6 +19,21 @@
19#include "lapic.h" 19#include "lapic.h"
20#include "pmu.h" 20#include "pmu.h"
21 21
22enum pmu_type {
23 PMU_TYPE_COUNTER = 0,
24 PMU_TYPE_EVNTSEL,
25};
26
27enum index {
28 INDEX_ZERO = 0,
29 INDEX_ONE,
30 INDEX_TWO,
31 INDEX_THREE,
32 INDEX_FOUR,
33 INDEX_FIVE,
34 INDEX_ERROR,
35};
36
22/* duplicated from amd_perfmon_event_map, K7 and above should work. */ 37/* duplicated from amd_perfmon_event_map, K7 and above should work. */
23static struct kvm_event_hw_type_mapping amd_event_mapping[] = { 38static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
24 [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, 39 [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -31,6 +46,88 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
31 [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 46 [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
32}; 47};
33 48
49static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
50{
51 struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
52
53 if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
54 if (type == PMU_TYPE_COUNTER)
55 return MSR_F15H_PERF_CTR;
56 else
57 return MSR_F15H_PERF_CTL;
58 } else {
59 if (type == PMU_TYPE_COUNTER)
60 return MSR_K7_PERFCTR0;
61 else
62 return MSR_K7_EVNTSEL0;
63 }
64}
65
66static enum index msr_to_index(u32 msr)
67{
68 switch (msr) {
69 case MSR_F15H_PERF_CTL0:
70 case MSR_F15H_PERF_CTR0:
71 case MSR_K7_EVNTSEL0:
72 case MSR_K7_PERFCTR0:
73 return INDEX_ZERO;
74 case MSR_F15H_PERF_CTL1:
75 case MSR_F15H_PERF_CTR1:
76 case MSR_K7_EVNTSEL1:
77 case MSR_K7_PERFCTR1:
78 return INDEX_ONE;
79 case MSR_F15H_PERF_CTL2:
80 case MSR_F15H_PERF_CTR2:
81 case MSR_K7_EVNTSEL2:
82 case MSR_K7_PERFCTR2:
83 return INDEX_TWO;
84 case MSR_F15H_PERF_CTL3:
85 case MSR_F15H_PERF_CTR3:
86 case MSR_K7_EVNTSEL3:
87 case MSR_K7_PERFCTR3:
88 return INDEX_THREE;
89 case MSR_F15H_PERF_CTL4:
90 case MSR_F15H_PERF_CTR4:
91 return INDEX_FOUR;
92 case MSR_F15H_PERF_CTL5:
93 case MSR_F15H_PERF_CTR5:
94 return INDEX_FIVE;
95 default:
96 return INDEX_ERROR;
97 }
98}
99
100static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
101 enum pmu_type type)
102{
103 switch (msr) {
104 case MSR_F15H_PERF_CTL0:
105 case MSR_F15H_PERF_CTL1:
106 case MSR_F15H_PERF_CTL2:
107 case MSR_F15H_PERF_CTL3:
108 case MSR_F15H_PERF_CTL4:
109 case MSR_F15H_PERF_CTL5:
110 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
111 if (type != PMU_TYPE_EVNTSEL)
112 return NULL;
113 break;
114 case MSR_F15H_PERF_CTR0:
115 case MSR_F15H_PERF_CTR1:
116 case MSR_F15H_PERF_CTR2:
117 case MSR_F15H_PERF_CTR3:
118 case MSR_F15H_PERF_CTR4:
119 case MSR_F15H_PERF_CTR5:
120 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
121 if (type != PMU_TYPE_COUNTER)
122 return NULL;
123 break;
124 default:
125 return NULL;
126 }
127
128 return &pmu->gp_counters[msr_to_index(msr)];
129}
130
34static unsigned amd_find_arch_event(struct kvm_pmu *pmu, 131static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
35 u8 event_select, 132 u8 event_select,
36 u8 unit_mask) 133 u8 unit_mask)
@@ -64,7 +161,18 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
64 161
65static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) 162static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
66{ 163{
67 return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0); 164 unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER);
165 struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
166
167 if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
168 /*
169 * The idx is contiguous. The MSRs are not. The counter MSRs
170 * are interleaved with the event select MSRs.
171 */
172 pmc_idx *= 2;
173 }
174
175 return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
68} 176}
69 177
70/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ 178/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
@@ -96,8 +204,8 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
96 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 204 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
97 int ret = false; 205 int ret = false;
98 206
99 ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) || 207 ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
100 get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); 208 get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
101 209
102 return ret; 210 return ret;
103} 211}
@@ -107,14 +215,14 @@ static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
107 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 215 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
108 struct kvm_pmc *pmc; 216 struct kvm_pmc *pmc;
109 217
110 /* MSR_K7_PERFCTRn */ 218 /* MSR_PERFCTRn */
111 pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); 219 pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
112 if (pmc) { 220 if (pmc) {
113 *data = pmc_read_counter(pmc); 221 *data = pmc_read_counter(pmc);
114 return 0; 222 return 0;
115 } 223 }
116 /* MSR_K7_EVNTSELn */ 224 /* MSR_EVNTSELn */
117 pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); 225 pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
118 if (pmc) { 226 if (pmc) {
119 *data = pmc->eventsel; 227 *data = pmc->eventsel;
120 return 0; 228 return 0;
@@ -130,14 +238,14 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
130 u32 msr = msr_info->index; 238 u32 msr = msr_info->index;
131 u64 data = msr_info->data; 239 u64 data = msr_info->data;
132 240
133 /* MSR_K7_PERFCTRn */ 241 /* MSR_PERFCTRn */
134 pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); 242 pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
135 if (pmc) { 243 if (pmc) {
136 pmc->counter += data - pmc_read_counter(pmc); 244 pmc->counter += data - pmc_read_counter(pmc);
137 return 0; 245 return 0;
138 } 246 }
139 /* MSR_K7_EVNTSELn */ 247 /* MSR_EVNTSELn */
140 pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); 248 pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
141 if (pmc) { 249 if (pmc) {
142 if (data == pmc->eventsel) 250 if (data == pmc->eventsel)
143 return 0; 251 return 0;
@@ -154,7 +262,11 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
154{ 262{
155 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 263 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
156 264
157 pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; 265 if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
266 pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
267 else
268 pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
269
158 pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; 270 pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
159 pmu->reserved_bits = 0xffffffff00200000ull; 271 pmu->reserved_bits = 0xffffffff00200000ull;
160 /* not applicable to AMD; but clean them to prevent any fall out */ 272 /* not applicable to AMD; but clean them to prevent any fall out */
@@ -169,7 +281,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
169 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 281 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
170 int i; 282 int i;
171 283
172 for (i = 0; i < AMD64_NUM_COUNTERS ; i++) { 284 BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
285
286 for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
173 pmu->gp_counters[i].type = KVM_PMC_GP; 287 pmu->gp_counters[i].type = KVM_PMC_GP;
174 pmu->gp_counters[i].vcpu = vcpu; 288 pmu->gp_counters[i].vcpu = vcpu;
175 pmu->gp_counters[i].idx = i; 289 pmu->gp_counters[i].idx = i;
@@ -181,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
181 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 295 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
182 int i; 296 int i;
183 297
184 for (i = 0; i < AMD64_NUM_COUNTERS; i++) { 298 for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
185 struct kvm_pmc *pmc = &pmu->gp_counters[i]; 299 struct kvm_pmc *pmc = &pmu->gp_counters[i];
186 300
187 pmc_stop_counter(pmc); 301 pmc_stop_counter(pmc);
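
The pmu_amd.c rework exists because the PerfCtrExtCore MSRs interleave event selects and counters (CTL0, CTR0, CTL1, CTR1, ...), unlike the legacy K7 layout where the four event selects and four counters form two separate contiguous blocks. A small standalone illustration of the index-to-MSR mapping mirrored by get_msr_base()/amd_pmc_idx_to_pmc(); the MSR addresses below are written from memory and should be checked against msr-index.h:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_K7_EVNTSEL0   0xc0010000u   /* ..3: event selects, contiguous */
#define MSR_K7_PERFCTR0   0xc0010004u   /* ..7: counters, contiguous      */
#define MSR_F15H_PERF_CTL 0xc0010200u   /* CTL0, CTR0, CTL1, CTR1, ...    */
#define MSR_F15H_PERF_CTR 0xc0010201u

/* Map a contiguous PMC index to its (event-select, counter) MSR pair. */
static void pmc_idx_to_msrs(unsigned int idx, bool perfctr_core,
			    uint32_t *evtsel, uint32_t *ctr)
{
	if (perfctr_core) {
		/* Interleaved layout: each counter occupies two MSR slots. */
		*evtsel = MSR_F15H_PERF_CTL + 2 * idx;
		*ctr    = MSR_F15H_PERF_CTR + 2 * idx;
	} else {
		*evtsel = MSR_K7_EVNTSEL0 + idx;
		*ctr    = MSR_K7_PERFCTR0 + idx;
	}
}

int main(void)
{
	uint32_t sel, ctr;

	pmc_idx_to_msrs(2, true, &sel, &ctr);
	printf("core PMC2: evntsel 0x%x, counter 0x%x\n", sel, ctr);
	return 0;
}
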
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9d2043f94e29..b58787daf9f8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -131,6 +131,28 @@ static const u32 host_save_user_msrs[] = {
131 131
132#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) 132#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
133 133
134struct kvm_sev_info {
135 bool active; /* SEV enabled guest */
136 unsigned int asid; /* ASID used for this guest */
137 unsigned int handle; /* SEV firmware handle */
138 int fd; /* SEV device fd */
139 unsigned long pages_locked; /* Number of pages locked */
140 struct list_head regions_list; /* List of registered regions */
141};
142
143struct kvm_svm {
144 struct kvm kvm;
145
146 /* Struct members for AVIC */
147 u32 avic_vm_id;
148 u32 ldr_mode;
149 struct page *avic_logical_id_table_page;
150 struct page *avic_physical_id_table_page;
151 struct hlist_node hnode;
152
153 struct kvm_sev_info sev_info;
154};
155
134struct kvm_vcpu; 156struct kvm_vcpu;
135 157
136struct nested_state { 158struct nested_state {
@@ -276,6 +298,54 @@ static bool npt_enabled = true;
276static bool npt_enabled; 298static bool npt_enabled;
277#endif 299#endif
278 300
301/*
302 * These two parameters configure the controls for Pause-Loop Exiting:
303 * pause_filter_count: On processors that support Pause filtering (indicated
304 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
305 * count value. On VMRUN this value is loaded into an internal counter.
306 * Each time a pause instruction is executed, this counter is decremented
307 * until it reaches zero, at which time a #VMEXIT is generated if pause
308 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
309 * Intercept Filtering for more details.
310 * This also indicates whether PLE logic is enabled.
311 *
312 * pause_filter_thresh: In addition, some processor families support advanced
313 * pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an upper
314 * bound on the amount of time a guest is allowed to execute in a pause loop.
315 * In this mode, a 16-bit pause filter threshold field is added in the
316 * VMCB. The threshold value is a cycle count that is used to reset the
317 * pause counter. As with simple pause filtering, VMRUN loads the pause
318 * count value from VMCB into an internal counter. Then, on each pause
319 * instruction the hardware checks the elapsed number of cycles since
320 * the most recent pause instruction against the pause filter threshold.
321 * If the elapsed cycle count is greater than the pause filter threshold,
322 * then the internal pause count is reloaded from the VMCB and execution
323 * continues. If the elapsed cycle count is less than the pause filter
324 * threshold, then the internal pause count is decremented. If the count
325 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
326 * triggered. If advanced pause filtering is supported and the pause filter
327 * threshold field is set to zero, the filter will operate in the simpler,
328 * count-only mode.
329 */
330
331static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
332module_param(pause_filter_thresh, ushort, 0444);
333
334static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
335module_param(pause_filter_count, ushort, 0444);
336
337/* Default doubles per-vcpu window every exit. */
338static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
339module_param(pause_filter_count_grow, ushort, 0444);
340
341/* Default resets per-vcpu window every exit to pause_filter_count. */
342static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
343module_param(pause_filter_count_shrink, ushort, 0444);
344
345/* Default is to compute the maximum so we can never overflow. */
346static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
347module_param(pause_filter_count_max, ushort, 0444);
348
279/* allow nested paging (virtualized MMU) for all guests */ 349/* allow nested paging (virtualized MMU) for all guests */
280static int npt = true; 350static int npt = true;
281module_param(npt, int, S_IRUGO); 351module_param(npt, int, S_IRUGO);
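
The five module parameters above give SVM the same tunable pause-loop-exiting policy the VMX side already has; grow_ple_window()/shrink_ple_window() later in this file apply them per vCPU. A simplified, hypothetical illustration of the clamped grow/shrink arithmetic: it mirrors the behaviour described by the comments (doubling on growth, resetting to the base count when the shrink modifier is 0), but it is not the kernel's __grow_ple_window/__shrink_ple_window helper, and the 3000/65535 defaults are assumptions:

#include <stdio.h>

/* Illustrative clamp-on-grow / clamp-on-shrink policy for the PLE window. */
static unsigned int ple_grow(unsigned int val, unsigned int grow,
			     unsigned int max)
{
	unsigned long long next = (unsigned long long)val * grow;

	return next > max ? max : (unsigned int)next;
}

static unsigned int ple_shrink(unsigned int val, unsigned int shrink,
			       unsigned int base)
{
	unsigned int next = shrink ? val / shrink : base;

	return next < base ? base : next;
}

int main(void)
{
	unsigned int count = 3000;          /* assumed default window      */

	count = ple_grow(count, 2, 65535);  /* PAUSE-heavy exit: back off  */
	count = ple_shrink(count, 0, 3000); /* shrink of 0: reset to base  */
	printf("pause_filter_count = %u\n", count);
	return 0;
}
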
@@ -352,6 +422,12 @@ struct enc_region {
352 unsigned long size; 422 unsigned long size;
353}; 423};
354 424
425
426static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
427{
428 return container_of(kvm, struct kvm_svm, kvm);
429}
430
355static inline bool svm_sev_enabled(void) 431static inline bool svm_sev_enabled(void)
356{ 432{
357 return max_sev_asid; 433 return max_sev_asid;
@@ -359,14 +435,14 @@ static inline bool svm_sev_enabled(void)
359 435
360static inline bool sev_guest(struct kvm *kvm) 436static inline bool sev_guest(struct kvm *kvm)
361{ 437{
362 struct kvm_sev_info *sev = &kvm->arch.sev_info; 438 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
363 439
364 return sev->active; 440 return sev->active;
365} 441}
366 442
367static inline int sev_get_asid(struct kvm *kvm) 443static inline int sev_get_asid(struct kvm *kvm)
368{ 444{
369 struct kvm_sev_info *sev = &kvm->arch.sev_info; 445 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
370 446
371 return sev->asid; 447 return sev->asid;
372} 448}
@@ -1083,7 +1159,7 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
1083} 1159}
1084 1160
1085/* Note: 1161/* Note:
1086 * This hash table is used to map VM_ID to a struct kvm_arch, 1162 * This hash table is used to map VM_ID to a struct kvm_svm,
1087 * when handling AMD IOMMU GALOG notification to schedule in 1163 * when handling AMD IOMMU GALOG notification to schedule in
1088 * a particular vCPU. 1164 * a particular vCPU.
1089 */ 1165 */
@@ -1100,7 +1176,7 @@ static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
1100static int avic_ga_log_notifier(u32 ga_tag) 1176static int avic_ga_log_notifier(u32 ga_tag)
1101{ 1177{
1102 unsigned long flags; 1178 unsigned long flags;
1103 struct kvm_arch *ka = NULL; 1179 struct kvm_svm *kvm_svm;
1104 struct kvm_vcpu *vcpu = NULL; 1180 struct kvm_vcpu *vcpu = NULL;
1105 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); 1181 u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
1106 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); 1182 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
@@ -1108,13 +1184,10 @@ static int avic_ga_log_notifier(u32 ga_tag)
1108 pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); 1184 pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
1109 1185
1110 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1186 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1111 hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { 1187 hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
1112 struct kvm *kvm = container_of(ka, struct kvm, arch); 1188 if (kvm_svm->avic_vm_id != vm_id)
1113 struct kvm_arch *vm_data = &kvm->arch;
1114
1115 if (vm_data->avic_vm_id != vm_id)
1116 continue; 1189 continue;
1117 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); 1190 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
1118 break; 1191 break;
1119 } 1192 }
1120 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1193 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -1172,6 +1245,42 @@ err:
1172 return rc; 1245 return rc;
1173} 1246}
1174 1247
1248static void grow_ple_window(struct kvm_vcpu *vcpu)
1249{
1250 struct vcpu_svm *svm = to_svm(vcpu);
1251 struct vmcb_control_area *control = &svm->vmcb->control;
1252 int old = control->pause_filter_count;
1253
1254 control->pause_filter_count = __grow_ple_window(old,
1255 pause_filter_count,
1256 pause_filter_count_grow,
1257 pause_filter_count_max);
1258
1259 if (control->pause_filter_count != old)
1260 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1261
1262 trace_kvm_ple_window_grow(vcpu->vcpu_id,
1263 control->pause_filter_count, old);
1264}
1265
1266static void shrink_ple_window(struct kvm_vcpu *vcpu)
1267{
1268 struct vcpu_svm *svm = to_svm(vcpu);
1269 struct vmcb_control_area *control = &svm->vmcb->control;
1270 int old = control->pause_filter_count;
1271
1272 control->pause_filter_count =
1273 __shrink_ple_window(old,
1274 pause_filter_count,
1275 pause_filter_count_shrink,
1276 pause_filter_count);
1277 if (control->pause_filter_count != old)
1278 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1279
1280 trace_kvm_ple_window_shrink(vcpu->vcpu_id,
1281 control->pause_filter_count, old);
1282}
1283
1175static __init int svm_hardware_setup(void) 1284static __init int svm_hardware_setup(void)
1176{ 1285{
1177 int cpu; 1286 int cpu;
@@ -1202,6 +1311,14 @@ static __init int svm_hardware_setup(void)
1202 kvm_tsc_scaling_ratio_frac_bits = 32; 1311 kvm_tsc_scaling_ratio_frac_bits = 32;
1203 } 1312 }
1204 1313
1314 /* Check for pause filtering support */
1315 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1316 pause_filter_count = 0;
1317 pause_filter_thresh = 0;
1318 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
1319 pause_filter_thresh = 0;
1320 }
1321
1205 if (nested) { 1322 if (nested) {
1206 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 1323 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
1207 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 1324 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -1328,10 +1445,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1328static void avic_init_vmcb(struct vcpu_svm *svm) 1445static void avic_init_vmcb(struct vcpu_svm *svm)
1329{ 1446{
1330 struct vmcb *vmcb = svm->vmcb; 1447 struct vmcb *vmcb = svm->vmcb;
1331 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; 1448 struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
1332 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 1449 phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
1333 phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); 1450 phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
1334 phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); 1451 phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
1335 1452
1336 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1453 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1337 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; 1454 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1363,6 +1480,14 @@ static void init_vmcb(struct vcpu_svm *svm)
1363 set_exception_intercept(svm, MC_VECTOR); 1480 set_exception_intercept(svm, MC_VECTOR);
1364 set_exception_intercept(svm, AC_VECTOR); 1481 set_exception_intercept(svm, AC_VECTOR);
1365 set_exception_intercept(svm, DB_VECTOR); 1482 set_exception_intercept(svm, DB_VECTOR);
1483 /*
1484 * Guest access to VMware backdoor ports could legitimately
1485 * trigger #GP because of the TSS I/O permission bitmap.
1486 * We intercept those #GPs and allow access to the ports anyway,
1487 * as VMware does.
1488 */
1489 if (enable_vmware_backdoor)
1490 set_exception_intercept(svm, GP_VECTOR);
1366 1491
1367 set_intercept(svm, INTERCEPT_INTR); 1492 set_intercept(svm, INTERCEPT_INTR);
1368 set_intercept(svm, INTERCEPT_NMI); 1493 set_intercept(svm, INTERCEPT_NMI);
@@ -1371,7 +1496,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1371 set_intercept(svm, INTERCEPT_RDPMC); 1496 set_intercept(svm, INTERCEPT_RDPMC);
1372 set_intercept(svm, INTERCEPT_CPUID); 1497 set_intercept(svm, INTERCEPT_CPUID);
1373 set_intercept(svm, INTERCEPT_INVD); 1498 set_intercept(svm, INTERCEPT_INVD);
1374 set_intercept(svm, INTERCEPT_HLT);
1375 set_intercept(svm, INTERCEPT_INVLPG); 1499 set_intercept(svm, INTERCEPT_INVLPG);
1376 set_intercept(svm, INTERCEPT_INVLPGA); 1500 set_intercept(svm, INTERCEPT_INVLPGA);
1377 set_intercept(svm, INTERCEPT_IOIO_PROT); 1501 set_intercept(svm, INTERCEPT_IOIO_PROT);
@@ -1389,11 +1513,14 @@ static void init_vmcb(struct vcpu_svm *svm)
1389 set_intercept(svm, INTERCEPT_XSETBV); 1513 set_intercept(svm, INTERCEPT_XSETBV);
1390 set_intercept(svm, INTERCEPT_RSM); 1514 set_intercept(svm, INTERCEPT_RSM);
1391 1515
1392 if (!kvm_mwait_in_guest()) { 1516 if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1393 set_intercept(svm, INTERCEPT_MONITOR); 1517 set_intercept(svm, INTERCEPT_MONITOR);
1394 set_intercept(svm, INTERCEPT_MWAIT); 1518 set_intercept(svm, INTERCEPT_MWAIT);
1395 } 1519 }
1396 1520
1521 if (!kvm_hlt_in_guest(svm->vcpu.kvm))
1522 set_intercept(svm, INTERCEPT_HLT);
1523
1397 control->iopm_base_pa = __sme_set(iopm_base); 1524 control->iopm_base_pa = __sme_set(iopm_base);
1398 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1525 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1399 control->int_ctl = V_INTR_MASKING_MASK; 1526 control->int_ctl = V_INTR_MASKING_MASK;
@@ -1449,9 +1576,13 @@ static void init_vmcb(struct vcpu_svm *svm)
1449 svm->nested.vmcb = 0; 1576 svm->nested.vmcb = 0;
1450 svm->vcpu.arch.hflags = 0; 1577 svm->vcpu.arch.hflags = 0;
1451 1578
1452 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 1579 if (pause_filter_count) {
1453 control->pause_filter_count = 3000; 1580 control->pause_filter_count = pause_filter_count;
1581 if (pause_filter_thresh)
1582 control->pause_filter_thresh = pause_filter_thresh;
1454 set_intercept(svm, INTERCEPT_PAUSE); 1583 set_intercept(svm, INTERCEPT_PAUSE);
1584 } else {
1585 clr_intercept(svm, INTERCEPT_PAUSE);
1455 } 1586 }
1456 1587
1457 if (kvm_vcpu_apicv_active(&svm->vcpu)) 1588 if (kvm_vcpu_apicv_active(&svm->vcpu))
@@ -1488,12 +1619,12 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
1488 unsigned int index) 1619 unsigned int index)
1489{ 1620{
1490 u64 *avic_physical_id_table; 1621 u64 *avic_physical_id_table;
1491 struct kvm_arch *vm_data = &vcpu->kvm->arch; 1622 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1492 1623
1493 if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) 1624 if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1494 return NULL; 1625 return NULL;
1495 1626
1496 avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page); 1627 avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
1497 1628
1498 return &avic_physical_id_table[index]; 1629 return &avic_physical_id_table[index];
1499} 1630}
@@ -1576,7 +1707,7 @@ static void __sev_asid_free(int asid)
1576 1707
1577static void sev_asid_free(struct kvm *kvm) 1708static void sev_asid_free(struct kvm *kvm)
1578{ 1709{
1579 struct kvm_sev_info *sev = &kvm->arch.sev_info; 1710 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1580 1711
1581 __sev_asid_free(sev->asid); 1712 __sev_asid_free(sev->asid);
1582} 1713}
@@ -1616,7 +1747,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
1616 unsigned long ulen, unsigned long *n, 1747 unsigned long ulen, unsigned long *n,
1617 int write) 1748 int write)
1618{ 1749{
1619 struct kvm_sev_info *sev = &kvm->arch.sev_info; 1750 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1620 unsigned long npages, npinned, size; 1751 unsigned long npages, npinned, size;
1621 unsigned long locked, lock_limit; 1752 unsigned long locked, lock_limit;
1622 struct page **pages; 1753 struct page **pages;
@@ -1667,7 +1798,7 @@ err:
1667static void sev_unpin_memory(struct kvm *kvm, struct page **pages, 1798static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
1668 unsigned long npages) 1799 unsigned long npages)
1669{ 1800{
1670 struct kvm_sev_info *sev = &kvm->arch.sev_info; 1801 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1671 1802
1672 release_pages(pages, npages); 1803 release_pages(pages, npages);
1673 kvfree(pages); 1804 kvfree(pages);
@@ -1705,9 +1836,20 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
1705 kfree(region); 1836 kfree(region);
1706} 1837}
1707 1838
1839static struct kvm *svm_vm_alloc(void)
1840{
1841 struct kvm_svm *kvm_svm = kzalloc(sizeof(struct kvm_svm), GFP_KERNEL);
1842 return &kvm_svm->kvm;
1843}
1844
1845static void svm_vm_free(struct kvm *kvm)
1846{
1847 kfree(to_kvm_svm(kvm));
1848}
1849
1708static void sev_vm_destroy(struct kvm *kvm) 1850static void sev_vm_destroy(struct kvm *kvm)
1709{ 1851{
1710 struct kvm_sev_info *sev = &kvm->arch.sev_info; 1852 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1711 struct list_head *head = &sev->regions_list; 1853 struct list_head *head = &sev->regions_list;
1712 struct list_head *pos, *q; 1854 struct list_head *pos, *q;
1713 1855
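
The svm_vm_alloc()/svm_vm_free() pair above embeds struct kvm inside a vendor-specific struct kvm_svm, so per-VM SVM state (AVIC tables, SEV info) no longer lives directly in kvm->arch. The to_kvm_svm() helper used throughout the rest of this hunk is presumably the usual container_of() idiom, analogous to the to_kvm_vmx() helper added on the VMX side later in this patch; an illustrative sketch, not quoted from the hunks:

static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
	/* Recover the enclosing SVM-specific VM structure from the
	 * embedded struct kvm pointer that core KVM hands around. */
	return container_of(kvm, struct kvm_svm, kvm);
}

Because the whole object was allocated as one struct kvm_svm, svm_vm_free() can release it with a single kfree() on the recovered outer pointer.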
@@ -1736,18 +1878,18 @@ static void sev_vm_destroy(struct kvm *kvm)
1736static void avic_vm_destroy(struct kvm *kvm) 1878static void avic_vm_destroy(struct kvm *kvm)
1737{ 1879{
1738 unsigned long flags; 1880 unsigned long flags;
1739 struct kvm_arch *vm_data = &kvm->arch; 1881 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1740 1882
1741 if (!avic) 1883 if (!avic)
1742 return; 1884 return;
1743 1885
1744 if (vm_data->avic_logical_id_table_page) 1886 if (kvm_svm->avic_logical_id_table_page)
1745 __free_page(vm_data->avic_logical_id_table_page); 1887 __free_page(kvm_svm->avic_logical_id_table_page);
1746 if (vm_data->avic_physical_id_table_page) 1888 if (kvm_svm->avic_physical_id_table_page)
1747 __free_page(vm_data->avic_physical_id_table_page); 1889 __free_page(kvm_svm->avic_physical_id_table_page);
1748 1890
1749 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1891 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1750 hash_del(&vm_data->hnode); 1892 hash_del(&kvm_svm->hnode);
1751 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1893 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1752} 1894}
1753 1895
@@ -1761,10 +1903,10 @@ static int avic_vm_init(struct kvm *kvm)
1761{ 1903{
1762 unsigned long flags; 1904 unsigned long flags;
1763 int err = -ENOMEM; 1905 int err = -ENOMEM;
1764 struct kvm_arch *vm_data = &kvm->arch; 1906 struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1907 struct kvm_svm *k2;
1765 struct page *p_page; 1908 struct page *p_page;
1766 struct page *l_page; 1909 struct page *l_page;
1767 struct kvm_arch *ka;
1768 u32 vm_id; 1910 u32 vm_id;
1769 1911
1770 if (!avic) 1912 if (!avic)
@@ -1775,7 +1917,7 @@ static int avic_vm_init(struct kvm *kvm)
1775 if (!p_page) 1917 if (!p_page)
1776 goto free_avic; 1918 goto free_avic;
1777 1919
1778 vm_data->avic_physical_id_table_page = p_page; 1920 kvm_svm->avic_physical_id_table_page = p_page;
1779 clear_page(page_address(p_page)); 1921 clear_page(page_address(p_page));
1780 1922
1781 /* Allocating logical APIC ID table (4KB) */ 1923 /* Allocating logical APIC ID table (4KB) */
@@ -1783,7 +1925,7 @@ static int avic_vm_init(struct kvm *kvm)
1783 if (!l_page) 1925 if (!l_page)
1784 goto free_avic; 1926 goto free_avic;
1785 1927
1786 vm_data->avic_logical_id_table_page = l_page; 1928 kvm_svm->avic_logical_id_table_page = l_page;
1787 clear_page(page_address(l_page)); 1929 clear_page(page_address(l_page));
1788 1930
1789 spin_lock_irqsave(&svm_vm_data_hash_lock, flags); 1931 spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
@@ -1795,15 +1937,13 @@ static int avic_vm_init(struct kvm *kvm)
1795 } 1937 }
1796 /* Is it still in use? Only possible if wrapped at least once */ 1938 /* Is it still in use? Only possible if wrapped at least once */
1797 if (next_vm_id_wrapped) { 1939 if (next_vm_id_wrapped) {
1798 hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { 1940 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
1799 struct kvm *k2 = container_of(ka, struct kvm, arch); 1941 if (k2->avic_vm_id == vm_id)
1800 struct kvm_arch *vd2 = &k2->arch;
1801 if (vd2->avic_vm_id == vm_id)
1802 goto again; 1942 goto again;
1803 } 1943 }
1804 } 1944 }
1805 vm_data->avic_vm_id = vm_id; 1945 kvm_svm->avic_vm_id = vm_id;
1806 hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); 1946 hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
1807 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); 1947 spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1808 1948
1809 return 0; 1949 return 0;
@@ -2535,14 +2675,7 @@ static int bp_interception(struct vcpu_svm *svm)
2535 2675
2536static int ud_interception(struct vcpu_svm *svm) 2676static int ud_interception(struct vcpu_svm *svm)
2537{ 2677{
2538 int er; 2678 return handle_ud(&svm->vcpu);
2539
2540 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
2541 if (er == EMULATE_USER_EXIT)
2542 return 0;
2543 if (er != EMULATE_DONE)
2544 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2545 return 1;
2546} 2679}
2547 2680
2548static int ac_interception(struct vcpu_svm *svm) 2681static int ac_interception(struct vcpu_svm *svm)
@@ -2551,6 +2684,23 @@ static int ac_interception(struct vcpu_svm *svm)
2551 return 1; 2684 return 1;
2552} 2685}
2553 2686
2687static int gp_interception(struct vcpu_svm *svm)
2688{
2689 struct kvm_vcpu *vcpu = &svm->vcpu;
2690 u32 error_code = svm->vmcb->control.exit_info_1;
2691 int er;
2692
2693 WARN_ON_ONCE(!enable_vmware_backdoor);
2694
2695 er = emulate_instruction(vcpu,
2696 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
2697 if (er == EMULATE_USER_EXIT)
2698 return 0;
2699 else if (er != EMULATE_DONE)
2700 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2701 return 1;
2702}
2703
2554static bool is_erratum_383(void) 2704static bool is_erratum_383(void)
2555{ 2705{
2556 int err, i; 2706 int err, i;
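
The gp_interception() handler above exists because, with enable_vmware_backdoor, KVM lets guests reach the VMware I/O backdoor even from CPL 3, where the TSS I/O permission bitmap would otherwise raise #GP. A minimal guest-side sketch of such an access follows; the port 0x5658 and "VMXh" magic are the conventional VMware backdoor values, assumed here for illustration rather than taken from the quoted hunks:

#include <stdio.h>
#include <stdint.h>

#define VMW_PORT		0x5658		/* conventional backdoor I/O port */
#define VMW_MAGIC		0x564D5868u	/* "VMXh" */
#define VMW_CMD_GETVERSION	10u

int main(void)
{
	uint32_t eax = VMW_MAGIC, ebx = ~VMW_MAGIC;
	uint32_t ecx = VMW_CMD_GETVERSION, edx = VMW_PORT;

	/*
	 * From CPL 3 this IN would normally take #GP because the port is not
	 * enabled in the TSS I/O bitmap; with the intercept added above, KVM
	 * emulates the access instead of injecting the fault into the guest.
	 */
	asm volatile("inl %%dx, %%eax"
		     : "+a"(eax), "+b"(ebx), "+c"(ecx), "+d"(edx));

	printf("backdoor %s, version %u\n",
	       ebx == VMW_MAGIC ? "present" : "absent", eax);
	return 0;
}

Outside a guest configured with the backdoor enabled, the IN simply faults, so this probe is only meaningful inside such a VM.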
@@ -2639,7 +2789,7 @@ static int io_interception(struct vcpu_svm *svm)
2639{ 2789{
2640 struct kvm_vcpu *vcpu = &svm->vcpu; 2790 struct kvm_vcpu *vcpu = &svm->vcpu;
2641 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2791 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2642 int size, in, string, ret; 2792 int size, in, string;
2643 unsigned port; 2793 unsigned port;
2644 2794
2645 ++svm->vcpu.stat.io_exits; 2795 ++svm->vcpu.stat.io_exits;
@@ -2651,16 +2801,8 @@ static int io_interception(struct vcpu_svm *svm)
2651 port = io_info >> 16; 2801 port = io_info >> 16;
2652 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2802 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2653 svm->next_rip = svm->vmcb->control.exit_info_2; 2803 svm->next_rip = svm->vmcb->control.exit_info_2;
2654 ret = kvm_skip_emulated_instruction(&svm->vcpu);
2655 2804
2656 /* 2805 return kvm_fast_pio(&svm->vcpu, size, port, in);
2657 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
2658 * KVM_EXIT_DEBUG here.
2659 */
2660 if (in)
2661 return kvm_fast_pio_in(vcpu, size, port) && ret;
2662 else
2663 return kvm_fast_pio_out(vcpu, size, port) && ret;
2664} 2806}
2665 2807
2666static int nmi_interception(struct vcpu_svm *svm) 2808static int nmi_interception(struct vcpu_svm *svm)
@@ -4233,6 +4375,9 @@ static int pause_interception(struct vcpu_svm *svm)
4233 struct kvm_vcpu *vcpu = &svm->vcpu; 4375 struct kvm_vcpu *vcpu = &svm->vcpu;
4234 bool in_kernel = (svm_get_cpl(vcpu) == 0); 4376 bool in_kernel = (svm_get_cpl(vcpu) == 0);
4235 4377
4378 if (pause_filter_thresh)
4379 grow_ple_window(vcpu);
4380
4236 kvm_vcpu_on_spin(vcpu, in_kernel); 4381 kvm_vcpu_on_spin(vcpu, in_kernel);
4237 return 1; 4382 return 1;
4238} 4383}
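
pause_interception() above now grows the SVM pause-filter window on every PAUSE exit whenever a threshold is configured (and svm_sched_in(), further down, shrinks it again), mirroring VMX's ple_window handling. The grow/shrink helpers themselves fall outside the quoted hunks; the following is only a rough, self-contained sketch of the usual policy, multiplicative growth clamped to a maximum and division on shrink, under the assumption that the real helpers behave similarly:

static unsigned int ple_window_grow_once(unsigned int old, unsigned int grow,
					 unsigned int base, unsigned int max)
{
	/* Multiply by the grow factor, then clamp into [base, max]. */
	unsigned long new = grow ? (unsigned long)old * grow : old;

	if (new < base)
		new = base;
	if (new > max)
		new = max;
	return (unsigned int)new;
}

static unsigned int ple_window_shrink_once(unsigned int old, unsigned int shrink,
					   unsigned int base)
{
	/* Divide by the shrink factor, never dropping below the base value. */
	unsigned int new = shrink ? old / shrink : base;

	return new < base ? base : new;
}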
@@ -4323,7 +4468,7 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
4323 4468
4324static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) 4469static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
4325{ 4470{
4326 struct kvm_arch *vm_data = &vcpu->kvm->arch; 4471 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
4327 int index; 4472 int index;
4328 u32 *logical_apic_id_table; 4473 u32 *logical_apic_id_table;
4329 int dlid = GET_APIC_LOGICAL_ID(ldr); 4474 int dlid = GET_APIC_LOGICAL_ID(ldr);
@@ -4345,7 +4490,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
4345 index = (cluster << 2) + apic; 4490 index = (cluster << 2) + apic;
4346 } 4491 }
4347 4492
4348 logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page); 4493 logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
4349 4494
4350 return &logical_apic_id_table[index]; 4495 return &logical_apic_id_table[index];
4351} 4496}
@@ -4425,7 +4570,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
4425static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) 4570static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
4426{ 4571{
4427 struct vcpu_svm *svm = to_svm(vcpu); 4572 struct vcpu_svm *svm = to_svm(vcpu);
4428 struct kvm_arch *vm_data = &vcpu->kvm->arch; 4573 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
4429 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 4574 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
4430 u32 mod = (dfr >> 28) & 0xf; 4575 u32 mod = (dfr >> 28) & 0xf;
4431 4576
@@ -4434,11 +4579,11 @@ static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
4434 * If this changes, we need to flush the AVIC logical 4579 * If this changes, we need to flush the AVIC logical
4435 * APIC ID table. 4580 * APIC ID table.
4436 */ 4581 */
4437 if (vm_data->ldr_mode == mod) 4582 if (kvm_svm->ldr_mode == mod)
4438 return 0; 4583 return 0;
4439 4584
4440 clear_page(page_address(vm_data->avic_logical_id_table_page)); 4585 clear_page(page_address(kvm_svm->avic_logical_id_table_page));
4441 vm_data->ldr_mode = mod; 4586 kvm_svm->ldr_mode = mod;
4442 4587
4443 if (svm->ldr_reg) 4588 if (svm->ldr_reg)
4444 avic_handle_ldr_update(vcpu); 4589 avic_handle_ldr_update(vcpu);
@@ -4558,6 +4703,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4558 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 4703 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
4559 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 4704 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
4560 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 4705 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
4706 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
4561 [SVM_EXIT_INTR] = intr_interception, 4707 [SVM_EXIT_INTR] = intr_interception,
4562 [SVM_EXIT_NMI] = nmi_interception, 4708 [SVM_EXIT_NMI] = nmi_interception,
4563 [SVM_EXIT_SMI] = nop_on_interception, 4709 [SVM_EXIT_SMI] = nop_on_interception,
@@ -4606,6 +4752,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
4606 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); 4752 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
4607 pr_err("%-20s%016llx\n", "intercepts:", control->intercept); 4753 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
4608 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 4754 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
4755 pr_err("%-20s%d\n", "pause filter threshold:",
4756 control->pause_filter_thresh);
4609 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 4757 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
4610 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 4758 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
4611 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 4759 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
@@ -5073,7 +5221,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
5073 /* Try to enable guest_mode in IRTE */ 5221 /* Try to enable guest_mode in IRTE */
5074 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 5222 pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
5075 AVIC_HPA_MASK); 5223 AVIC_HPA_MASK);
5076 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, 5224 pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
5077 svm->vcpu.vcpu_id); 5225 svm->vcpu.vcpu_id);
5078 pi.is_guest_mode = true; 5226 pi.is_guest_mode = true;
5079 pi.vcpu_data = &vcpu_info; 5227 pi.vcpu_data = &vcpu_info;
@@ -5237,6 +5385,11 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
5237 return 0; 5385 return 0;
5238} 5386}
5239 5387
5388static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5389{
5390 return 0;
5391}
5392
5240static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) 5393static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
5241{ 5394{
5242 struct vcpu_svm *svm = to_svm(vcpu); 5395 struct vcpu_svm *svm = to_svm(vcpu);
@@ -5538,14 +5691,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5538 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 5691 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
5539 5692
5540 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5693 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5541 kvm_before_handle_nmi(&svm->vcpu); 5694 kvm_before_interrupt(&svm->vcpu);
5542 5695
5543 stgi(); 5696 stgi();
5544 5697
5545 /* Any pending NMI will happen here */ 5698 /* Any pending NMI will happen here */
5546 5699
5547 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 5700 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5548 kvm_after_handle_nmi(&svm->vcpu); 5701 kvm_after_interrupt(&svm->vcpu);
5549 5702
5550 sync_cr8_to_lapic(vcpu); 5703 sync_cr8_to_lapic(vcpu);
5551 5704
@@ -5921,6 +6074,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
5921 6074
5922static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) 6075static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
5923{ 6076{
6077 if (pause_filter_thresh)
6078 shrink_ple_window(vcpu);
5924} 6079}
5925 6080
5926static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) 6081static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
@@ -6037,7 +6192,7 @@ static int sev_asid_new(void)
6037 6192
6038static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) 6193static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
6039{ 6194{
6040 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6195 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6041 int asid, ret; 6196 int asid, ret;
6042 6197
6043 ret = -EBUSY; 6198 ret = -EBUSY;
@@ -6102,14 +6257,14 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
6102 6257
6103static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) 6258static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
6104{ 6259{
6105 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6260 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6106 6261
6107 return __sev_issue_cmd(sev->fd, id, data, error); 6262 return __sev_issue_cmd(sev->fd, id, data, error);
6108} 6263}
6109 6264
6110static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 6265static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
6111{ 6266{
6112 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6267 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6113 struct sev_data_launch_start *start; 6268 struct sev_data_launch_start *start;
6114 struct kvm_sev_launch_start params; 6269 struct kvm_sev_launch_start params;
6115 void *dh_blob, *session_blob; 6270 void *dh_blob, *session_blob;
@@ -6207,7 +6362,7 @@ static int get_num_contig_pages(int idx, struct page **inpages,
6207static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 6362static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
6208{ 6363{
6209 unsigned long vaddr, vaddr_end, next_vaddr, npages, size; 6364 unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
6210 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6365 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6211 struct kvm_sev_launch_update_data params; 6366 struct kvm_sev_launch_update_data params;
6212 struct sev_data_launch_update_data *data; 6367 struct sev_data_launch_update_data *data;
6213 struct page **inpages; 6368 struct page **inpages;
@@ -6283,7 +6438,7 @@ e_free:
6283static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) 6438static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
6284{ 6439{
6285 void __user *measure = (void __user *)(uintptr_t)argp->data; 6440 void __user *measure = (void __user *)(uintptr_t)argp->data;
6286 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6441 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6287 struct sev_data_launch_measure *data; 6442 struct sev_data_launch_measure *data;
6288 struct kvm_sev_launch_measure params; 6443 struct kvm_sev_launch_measure params;
6289 void __user *p = NULL; 6444 void __user *p = NULL;
@@ -6351,7 +6506,7 @@ e_free:
6351 6506
6352static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 6507static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
6353{ 6508{
6354 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6509 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6355 struct sev_data_launch_finish *data; 6510 struct sev_data_launch_finish *data;
6356 int ret; 6511 int ret;
6357 6512
@@ -6371,7 +6526,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
6371 6526
6372static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) 6527static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
6373{ 6528{
6374 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6529 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6375 struct kvm_sev_guest_status params; 6530 struct kvm_sev_guest_status params;
6376 struct sev_data_guest_status *data; 6531 struct sev_data_guest_status *data;
6377 int ret; 6532 int ret;
@@ -6403,7 +6558,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
6403 unsigned long dst, int size, 6558 unsigned long dst, int size,
6404 int *error, bool enc) 6559 int *error, bool enc)
6405{ 6560{
6406 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6561 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6407 struct sev_data_dbg *data; 6562 struct sev_data_dbg *data;
6408 int ret; 6563 int ret;
6409 6564
@@ -6635,7 +6790,7 @@ err:
6635 6790
6636static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) 6791static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
6637{ 6792{
6638 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6793 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6639 struct sev_data_launch_secret *data; 6794 struct sev_data_launch_secret *data;
6640 struct kvm_sev_launch_secret params; 6795 struct kvm_sev_launch_secret params;
6641 struct page **pages; 6796 struct page **pages;
@@ -6759,7 +6914,7 @@ out:
6759static int svm_register_enc_region(struct kvm *kvm, 6914static int svm_register_enc_region(struct kvm *kvm,
6760 struct kvm_enc_region *range) 6915 struct kvm_enc_region *range)
6761{ 6916{
6762 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6917 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6763 struct enc_region *region; 6918 struct enc_region *region;
6764 int ret = 0; 6919 int ret = 0;
6765 6920
@@ -6801,7 +6956,7 @@ e_free:
6801static struct enc_region * 6956static struct enc_region *
6802find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) 6957find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
6803{ 6958{
6804 struct kvm_sev_info *sev = &kvm->arch.sev_info; 6959 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6805 struct list_head *head = &sev->regions_list; 6960 struct list_head *head = &sev->regions_list;
6806 struct enc_region *i; 6961 struct enc_region *i;
6807 6962
@@ -6859,6 +7014,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
6859 .vcpu_free = svm_free_vcpu, 7014 .vcpu_free = svm_free_vcpu,
6860 .vcpu_reset = svm_vcpu_reset, 7015 .vcpu_reset = svm_vcpu_reset,
6861 7016
7017 .vm_alloc = svm_vm_alloc,
7018 .vm_free = svm_vm_free,
6862 .vm_init = avic_vm_init, 7019 .vm_init = avic_vm_init,
6863 .vm_destroy = svm_vm_destroy, 7020 .vm_destroy = svm_vm_destroy,
6864 7021
@@ -6925,6 +7082,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
6925 .apicv_post_state_restore = avic_post_state_restore, 7082 .apicv_post_state_restore = avic_post_state_restore,
6926 7083
6927 .set_tss_addr = svm_set_tss_addr, 7084 .set_tss_addr = svm_set_tss_addr,
7085 .set_identity_map_addr = svm_set_identity_map_addr,
6928 .get_tdp_level = get_npt_level, 7086 .get_tdp_level = get_npt_level,
6929 .get_mt_mask = svm_get_mt_mask, 7087 .get_mt_mask = svm_get_mt_mask,
6930 7088
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 92496b9b5f2b..aafcc9881e88 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -52,9 +52,11 @@
52#include <asm/irq_remapping.h> 52#include <asm/irq_remapping.h>
53#include <asm/mmu_context.h> 53#include <asm/mmu_context.h>
54#include <asm/nospec-branch.h> 54#include <asm/nospec-branch.h>
55#include <asm/mshyperv.h>
55 56
56#include "trace.h" 57#include "trace.h"
57#include "pmu.h" 58#include "pmu.h"
59#include "vmx_evmcs.h"
58 60
59#define __ex(x) __kvm_handle_fault_on_reboot(x) 61#define __ex(x) __kvm_handle_fault_on_reboot(x)
60#define __ex_clear(x, reg) \ 62#define __ex_clear(x, reg) \
@@ -130,13 +132,15 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
130#endif 132#endif
131 133
132#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) 134#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
133#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) 135#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
134#define KVM_VM_CR0_ALWAYS_ON \ 136#define KVM_VM_CR0_ALWAYS_ON \
135 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 137 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
138 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
136#define KVM_CR4_GUEST_OWNED_BITS \ 139#define KVM_CR4_GUEST_OWNED_BITS \
137 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 140 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
138 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) 141 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
139 142
143#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
140#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 144#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
141#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 145#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
142 146
@@ -165,34 +169,33 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
165 * Time is measured based on a counter that runs at the same rate as the TSC, 169 * Time is measured based on a counter that runs at the same rate as the TSC,
166 * refer SDM volume 3b section 21.6.13 & 22.1.3. 170 * refer SDM volume 3b section 21.6.13 & 22.1.3.
167 */ 171 */
168#define KVM_VMX_DEFAULT_PLE_GAP 128 172static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
169#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
170#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
171#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
172#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
173 INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
174 173
175static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 174static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
176module_param(ple_gap, int, S_IRUGO); 175module_param(ple_window, uint, 0444);
177
178static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
179module_param(ple_window, int, S_IRUGO);
180 176
181/* Default doubles per-vcpu window every exit. */ 177/* Default doubles per-vcpu window every exit. */
182static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; 178static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
183module_param(ple_window_grow, int, S_IRUGO); 179module_param(ple_window_grow, uint, 0444);
184 180
185/* Default resets per-vcpu window every exit to ple_window. */ 181/* Default resets per-vcpu window every exit to ple_window. */
186static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; 182static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
187module_param(ple_window_shrink, int, S_IRUGO); 183module_param(ple_window_shrink, uint, 0444);
188 184
189/* Default is to compute the maximum so we can never overflow. */ 185/* Default is to compute the maximum so we can never overflow. */
190static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 186static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
191static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 187module_param(ple_window_max, uint, 0444);
192module_param(ple_window_max, int, S_IRUGO);
193 188
194extern const ulong vmx_return; 189extern const ulong vmx_return;
195 190
191struct kvm_vmx {
192 struct kvm kvm;
193
194 unsigned int tss_addr;
195 bool ept_identity_pagetable_done;
196 gpa_t ept_identity_map_addr;
197};
198
196#define NR_AUTOLOAD_MSRS 8 199#define NR_AUTOLOAD_MSRS 8
197 200
198struct vmcs { 201struct vmcs {
@@ -424,6 +427,35 @@ struct __packed vmcs12 {
424 */ 427 */
425#define VMCS12_MAX_FIELD_INDEX 0x17 428#define VMCS12_MAX_FIELD_INDEX 0x17
426 429
430struct nested_vmx_msrs {
431 /*
432 * We only store the "true" versions of the VMX capability MSRs. We
433 * generate the "non-true" versions by setting the must-be-1 bits
434 * according to the SDM.
435 */
436 u32 procbased_ctls_low;
437 u32 procbased_ctls_high;
438 u32 secondary_ctls_low;
439 u32 secondary_ctls_high;
440 u32 pinbased_ctls_low;
441 u32 pinbased_ctls_high;
442 u32 exit_ctls_low;
443 u32 exit_ctls_high;
444 u32 entry_ctls_low;
445 u32 entry_ctls_high;
446 u32 misc_low;
447 u32 misc_high;
448 u32 ept_caps;
449 u32 vpid_caps;
450 u64 basic;
451 u64 cr0_fixed0;
452 u64 cr0_fixed1;
453 u64 cr4_fixed0;
454 u64 cr4_fixed1;
455 u64 vmcs_enum;
456 u64 vmfunc_controls;
457};
458
427/* 459/*
428 * The nested_vmx structure is part of vcpu_vmx, and holds information we need 460 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
429 * for correct emulation of VMX (i.e., nested VMX) on this vcpu. 461 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -475,32 +507,7 @@ struct nested_vmx {
475 u16 vpid02; 507 u16 vpid02;
476 u16 last_vpid; 508 u16 last_vpid;
477 509
478 /* 510 struct nested_vmx_msrs msrs;
479 * We only store the "true" versions of the VMX capability MSRs. We
480 * generate the "non-true" versions by setting the must-be-1 bits
481 * according to the SDM.
482 */
483 u32 nested_vmx_procbased_ctls_low;
484 u32 nested_vmx_procbased_ctls_high;
485 u32 nested_vmx_secondary_ctls_low;
486 u32 nested_vmx_secondary_ctls_high;
487 u32 nested_vmx_pinbased_ctls_low;
488 u32 nested_vmx_pinbased_ctls_high;
489 u32 nested_vmx_exit_ctls_low;
490 u32 nested_vmx_exit_ctls_high;
491 u32 nested_vmx_entry_ctls_low;
492 u32 nested_vmx_entry_ctls_high;
493 u32 nested_vmx_misc_low;
494 u32 nested_vmx_misc_high;
495 u32 nested_vmx_ept_caps;
496 u32 nested_vmx_vpid_caps;
497 u64 nested_vmx_basic;
498 u64 nested_vmx_cr0_fixed0;
499 u64 nested_vmx_cr0_fixed1;
500 u64 nested_vmx_cr4_fixed0;
501 u64 nested_vmx_cr4_fixed1;
502 u64 nested_vmx_vmcs_enum;
503 u64 nested_vmx_vmfunc_controls;
504 511
505 /* SMM related state */ 512 /* SMM related state */
506 struct { 513 struct {
@@ -691,6 +698,11 @@ enum segment_cache_field {
691 SEG_FIELD_NR = 4 698 SEG_FIELD_NR = 4
692}; 699};
693 700
701static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
702{
703 return container_of(kvm, struct kvm_vmx, kvm);
704}
705
694static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 706static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
695{ 707{
696 return container_of(vcpu, struct vcpu_vmx, vcpu); 708 return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -953,6 +965,7 @@ static struct vmcs_config {
953 u32 cpu_based_2nd_exec_ctrl; 965 u32 cpu_based_2nd_exec_ctrl;
954 u32 vmexit_ctrl; 966 u32 vmexit_ctrl;
955 u32 vmentry_ctrl; 967 u32 vmentry_ctrl;
968 struct nested_vmx_msrs nested;
956} vmcs_config; 969} vmcs_config;
957 970
958static struct vmx_capability { 971static struct vmx_capability {
@@ -999,6 +1012,169 @@ static const u32 vmx_msr_index[] = {
999 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 1012 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1000}; 1013};
1001 1014
1015DEFINE_STATIC_KEY_FALSE(enable_evmcs);
1016
1017#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
1018
1019#define KVM_EVMCS_VERSION 1
1020
1021#if IS_ENABLED(CONFIG_HYPERV)
1022static bool __read_mostly enlightened_vmcs = true;
1023module_param(enlightened_vmcs, bool, 0444);
1024
1025static inline void evmcs_write64(unsigned long field, u64 value)
1026{
1027 u16 clean_field;
1028 int offset = get_evmcs_offset(field, &clean_field);
1029
1030 if (offset < 0)
1031 return;
1032
1033 *(u64 *)((char *)current_evmcs + offset) = value;
1034
1035 current_evmcs->hv_clean_fields &= ~clean_field;
1036}
1037
1038static inline void evmcs_write32(unsigned long field, u32 value)
1039{
1040 u16 clean_field;
1041 int offset = get_evmcs_offset(field, &clean_field);
1042
1043 if (offset < 0)
1044 return;
1045
1046 *(u32 *)((char *)current_evmcs + offset) = value;
1047 current_evmcs->hv_clean_fields &= ~clean_field;
1048}
1049
1050static inline void evmcs_write16(unsigned long field, u16 value)
1051{
1052 u16 clean_field;
1053 int offset = get_evmcs_offset(field, &clean_field);
1054
1055 if (offset < 0)
1056 return;
1057
1058 *(u16 *)((char *)current_evmcs + offset) = value;
1059 current_evmcs->hv_clean_fields &= ~clean_field;
1060}
1061
1062static inline u64 evmcs_read64(unsigned long field)
1063{
1064 int offset = get_evmcs_offset(field, NULL);
1065
1066 if (offset < 0)
1067 return 0;
1068
1069 return *(u64 *)((char *)current_evmcs + offset);
1070}
1071
1072static inline u32 evmcs_read32(unsigned long field)
1073{
1074 int offset = get_evmcs_offset(field, NULL);
1075
1076 if (offset < 0)
1077 return 0;
1078
1079 return *(u32 *)((char *)current_evmcs + offset);
1080}
1081
1082static inline u16 evmcs_read16(unsigned long field)
1083{
1084 int offset = get_evmcs_offset(field, NULL);
1085
1086 if (offset < 0)
1087 return 0;
1088
1089 return *(u16 *)((char *)current_evmcs + offset);
1090}
1091
1092static void evmcs_load(u64 phys_addr)
1093{
1094 struct hv_vp_assist_page *vp_ap =
1095 hv_get_vp_assist_page(smp_processor_id());
1096
1097 vp_ap->current_nested_vmcs = phys_addr;
1098 vp_ap->enlighten_vmentry = 1;
1099}
1100
1101static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
1102{
1103 /*
1104 * Enlightened VMCSv1 doesn't support these:
1105 *
1106 * POSTED_INTR_NV = 0x00000002,
1107 * GUEST_INTR_STATUS = 0x00000810,
1108 * APIC_ACCESS_ADDR = 0x00002014,
1109 * POSTED_INTR_DESC_ADDR = 0x00002016,
1110 * EOI_EXIT_BITMAP0 = 0x0000201c,
1111 * EOI_EXIT_BITMAP1 = 0x0000201e,
1112 * EOI_EXIT_BITMAP2 = 0x00002020,
1113 * EOI_EXIT_BITMAP3 = 0x00002022,
1114 */
1115 vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
1116 vmcs_conf->cpu_based_2nd_exec_ctrl &=
1117 ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
1118 vmcs_conf->cpu_based_2nd_exec_ctrl &=
1119 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1120 vmcs_conf->cpu_based_2nd_exec_ctrl &=
1121 ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
1122
1123 /*
1124 * GUEST_PML_INDEX = 0x00000812,
1125 * PML_ADDRESS = 0x0000200e,
1126 */
1127 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
1128
1129 /* VM_FUNCTION_CONTROL = 0x00002018, */
1130 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
1131
1132 /*
1133 * EPTP_LIST_ADDRESS = 0x00002024,
1134 * VMREAD_BITMAP = 0x00002026,
1135 * VMWRITE_BITMAP = 0x00002028,
1136 */
1137 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
1138
1139 /*
1140 * TSC_MULTIPLIER = 0x00002032,
1141 */
1142 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
1143
1144 /*
1145 * PLE_GAP = 0x00004020,
1146 * PLE_WINDOW = 0x00004022,
1147 */
1148 vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1149
1150 /*
1151 * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
1152 */
1153 vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
1154
1155 /*
1156 * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
1157 * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
1158 */
1159 vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
1160 vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
1161
1162 /*
1163 * Currently unsupported in KVM:
1164 * GUEST_IA32_RTIT_CTL = 0x00002814,
1165 */
1166}
1167#else /* !IS_ENABLED(CONFIG_HYPERV) */
1168static inline void evmcs_write64(unsigned long field, u64 value) {}
1169static inline void evmcs_write32(unsigned long field, u32 value) {}
1170static inline void evmcs_write16(unsigned long field, u16 value) {}
1171static inline u64 evmcs_read64(unsigned long field) { return 0; }
1172static inline u32 evmcs_read32(unsigned long field) { return 0; }
1173static inline u16 evmcs_read16(unsigned long field) { return 0; }
1174static inline void evmcs_load(u64 phys_addr) {}
1175static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
1176#endif /* IS_ENABLED(CONFIG_HYPERV) */
1177
1002static inline bool is_exception_n(u32 intr_info, u8 vector) 1178static inline bool is_exception_n(u32 intr_info, u8 vector)
1003{ 1179{
1004 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 1180 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -1031,6 +1207,11 @@ static inline bool is_invalid_opcode(u32 intr_info)
1031 return is_exception_n(intr_info, UD_VECTOR); 1207 return is_exception_n(intr_info, UD_VECTOR);
1032} 1208}
1033 1209
1210static inline bool is_gp_fault(u32 intr_info)
1211{
1212 return is_exception_n(intr_info, GP_VECTOR);
1213}
1214
1034static inline bool is_external_interrupt(u32 intr_info) 1215static inline bool is_external_interrupt(u32 intr_info)
1035{ 1216{
1036 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 1217 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1320,7 +1501,7 @@ static inline bool report_flexpriority(void)
1320 1501
1321static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) 1502static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
1322{ 1503{
1323 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.nested_vmx_misc_low); 1504 return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
1324} 1505}
1325 1506
1326static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) 1507static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
@@ -1341,6 +1522,16 @@ static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1341 PIN_BASED_VMX_PREEMPTION_TIMER; 1522 PIN_BASED_VMX_PREEMPTION_TIMER;
1342} 1523}
1343 1524
1525static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
1526{
1527 return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
1528}
1529
1530static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1531{
1532 return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1533}
1534
1344static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) 1535static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1345{ 1536{
1346 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); 1537 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1479,6 +1670,9 @@ static void vmcs_load(struct vmcs *vmcs)
1479 u64 phys_addr = __pa(vmcs); 1670 u64 phys_addr = __pa(vmcs);
1480 u8 error; 1671 u8 error;
1481 1672
1673 if (static_branch_unlikely(&enable_evmcs))
1674 return evmcs_load(phys_addr);
1675
1482 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 1676 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1483 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) 1677 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1484 : "cc", "memory"); 1678 : "cc", "memory");
@@ -1652,18 +1846,24 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
1652static __always_inline u16 vmcs_read16(unsigned long field) 1846static __always_inline u16 vmcs_read16(unsigned long field)
1653{ 1847{
1654 vmcs_check16(field); 1848 vmcs_check16(field);
1849 if (static_branch_unlikely(&enable_evmcs))
1850 return evmcs_read16(field);
1655 return __vmcs_readl(field); 1851 return __vmcs_readl(field);
1656} 1852}
1657 1853
1658static __always_inline u32 vmcs_read32(unsigned long field) 1854static __always_inline u32 vmcs_read32(unsigned long field)
1659{ 1855{
1660 vmcs_check32(field); 1856 vmcs_check32(field);
1857 if (static_branch_unlikely(&enable_evmcs))
1858 return evmcs_read32(field);
1661 return __vmcs_readl(field); 1859 return __vmcs_readl(field);
1662} 1860}
1663 1861
1664static __always_inline u64 vmcs_read64(unsigned long field) 1862static __always_inline u64 vmcs_read64(unsigned long field)
1665{ 1863{
1666 vmcs_check64(field); 1864 vmcs_check64(field);
1865 if (static_branch_unlikely(&enable_evmcs))
1866 return evmcs_read64(field);
1667#ifdef CONFIG_X86_64 1867#ifdef CONFIG_X86_64
1668 return __vmcs_readl(field); 1868 return __vmcs_readl(field);
1669#else 1869#else
@@ -1674,6 +1874,8 @@ static __always_inline u64 vmcs_read64(unsigned long field)
1674static __always_inline unsigned long vmcs_readl(unsigned long field) 1874static __always_inline unsigned long vmcs_readl(unsigned long field)
1675{ 1875{
1676 vmcs_checkl(field); 1876 vmcs_checkl(field);
1877 if (static_branch_unlikely(&enable_evmcs))
1878 return evmcs_read64(field);
1677 return __vmcs_readl(field); 1879 return __vmcs_readl(field);
1678} 1880}
1679 1881
@@ -1697,18 +1899,27 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
1697static __always_inline void vmcs_write16(unsigned long field, u16 value) 1899static __always_inline void vmcs_write16(unsigned long field, u16 value)
1698{ 1900{
1699 vmcs_check16(field); 1901 vmcs_check16(field);
1902 if (static_branch_unlikely(&enable_evmcs))
1903 return evmcs_write16(field, value);
1904
1700 __vmcs_writel(field, value); 1905 __vmcs_writel(field, value);
1701} 1906}
1702 1907
1703static __always_inline void vmcs_write32(unsigned long field, u32 value) 1908static __always_inline void vmcs_write32(unsigned long field, u32 value)
1704{ 1909{
1705 vmcs_check32(field); 1910 vmcs_check32(field);
1911 if (static_branch_unlikely(&enable_evmcs))
1912 return evmcs_write32(field, value);
1913
1706 __vmcs_writel(field, value); 1914 __vmcs_writel(field, value);
1707} 1915}
1708 1916
1709static __always_inline void vmcs_write64(unsigned long field, u64 value) 1917static __always_inline void vmcs_write64(unsigned long field, u64 value)
1710{ 1918{
1711 vmcs_check64(field); 1919 vmcs_check64(field);
1920 if (static_branch_unlikely(&enable_evmcs))
1921 return evmcs_write64(field, value);
1922
1712 __vmcs_writel(field, value); 1923 __vmcs_writel(field, value);
1713#ifndef CONFIG_X86_64 1924#ifndef CONFIG_X86_64
1714 asm volatile (""); 1925 asm volatile ("");
@@ -1719,6 +1930,9 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
1719static __always_inline void vmcs_writel(unsigned long field, unsigned long value) 1930static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
1720{ 1931{
1721 vmcs_checkl(field); 1932 vmcs_checkl(field);
1933 if (static_branch_unlikely(&enable_evmcs))
1934 return evmcs_write64(field, value);
1935
1722 __vmcs_writel(field, value); 1936 __vmcs_writel(field, value);
1723} 1937}
1724 1938
@@ -1726,6 +1940,9 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
1726{ 1940{
1727 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, 1941 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1728 "vmcs_clear_bits does not support 64-bit fields"); 1942 "vmcs_clear_bits does not support 64-bit fields");
1943 if (static_branch_unlikely(&enable_evmcs))
1944 return evmcs_write32(field, evmcs_read32(field) & ~mask);
1945
1729 __vmcs_writel(field, __vmcs_readl(field) & ~mask); 1946 __vmcs_writel(field, __vmcs_readl(field) & ~mask);
1730} 1947}
1731 1948
@@ -1733,6 +1950,9 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
1733{ 1950{
1734 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, 1951 BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
1735 "vmcs_set_bits does not support 64-bit fields"); 1952 "vmcs_set_bits does not support 64-bit fields");
1953 if (static_branch_unlikely(&enable_evmcs))
1954 return evmcs_write32(field, evmcs_read32(field) | mask);
1955
1736 __vmcs_writel(field, __vmcs_readl(field) | mask); 1956 __vmcs_writel(field, __vmcs_readl(field) | mask);
1737} 1957}
1738 1958
@@ -1864,6 +2084,14 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1864 2084
1865 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 2085 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1866 (1u << DB_VECTOR) | (1u << AC_VECTOR); 2086 (1u << DB_VECTOR) | (1u << AC_VECTOR);
2087 /*
2088 * Guest access to VMware backdoor ports could legitimately
2089 * trigger #GP because of TSS I/O permission bitmap.
2090 * We intercept those #GP and allow access to them anyway
2091 * as VMware does.
2092 */
2093 if (enable_vmware_backdoor)
2094 eb |= (1u << GP_VECTOR);
1867 if ((vcpu->guest_debug & 2095 if ((vcpu->guest_debug &
1868 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 2096 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1869 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 2097 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -2129,6 +2357,9 @@ static unsigned long segment_base(u16 selector)
2129static void vmx_save_host_state(struct kvm_vcpu *vcpu) 2357static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2130{ 2358{
2131 struct vcpu_vmx *vmx = to_vmx(vcpu); 2359 struct vcpu_vmx *vmx = to_vmx(vcpu);
2360#ifdef CONFIG_X86_64
2361 int cpu = raw_smp_processor_id();
2362#endif
2132 int i; 2363 int i;
2133 2364
2134 if (vmx->host_state.loaded) 2365 if (vmx->host_state.loaded)
@@ -2141,7 +2372,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2141 */ 2372 */
2142 vmx->host_state.ldt_sel = kvm_read_ldt(); 2373 vmx->host_state.ldt_sel = kvm_read_ldt();
2143 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 2374 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
2375
2376#ifdef CONFIG_X86_64
2377 save_fsgs_for_kvm();
2378 vmx->host_state.fs_sel = current->thread.fsindex;
2379 vmx->host_state.gs_sel = current->thread.gsindex;
2380#else
2144 savesegment(fs, vmx->host_state.fs_sel); 2381 savesegment(fs, vmx->host_state.fs_sel);
2382 savesegment(gs, vmx->host_state.gs_sel);
2383#endif
2145 if (!(vmx->host_state.fs_sel & 7)) { 2384 if (!(vmx->host_state.fs_sel & 7)) {
2146 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 2385 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
2147 vmx->host_state.fs_reload_needed = 0; 2386 vmx->host_state.fs_reload_needed = 0;
@@ -2149,7 +2388,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2149 vmcs_write16(HOST_FS_SELECTOR, 0); 2388 vmcs_write16(HOST_FS_SELECTOR, 0);
2150 vmx->host_state.fs_reload_needed = 1; 2389 vmx->host_state.fs_reload_needed = 1;
2151 } 2390 }
2152 savesegment(gs, vmx->host_state.gs_sel);
2153 if (!(vmx->host_state.gs_sel & 7)) 2391 if (!(vmx->host_state.gs_sel & 7))
2154 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 2392 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
2155 else { 2393 else {
@@ -2160,20 +2398,16 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2160#ifdef CONFIG_X86_64 2398#ifdef CONFIG_X86_64
2161 savesegment(ds, vmx->host_state.ds_sel); 2399 savesegment(ds, vmx->host_state.ds_sel);
2162 savesegment(es, vmx->host_state.es_sel); 2400 savesegment(es, vmx->host_state.es_sel);
2163#endif
2164 2401
2165#ifdef CONFIG_X86_64 2402 vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
2166 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 2403 vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
2167 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
2168#else
2169 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2170 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2171#endif
2172 2404
2173#ifdef CONFIG_X86_64 2405 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2174 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2175 if (is_long_mode(&vmx->vcpu)) 2406 if (is_long_mode(&vmx->vcpu))
2176 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 2407 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2408#else
2409 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
2410 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
2177#endif 2411#endif
2178 if (boot_cpu_has(X86_FEATURE_MPX)) 2412 if (boot_cpu_has(X86_FEATURE_MPX))
2179 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); 2413 rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
@@ -2532,6 +2766,19 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
2532 return 0; 2766 return 0;
2533} 2767}
2534 2768
2769static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
2770{
2771 /*
2772 * Ensure that we clear the HLT state in the VMCS. We don't need to
2773 * explicitly skip the instruction because if the HLT state is set,
2774 * then the instruction is already executing and RIP has already been
2775 * advanced.
2776 */
2777 if (kvm_hlt_in_guest(vcpu->kvm) &&
2778 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
2779 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2780}
2781
2535static void vmx_queue_exception(struct kvm_vcpu *vcpu) 2782static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2536{ 2783{
2537 struct vcpu_vmx *vmx = to_vmx(vcpu); 2784 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2554,6 +2801,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2554 return; 2801 return;
2555 } 2802 }
2556 2803
2804 WARN_ON_ONCE(vmx->emulation_required);
2805
2557 if (kvm_exception_is_soft(nr)) { 2806 if (kvm_exception_is_soft(nr)) {
2558 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2807 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2559 vmx->vcpu.arch.event_exit_inst_len); 2808 vmx->vcpu.arch.event_exit_inst_len);
@@ -2562,6 +2811,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2562 intr_info |= INTR_TYPE_HARD_EXCEPTION; 2811 intr_info |= INTR_TYPE_HARD_EXCEPTION;
2563 2812
2564 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 2813 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2814
2815 vmx_clear_hlt(vcpu);
2565} 2816}
2566 2817
2567static bool vmx_rdtscp_supported(void) 2818static bool vmx_rdtscp_supported(void)
@@ -2689,8 +2940,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2689 * bit in the high half is on if the corresponding bit in the control field 2940 * bit in the high half is on if the corresponding bit in the control field
2690 * may be on. See also vmx_control_verify(). 2941 * may be on. See also vmx_control_verify().
2691 */ 2942 */
2692static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) 2943static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
2693{ 2944{
2945 if (!nested) {
2946 memset(msrs, 0, sizeof(*msrs));
2947 return;
2948 }
2949
2694 /* 2950 /*
2695 * Note that as a general rule, the high half of the MSRs (bits in 2951 * Note that as a general rule, the high half of the MSRs (bits in
2696 * the control fields which may be 1) should be initialized by the 2952 * the control fields which may be 1) should be initialized by the
@@ -2708,70 +2964,68 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2708 2964
2709 /* pin-based controls */ 2965 /* pin-based controls */
2710 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2966 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2711 vmx->nested.nested_vmx_pinbased_ctls_low, 2967 msrs->pinbased_ctls_low,
2712 vmx->nested.nested_vmx_pinbased_ctls_high); 2968 msrs->pinbased_ctls_high);
2713 vmx->nested.nested_vmx_pinbased_ctls_low |= 2969 msrs->pinbased_ctls_low |=
2714 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2970 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2715 vmx->nested.nested_vmx_pinbased_ctls_high &= 2971 msrs->pinbased_ctls_high &=
2716 PIN_BASED_EXT_INTR_MASK | 2972 PIN_BASED_EXT_INTR_MASK |
2717 PIN_BASED_NMI_EXITING | 2973 PIN_BASED_NMI_EXITING |
2718 PIN_BASED_VIRTUAL_NMIS; 2974 PIN_BASED_VIRTUAL_NMIS |
2719 vmx->nested.nested_vmx_pinbased_ctls_high |= 2975 (apicv ? PIN_BASED_POSTED_INTR : 0);
2976 msrs->pinbased_ctls_high |=
2720 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2977 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2721 PIN_BASED_VMX_PREEMPTION_TIMER; 2978 PIN_BASED_VMX_PREEMPTION_TIMER;
2722 if (kvm_vcpu_apicv_active(&vmx->vcpu))
2723 vmx->nested.nested_vmx_pinbased_ctls_high |=
2724 PIN_BASED_POSTED_INTR;
2725 2979
2726 /* exit controls */ 2980 /* exit controls */
2727 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2981 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2728 vmx->nested.nested_vmx_exit_ctls_low, 2982 msrs->exit_ctls_low,
2729 vmx->nested.nested_vmx_exit_ctls_high); 2983 msrs->exit_ctls_high);
2730 vmx->nested.nested_vmx_exit_ctls_low = 2984 msrs->exit_ctls_low =
2731 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2985 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2732 2986
2733 vmx->nested.nested_vmx_exit_ctls_high &= 2987 msrs->exit_ctls_high &=
2734#ifdef CONFIG_X86_64 2988#ifdef CONFIG_X86_64
2735 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2989 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2736#endif 2990#endif
2737 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2991 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2738 vmx->nested.nested_vmx_exit_ctls_high |= 2992 msrs->exit_ctls_high |=
2739 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2993 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2740 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2994 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2741 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2995 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2742 2996
2743 if (kvm_mpx_supported()) 2997 if (kvm_mpx_supported())
2744 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2998 msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2745 2999
2746 /* We support free control of debug control saving. */ 3000 /* We support free control of debug control saving. */
2747 vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 3001 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
2748 3002
2749 /* entry controls */ 3003 /* entry controls */
2750 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 3004 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2751 vmx->nested.nested_vmx_entry_ctls_low, 3005 msrs->entry_ctls_low,
2752 vmx->nested.nested_vmx_entry_ctls_high); 3006 msrs->entry_ctls_high);
2753 vmx->nested.nested_vmx_entry_ctls_low = 3007 msrs->entry_ctls_low =
2754 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 3008 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2755 vmx->nested.nested_vmx_entry_ctls_high &= 3009 msrs->entry_ctls_high &=
2756#ifdef CONFIG_X86_64 3010#ifdef CONFIG_X86_64
2757 VM_ENTRY_IA32E_MODE | 3011 VM_ENTRY_IA32E_MODE |
2758#endif 3012#endif
2759 VM_ENTRY_LOAD_IA32_PAT; 3013 VM_ENTRY_LOAD_IA32_PAT;
2760 vmx->nested.nested_vmx_entry_ctls_high |= 3014 msrs->entry_ctls_high |=
2761 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 3015 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2762 if (kvm_mpx_supported()) 3016 if (kvm_mpx_supported())
2763 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 3017 msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2764 3018
2765 /* We support free control of debug control loading. */ 3019 /* We support free control of debug control loading. */
2766 vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 3020 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2767 3021
2768 /* cpu-based controls */ 3022 /* cpu-based controls */
2769 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 3023 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2770 vmx->nested.nested_vmx_procbased_ctls_low, 3024 msrs->procbased_ctls_low,
2771 vmx->nested.nested_vmx_procbased_ctls_high); 3025 msrs->procbased_ctls_high);
2772 vmx->nested.nested_vmx_procbased_ctls_low = 3026 msrs->procbased_ctls_low =
2773 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 3027 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2774 vmx->nested.nested_vmx_procbased_ctls_high &= 3028 msrs->procbased_ctls_high &=
2775 CPU_BASED_VIRTUAL_INTR_PENDING | 3029 CPU_BASED_VIRTUAL_INTR_PENDING |
2776 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 3030 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2777 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 3031 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2791,12 +3045,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2791 * can use it to avoid exits to L1 - even when L0 runs L2 3045 * can use it to avoid exits to L1 - even when L0 runs L2
2792 * without MSR bitmaps. 3046 * without MSR bitmaps.
2793 */ 3047 */
2794 vmx->nested.nested_vmx_procbased_ctls_high |= 3048 msrs->procbased_ctls_high |=
2795 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 3049 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2796 CPU_BASED_USE_MSR_BITMAPS; 3050 CPU_BASED_USE_MSR_BITMAPS;
2797 3051
2798 /* We support free control of CR3 access interception. */ 3052 /* We support free control of CR3 access interception. */
2799 vmx->nested.nested_vmx_procbased_ctls_low &= 3053 msrs->procbased_ctls_low &=
2800 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 3054 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2801 3055
2802 /* 3056 /*
@@ -2804,10 +3058,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2804 * depend on CPUID bits, they are added later by vmx_cpuid_update. 3058 * depend on CPUID bits, they are added later by vmx_cpuid_update.
2805 */ 3059 */
2806 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 3060 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2807 vmx->nested.nested_vmx_secondary_ctls_low, 3061 msrs->secondary_ctls_low,
2808 vmx->nested.nested_vmx_secondary_ctls_high); 3062 msrs->secondary_ctls_high);
2809 vmx->nested.nested_vmx_secondary_ctls_low = 0; 3063 msrs->secondary_ctls_low = 0;
2810 vmx->nested.nested_vmx_secondary_ctls_high &= 3064 msrs->secondary_ctls_high &=
2811 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 3065 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2812 SECONDARY_EXEC_DESC | 3066 SECONDARY_EXEC_DESC |
2813 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3067 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
@@ -2817,33 +3071,33 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2817 3071
2818 if (enable_ept) { 3072 if (enable_ept) {
2819 /* nested EPT: emulate EPT also to L1 */ 3073 /* nested EPT: emulate EPT also to L1 */
2820 vmx->nested.nested_vmx_secondary_ctls_high |= 3074 msrs->secondary_ctls_high |=
2821 SECONDARY_EXEC_ENABLE_EPT; 3075 SECONDARY_EXEC_ENABLE_EPT;
2822 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 3076 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2823 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; 3077 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
2824 if (cpu_has_vmx_ept_execute_only()) 3078 if (cpu_has_vmx_ept_execute_only())
2825 vmx->nested.nested_vmx_ept_caps |= 3079 msrs->ept_caps |=
2826 VMX_EPT_EXECUTE_ONLY_BIT; 3080 VMX_EPT_EXECUTE_ONLY_BIT;
2827 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; 3081 msrs->ept_caps &= vmx_capability.ept;
2828 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 3082 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
2829 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 3083 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
2830 VMX_EPT_1GB_PAGE_BIT; 3084 VMX_EPT_1GB_PAGE_BIT;
2831 if (enable_ept_ad_bits) { 3085 if (enable_ept_ad_bits) {
2832 vmx->nested.nested_vmx_secondary_ctls_high |= 3086 msrs->secondary_ctls_high |=
2833 SECONDARY_EXEC_ENABLE_PML; 3087 SECONDARY_EXEC_ENABLE_PML;
2834 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; 3088 msrs->ept_caps |= VMX_EPT_AD_BIT;
2835 } 3089 }
2836 } 3090 }
2837 3091
2838 if (cpu_has_vmx_vmfunc()) { 3092 if (cpu_has_vmx_vmfunc()) {
2839 vmx->nested.nested_vmx_secondary_ctls_high |= 3093 msrs->secondary_ctls_high |=
2840 SECONDARY_EXEC_ENABLE_VMFUNC; 3094 SECONDARY_EXEC_ENABLE_VMFUNC;
2841 /* 3095 /*
2842 * Advertise EPTP switching unconditionally 3096 * Advertise EPTP switching unconditionally
2843 * since we emulate it 3097 * since we emulate it
2844 */ 3098 */
2845 if (enable_ept) 3099 if (enable_ept)
2846 vmx->nested.nested_vmx_vmfunc_controls = 3100 msrs->vmfunc_controls =
2847 VMX_VMFUNC_EPTP_SWITCHING; 3101 VMX_VMFUNC_EPTP_SWITCHING;
2848 } 3102 }
2849 3103
@@ -2854,25 +3108,25 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2854 * not failing the single-context invvpid, and it is worse. 3108 * not failing the single-context invvpid, and it is worse.
2855 */ 3109 */
2856 if (enable_vpid) { 3110 if (enable_vpid) {
2857 vmx->nested.nested_vmx_secondary_ctls_high |= 3111 msrs->secondary_ctls_high |=
2858 SECONDARY_EXEC_ENABLE_VPID; 3112 SECONDARY_EXEC_ENABLE_VPID;
2859 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | 3113 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
2860 VMX_VPID_EXTENT_SUPPORTED_MASK; 3114 VMX_VPID_EXTENT_SUPPORTED_MASK;
2861 } 3115 }
2862 3116
2863 if (enable_unrestricted_guest) 3117 if (enable_unrestricted_guest)
2864 vmx->nested.nested_vmx_secondary_ctls_high |= 3118 msrs->secondary_ctls_high |=
2865 SECONDARY_EXEC_UNRESTRICTED_GUEST; 3119 SECONDARY_EXEC_UNRESTRICTED_GUEST;
2866 3120
2867 /* miscellaneous data */ 3121 /* miscellaneous data */
2868 rdmsr(MSR_IA32_VMX_MISC, 3122 rdmsr(MSR_IA32_VMX_MISC,
2869 vmx->nested.nested_vmx_misc_low, 3123 msrs->misc_low,
2870 vmx->nested.nested_vmx_misc_high); 3124 msrs->misc_high);
2871 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 3125 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
2872 vmx->nested.nested_vmx_misc_low |= 3126 msrs->misc_low |=
2873 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 3127 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2874 VMX_MISC_ACTIVITY_HLT; 3128 VMX_MISC_ACTIVITY_HLT;
2875 vmx->nested.nested_vmx_misc_high = 0; 3129 msrs->misc_high = 0;
2876 3130
2877 /* 3131 /*
2878 * This MSR reports some information about VMX support. We 3132 * This MSR reports some information about VMX support. We
@@ -2880,14 +3134,14 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2880 * guest, and the VMCS structure we give it - not about the 3134 * guest, and the VMCS structure we give it - not about the
2881 * VMX support of the underlying hardware. 3135 * VMX support of the underlying hardware.
2882 */ 3136 */
2883 vmx->nested.nested_vmx_basic = 3137 msrs->basic =
2884 VMCS12_REVISION | 3138 VMCS12_REVISION |
2885 VMX_BASIC_TRUE_CTLS | 3139 VMX_BASIC_TRUE_CTLS |
2886 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 3140 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2887 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 3141 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2888 3142
2889 if (cpu_has_vmx_basic_inout()) 3143 if (cpu_has_vmx_basic_inout())
2890 vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; 3144 msrs->basic |= VMX_BASIC_INOUT;
2891 3145
2892 /* 3146 /*
2893 * These MSRs specify bits which the guest must keep fixed on 3147 * These MSRs specify bits which the guest must keep fixed on
@@ -2896,15 +3150,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2896 */ 3150 */
2897#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 3151#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2898#define VMXON_CR4_ALWAYSON X86_CR4_VMXE 3152#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
2899 vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; 3153 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
2900 vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; 3154 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
2901 3155
2902 /* These MSRs specify bits which the guest must keep fixed off. */ 3156 /* These MSRs specify bits which the guest must keep fixed off. */
2903 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); 3157 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
2904 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); 3158 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
2905 3159
2906 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 3160 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2907 vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 3161 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
2908} 3162}
2909 3163
2910/* 3164/*
@@ -2941,7 +3195,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2941 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 3195 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
2942 /* reserved */ 3196 /* reserved */
2943 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 3197 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
2944 u64 vmx_basic = vmx->nested.nested_vmx_basic; 3198 u64 vmx_basic = vmx->nested.msrs.basic;
2945 3199
2946 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 3200 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
2947 return -EINVAL; 3201 return -EINVAL;
@@ -2960,7 +3214,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
2960 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 3214 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
2961 return -EINVAL; 3215 return -EINVAL;
2962 3216
2963 vmx->nested.nested_vmx_basic = data; 3217 vmx->nested.msrs.basic = data;
2964 return 0; 3218 return 0;
2965} 3219}
2966 3220
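
vmx_restore_vmx_basic() and the vmx_restore_*() helpers that follow enforce the restore-side rule: userspace (typically during migration) may only report fewer capabilities than KVM advertised, never more. A standalone restatement of the subset test they build on, assuming the usual is_bitwise_subset() semantics rather than quoting it:

#include <linux/types.h>

/*
 * Within 'mask' (the feature and reserved bits), every bit set in
 * 'subset' must already be set in 'superset'; bits outside 'mask'
 * are checked separately by the callers.
 */
static bool example_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
        superset &= mask;
        subset &= mask;

        return (superset | subset) == superset;
}
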
@@ -2972,24 +3226,24 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
2972 3226
2973 switch (msr_index) { 3227 switch (msr_index) {
2974 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 3228 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2975 lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; 3229 lowp = &vmx->nested.msrs.pinbased_ctls_low;
2976 highp = &vmx->nested.nested_vmx_pinbased_ctls_high; 3230 highp = &vmx->nested.msrs.pinbased_ctls_high;
2977 break; 3231 break;
2978 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 3232 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2979 lowp = &vmx->nested.nested_vmx_procbased_ctls_low; 3233 lowp = &vmx->nested.msrs.procbased_ctls_low;
2980 highp = &vmx->nested.nested_vmx_procbased_ctls_high; 3234 highp = &vmx->nested.msrs.procbased_ctls_high;
2981 break; 3235 break;
2982 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 3236 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2983 lowp = &vmx->nested.nested_vmx_exit_ctls_low; 3237 lowp = &vmx->nested.msrs.exit_ctls_low;
2984 highp = &vmx->nested.nested_vmx_exit_ctls_high; 3238 highp = &vmx->nested.msrs.exit_ctls_high;
2985 break; 3239 break;
2986 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 3240 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2987 lowp = &vmx->nested.nested_vmx_entry_ctls_low; 3241 lowp = &vmx->nested.msrs.entry_ctls_low;
2988 highp = &vmx->nested.nested_vmx_entry_ctls_high; 3242 highp = &vmx->nested.msrs.entry_ctls_high;
2989 break; 3243 break;
2990 case MSR_IA32_VMX_PROCBASED_CTLS2: 3244 case MSR_IA32_VMX_PROCBASED_CTLS2:
2991 lowp = &vmx->nested.nested_vmx_secondary_ctls_low; 3245 lowp = &vmx->nested.msrs.secondary_ctls_low;
2992 highp = &vmx->nested.nested_vmx_secondary_ctls_high; 3246 highp = &vmx->nested.msrs.secondary_ctls_high;
2993 break; 3247 break;
2994 default: 3248 default:
2995 BUG(); 3249 BUG();
@@ -3020,13 +3274,13 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3020 GENMASK_ULL(13, 9) | BIT_ULL(31); 3274 GENMASK_ULL(13, 9) | BIT_ULL(31);
3021 u64 vmx_misc; 3275 u64 vmx_misc;
3022 3276
3023 vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, 3277 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
3024 vmx->nested.nested_vmx_misc_high); 3278 vmx->nested.msrs.misc_high);
3025 3279
3026 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 3280 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
3027 return -EINVAL; 3281 return -EINVAL;
3028 3282
3029 if ((vmx->nested.nested_vmx_pinbased_ctls_high & 3283 if ((vmx->nested.msrs.pinbased_ctls_high &
3030 PIN_BASED_VMX_PREEMPTION_TIMER) && 3284 PIN_BASED_VMX_PREEMPTION_TIMER) &&
3031 vmx_misc_preemption_timer_rate(data) != 3285 vmx_misc_preemption_timer_rate(data) !=
3032 vmx_misc_preemption_timer_rate(vmx_misc)) 3286 vmx_misc_preemption_timer_rate(vmx_misc))
@@ -3041,8 +3295,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
3041 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 3295 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
3042 return -EINVAL; 3296 return -EINVAL;
3043 3297
3044 vmx->nested.nested_vmx_misc_low = data; 3298 vmx->nested.msrs.misc_low = data;
3045 vmx->nested.nested_vmx_misc_high = data >> 32; 3299 vmx->nested.msrs.misc_high = data >> 32;
3046 return 0; 3300 return 0;
3047} 3301}
3048 3302
@@ -3050,15 +3304,15 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
3050{ 3304{
3051 u64 vmx_ept_vpid_cap; 3305 u64 vmx_ept_vpid_cap;
3052 3306
3053 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, 3307 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
3054 vmx->nested.nested_vmx_vpid_caps); 3308 vmx->nested.msrs.vpid_caps);
3055 3309
3056 /* Every bit is either reserved or a feature bit. */ 3310 /* Every bit is either reserved or a feature bit. */
3057 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 3311 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
3058 return -EINVAL; 3312 return -EINVAL;
3059 3313
3060 vmx->nested.nested_vmx_ept_caps = data; 3314 vmx->nested.msrs.ept_caps = data;
3061 vmx->nested.nested_vmx_vpid_caps = data >> 32; 3315 vmx->nested.msrs.vpid_caps = data >> 32;
3062 return 0; 3316 return 0;
3063} 3317}
3064 3318
@@ -3068,10 +3322,10 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
3068 3322
3069 switch (msr_index) { 3323 switch (msr_index) {
3070 case MSR_IA32_VMX_CR0_FIXED0: 3324 case MSR_IA32_VMX_CR0_FIXED0:
3071 msr = &vmx->nested.nested_vmx_cr0_fixed0; 3325 msr = &vmx->nested.msrs.cr0_fixed0;
3072 break; 3326 break;
3073 case MSR_IA32_VMX_CR4_FIXED0: 3327 case MSR_IA32_VMX_CR4_FIXED0:
3074 msr = &vmx->nested.nested_vmx_cr4_fixed0; 3328 msr = &vmx->nested.msrs.cr4_fixed0;
3075 break; 3329 break;
3076 default: 3330 default:
3077 BUG(); 3331 BUG();
@@ -3135,7 +3389,7 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3135 case MSR_IA32_VMX_EPT_VPID_CAP: 3389 case MSR_IA32_VMX_EPT_VPID_CAP:
3136 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 3390 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
3137 case MSR_IA32_VMX_VMCS_ENUM: 3391 case MSR_IA32_VMX_VMCS_ENUM:
3138 vmx->nested.nested_vmx_vmcs_enum = data; 3392 vmx->nested.msrs.vmcs_enum = data;
3139 return 0; 3393 return 0;
3140 default: 3394 default:
3141 /* 3395 /*
@@ -3146,77 +3400,75 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
3146} 3400}
3147 3401
3148/* Returns 0 on success, non-0 otherwise. */ 3402/* Returns 0 on success, non-0 otherwise. */
3149static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 3403static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
3150{ 3404{
3151 struct vcpu_vmx *vmx = to_vmx(vcpu);
3152
3153 switch (msr_index) { 3405 switch (msr_index) {
3154 case MSR_IA32_VMX_BASIC: 3406 case MSR_IA32_VMX_BASIC:
3155 *pdata = vmx->nested.nested_vmx_basic; 3407 *pdata = msrs->basic;
3156 break; 3408 break;
3157 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 3409 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3158 case MSR_IA32_VMX_PINBASED_CTLS: 3410 case MSR_IA32_VMX_PINBASED_CTLS:
3159 *pdata = vmx_control_msr( 3411 *pdata = vmx_control_msr(
3160 vmx->nested.nested_vmx_pinbased_ctls_low, 3412 msrs->pinbased_ctls_low,
3161 vmx->nested.nested_vmx_pinbased_ctls_high); 3413 msrs->pinbased_ctls_high);
3162 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 3414 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
3163 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 3415 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3164 break; 3416 break;
3165 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 3417 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3166 case MSR_IA32_VMX_PROCBASED_CTLS: 3418 case MSR_IA32_VMX_PROCBASED_CTLS:
3167 *pdata = vmx_control_msr( 3419 *pdata = vmx_control_msr(
3168 vmx->nested.nested_vmx_procbased_ctls_low, 3420 msrs->procbased_ctls_low,
3169 vmx->nested.nested_vmx_procbased_ctls_high); 3421 msrs->procbased_ctls_high);
3170 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 3422 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
3171 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 3423 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
3172 break; 3424 break;
3173 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 3425 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3174 case MSR_IA32_VMX_EXIT_CTLS: 3426 case MSR_IA32_VMX_EXIT_CTLS:
3175 *pdata = vmx_control_msr( 3427 *pdata = vmx_control_msr(
3176 vmx->nested.nested_vmx_exit_ctls_low, 3428 msrs->exit_ctls_low,
3177 vmx->nested.nested_vmx_exit_ctls_high); 3429 msrs->exit_ctls_high);
3178 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 3430 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
3179 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 3431 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
3180 break; 3432 break;
3181 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 3433 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3182 case MSR_IA32_VMX_ENTRY_CTLS: 3434 case MSR_IA32_VMX_ENTRY_CTLS:
3183 *pdata = vmx_control_msr( 3435 *pdata = vmx_control_msr(
3184 vmx->nested.nested_vmx_entry_ctls_low, 3436 msrs->entry_ctls_low,
3185 vmx->nested.nested_vmx_entry_ctls_high); 3437 msrs->entry_ctls_high);
3186 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 3438 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
3187 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 3439 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
3188 break; 3440 break;
3189 case MSR_IA32_VMX_MISC: 3441 case MSR_IA32_VMX_MISC:
3190 *pdata = vmx_control_msr( 3442 *pdata = vmx_control_msr(
3191 vmx->nested.nested_vmx_misc_low, 3443 msrs->misc_low,
3192 vmx->nested.nested_vmx_misc_high); 3444 msrs->misc_high);
3193 break; 3445 break;
3194 case MSR_IA32_VMX_CR0_FIXED0: 3446 case MSR_IA32_VMX_CR0_FIXED0:
3195 *pdata = vmx->nested.nested_vmx_cr0_fixed0; 3447 *pdata = msrs->cr0_fixed0;
3196 break; 3448 break;
3197 case MSR_IA32_VMX_CR0_FIXED1: 3449 case MSR_IA32_VMX_CR0_FIXED1:
3198 *pdata = vmx->nested.nested_vmx_cr0_fixed1; 3450 *pdata = msrs->cr0_fixed1;
3199 break; 3451 break;
3200 case MSR_IA32_VMX_CR4_FIXED0: 3452 case MSR_IA32_VMX_CR4_FIXED0:
3201 *pdata = vmx->nested.nested_vmx_cr4_fixed0; 3453 *pdata = msrs->cr4_fixed0;
3202 break; 3454 break;
3203 case MSR_IA32_VMX_CR4_FIXED1: 3455 case MSR_IA32_VMX_CR4_FIXED1:
3204 *pdata = vmx->nested.nested_vmx_cr4_fixed1; 3456 *pdata = msrs->cr4_fixed1;
3205 break; 3457 break;
3206 case MSR_IA32_VMX_VMCS_ENUM: 3458 case MSR_IA32_VMX_VMCS_ENUM:
3207 *pdata = vmx->nested.nested_vmx_vmcs_enum; 3459 *pdata = msrs->vmcs_enum;
3208 break; 3460 break;
3209 case MSR_IA32_VMX_PROCBASED_CTLS2: 3461 case MSR_IA32_VMX_PROCBASED_CTLS2:
3210 *pdata = vmx_control_msr( 3462 *pdata = vmx_control_msr(
3211 vmx->nested.nested_vmx_secondary_ctls_low, 3463 msrs->secondary_ctls_low,
3212 vmx->nested.nested_vmx_secondary_ctls_high); 3464 msrs->secondary_ctls_high);
3213 break; 3465 break;
3214 case MSR_IA32_VMX_EPT_VPID_CAP: 3466 case MSR_IA32_VMX_EPT_VPID_CAP:
3215 *pdata = vmx->nested.nested_vmx_ept_caps | 3467 *pdata = msrs->ept_caps |
3216 ((u64)vmx->nested.nested_vmx_vpid_caps << 32); 3468 ((u64)msrs->vpid_caps << 32);
3217 break; 3469 break;
3218 case MSR_IA32_VMX_VMFUNC: 3470 case MSR_IA32_VMX_VMFUNC:
3219 *pdata = vmx->nested.nested_vmx_vmfunc_controls; 3471 *pdata = msrs->vmfunc_controls;
3220 break; 3472 break;
3221 default: 3473 default:
3222 return 1; 3474 return 1;
@@ -3235,7 +3487,16 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3235 3487
3236static int vmx_get_msr_feature(struct kvm_msr_entry *msr) 3488static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
3237{ 3489{
3238 return 1; 3490 switch (msr->index) {
3491 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3492 if (!nested)
3493 return 1;
3494 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
3495 default:
3496 return 1;
3497 }
3498
3499 return 0;
3239} 3500}
3240 3501
3241/* 3502/*
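
vmx_get_vmx_msr() now takes a struct nested_vmx_msrs pointer instead of a vcpu, which lets vmx_get_msr_feature() answer the VMX capability MSRs from the global vmcs_config.nested defaults before any VM or vCPU exists, while vmx_get_msr() keeps reading the per-vCPU vmx->nested.msrs copy. A hedged userspace sketch of what that enables, assuming the system-scope KVM_GET_MSRS feature-MSR interface from this same release; check the uapi headers you build against:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        struct {
                struct kvm_msrs hdr;            /* header ...               */
                struct kvm_msr_entry entry;     /* ... followed by one entry */
        } req;
        int kvm = open("/dev/kvm", O_RDWR);

        memset(&req, 0, sizeof(req));
        req.hdr.nmsrs = 1;
        req.entry.index = 0x480;                /* MSR_IA32_VMX_BASIC */

        /* System-scope read: served from vmcs_config.nested, no VM needed. */
        if (kvm >= 0 && ioctl(kvm, KVM_GET_MSRS, &req) == 1)
                printf("nested VMX_BASIC default: %#llx\n",
                       (unsigned long long)req.entry.data);
        return 0;
}
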
@@ -3309,7 +3570,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3309 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 3570 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
3310 if (!nested_vmx_allowed(vcpu)) 3571 if (!nested_vmx_allowed(vcpu))
3311 return 1; 3572 return 1;
3312 return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); 3573 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
3574 &msr_info->data);
3313 case MSR_IA32_XSS: 3575 case MSR_IA32_XSS:
3314 if (!vmx_xsaves_supported()) 3576 if (!vmx_xsaves_supported())
3315 return 1; 3577 return 1;
@@ -3602,6 +3864,14 @@ static int hardware_enable(void)
3602 if (cr4_read_shadow() & X86_CR4_VMXE) 3864 if (cr4_read_shadow() & X86_CR4_VMXE)
3603 return -EBUSY; 3865 return -EBUSY;
3604 3866
3867 /*
3868 * This can happen if we hot-added a CPU but failed to allocate
3869 * VP assist page for it.
3870 */
3871 if (static_branch_unlikely(&enable_evmcs) &&
3872 !hv_get_vp_assist_page(cpu))
3873 return -EFAULT;
3874
3605 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 3875 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3606 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 3876 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3607 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 3877 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
@@ -3700,6 +3970,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3700 u32 _vmexit_control = 0; 3970 u32 _vmexit_control = 0;
3701 u32 _vmentry_control = 0; 3971 u32 _vmentry_control = 0;
3702 3972
3973 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
3703 min = CPU_BASED_HLT_EXITING | 3974 min = CPU_BASED_HLT_EXITING |
3704#ifdef CONFIG_X86_64 3975#ifdef CONFIG_X86_64
3705 CPU_BASED_CR8_LOAD_EXITING | 3976 CPU_BASED_CR8_LOAD_EXITING |
@@ -3710,13 +3981,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3710 CPU_BASED_UNCOND_IO_EXITING | 3981 CPU_BASED_UNCOND_IO_EXITING |
3711 CPU_BASED_MOV_DR_EXITING | 3982 CPU_BASED_MOV_DR_EXITING |
3712 CPU_BASED_USE_TSC_OFFSETING | 3983 CPU_BASED_USE_TSC_OFFSETING |
3984 CPU_BASED_MWAIT_EXITING |
3985 CPU_BASED_MONITOR_EXITING |
3713 CPU_BASED_INVLPG_EXITING | 3986 CPU_BASED_INVLPG_EXITING |
3714 CPU_BASED_RDPMC_EXITING; 3987 CPU_BASED_RDPMC_EXITING;
3715 3988
3716 if (!kvm_mwait_in_guest())
3717 min |= CPU_BASED_MWAIT_EXITING |
3718 CPU_BASED_MONITOR_EXITING;
3719
3720 opt = CPU_BASED_TPR_SHADOW | 3989 opt = CPU_BASED_TPR_SHADOW |
3721 CPU_BASED_USE_MSR_BITMAPS | 3990 CPU_BASED_USE_MSR_BITMAPS |
3722 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 3991 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -3835,7 +4104,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3835 vmcs_conf->size = vmx_msr_high & 0x1fff; 4104 vmcs_conf->size = vmx_msr_high & 0x1fff;
3836 vmcs_conf->order = get_order(vmcs_conf->size); 4105 vmcs_conf->order = get_order(vmcs_conf->size);
3837 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 4106 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3838 vmcs_conf->revision_id = vmx_msr_low; 4107
4108 /* KVM supports Enlightened VMCS v1 only */
4109 if (static_branch_unlikely(&enable_evmcs))
4110 vmcs_conf->revision_id = KVM_EVMCS_VERSION;
4111 else
4112 vmcs_conf->revision_id = vmx_msr_low;
3839 4113
3840 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 4114 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3841 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 4115 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -3843,6 +4117,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
3843 vmcs_conf->vmexit_ctrl = _vmexit_control; 4117 vmcs_conf->vmexit_ctrl = _vmexit_control;
3844 vmcs_conf->vmentry_ctrl = _vmentry_control; 4118 vmcs_conf->vmentry_ctrl = _vmentry_control;
3845 4119
4120 if (static_branch_unlikely(&enable_evmcs))
4121 evmcs_sanitize_exec_ctrls(vmcs_conf);
4122
3846 cpu_has_load_ia32_efer = 4123 cpu_has_load_ia32_efer =
3847 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 4124 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3848 VM_ENTRY_LOAD_IA32_EFER) 4125 VM_ENTRY_LOAD_IA32_EFER)
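
The setup_vmcs_config() changes belong to the Enlightened VMCS work: vmcs_conf is now zeroed before being filled in, the cached revision_id becomes the eVMCS version (v1 is all KVM supports) rather than bits 30:0 of MSR_IA32_VMX_BASIC when KVM itself runs on Hyper-V, and evmcs_sanitize_exec_ctrls() strips controls the eVMCS v1 layout has no fields for, matching the VP-assist-page check added to hardware_enable() earlier. A conceptual sketch only; the placeholder mask stands in for the real per-control lists in the eVMCS definitions:

/* Placeholder: the real lists of controls absent from eVMCS v1 live in the
 * Hyper-V eVMCS definitions, not in this hunk. */
#define EXAMPLE_EVMCS1_UNSUPPORTED_VMEXIT_CTRL  0

static void example_apply_evmcs_quirks(struct vmcs_config *vmcs_conf,
                                        bool evmcs_in_use, u32 vmx_msr_low)
{
        /* An Enlightened VMCS is identified by its own version number. */
        vmcs_conf->revision_id = evmcs_in_use ? KVM_EVMCS_VERSION : vmx_msr_low;

        /* Drop controls that cannot be expressed in the eVMCS v1 layout. */
        if (evmcs_in_use)
                vmcs_conf->vmexit_ctrl &= ~EXAMPLE_EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
}
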
@@ -4162,6 +4439,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
4162{ 4439{
4163 unsigned long flags; 4440 unsigned long flags;
4164 struct vcpu_vmx *vmx = to_vmx(vcpu); 4441 struct vcpu_vmx *vmx = to_vmx(vcpu);
4442 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
4165 4443
4166 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 4444 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
4167 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 4445 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -4177,13 +4455,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
4177 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 4455 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4178 * vcpu. Warn the user that an update is overdue. 4456 * vcpu. Warn the user that an update is overdue.
4179 */ 4457 */
4180 if (!vcpu->kvm->arch.tss_addr) 4458 if (!kvm_vmx->tss_addr)
4181 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 4459 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
4182 "called before entering vcpu\n"); 4460 "called before entering vcpu\n");
4183 4461
4184 vmx_segment_cache_clear(vmx); 4462 vmx_segment_cache_clear(vmx);
4185 4463
4186 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 4464 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
4187 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 4465 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
4188 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4466 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4189 4467
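
enter_rmode() now goes through to_kvm_vmx(): per-VM state that is VMX-specific (the TSS address here, the EPT identity-map bookkeeping further down) moves out of the generic kvm->arch into a vendor-private wrapper around struct kvm. Roughly, with the field list inferred from the conversions in this diff:

#include <linux/kvm_host.h>

struct kvm_vmx {
        struct kvm kvm;                 /* generic part, must stay first */

        unsigned int tss_addr;
        bool ept_identity_pagetable_done;
        gpa_t ept_identity_map_addr;
};

static inline struct kvm_vmx *example_to_kvm_vmx(struct kvm *kvm)
{
        return container_of(kvm, struct kvm_vmx, kvm);
}
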
@@ -4291,7 +4569,7 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
4291 4569
4292static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 4570static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
4293{ 4571{
4294 if (enable_ept && is_paging(vcpu)) 4572 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
4295 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4573 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4296 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 4574 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
4297} 4575}
@@ -4339,11 +4617,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
4339 4617
4340static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) 4618static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4341{ 4619{
4342 u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; 4620 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
4343 u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; 4621 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
4344 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4622 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4345 4623
4346 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & 4624 if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
4347 SECONDARY_EXEC_UNRESTRICTED_GUEST && 4625 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4348 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 4626 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4349 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); 4627 fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -4353,16 +4631,16 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4353 4631
4354static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) 4632static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
4355{ 4633{
4356 u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; 4634 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
4357 u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; 4635 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
4358 4636
4359 return fixed_bits_valid(val, fixed0, fixed1); 4637 return fixed_bits_valid(val, fixed0, fixed1);
4360} 4638}
4361 4639
4362static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) 4640static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
4363{ 4641{
4364 u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; 4642 u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
4365 u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; 4643 u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
4366 4644
4367 return fixed_bits_valid(val, fixed0, fixed1); 4645 return fixed_bits_valid(val, fixed0, fixed1);
4368} 4646}
@@ -4428,7 +4706,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4428 } 4706 }
4429#endif 4707#endif
4430 4708
4431 if (enable_ept) 4709 if (enable_ept && !enable_unrestricted_guest)
4432 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 4710 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4433 4711
4434 vmcs_writel(CR0_READ_SHADOW, cr0); 4712 vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -4469,10 +4747,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
4469 if (enable_ept) { 4747 if (enable_ept) {
4470 eptp = construct_eptp(vcpu, cr3); 4748 eptp = construct_eptp(vcpu, cr3);
4471 vmcs_write64(EPT_POINTER, eptp); 4749 vmcs_write64(EPT_POINTER, eptp);
4472 if (is_paging(vcpu) || is_guest_mode(vcpu)) 4750 if (enable_unrestricted_guest || is_paging(vcpu) ||
4751 is_guest_mode(vcpu))
4473 guest_cr3 = kvm_read_cr3(vcpu); 4752 guest_cr3 = kvm_read_cr3(vcpu);
4474 else 4753 else
4475 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; 4754 guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
4476 ept_load_pdptrs(vcpu); 4755 ept_load_pdptrs(vcpu);
4477 } 4756 }
4478 4757
@@ -4487,11 +4766,15 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4487 * is in force while we are in guest mode. Do not let guests control 4766 * is in force while we are in guest mode. Do not let guests control
4488 * this bit, even if host CR4.MCE == 0. 4767 * this bit, even if host CR4.MCE == 0.
4489 */ 4768 */
4490 unsigned long hw_cr4 = 4769 unsigned long hw_cr4;
4491 (cr4_read_shadow() & X86_CR4_MCE) | 4770
4492 (cr4 & ~X86_CR4_MCE) | 4771 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
4493 (to_vmx(vcpu)->rmode.vm86_active ? 4772 if (enable_unrestricted_guest)
4494 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 4773 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
4774 else if (to_vmx(vcpu)->rmode.vm86_active)
4775 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
4776 else
4777 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
4495 4778
4496 if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { 4779 if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
4497 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, 4780 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
@@ -4517,16 +4800,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4517 return 1; 4800 return 1;
4518 4801
4519 vcpu->arch.cr4 = cr4; 4802 vcpu->arch.cr4 = cr4;
4520 if (enable_ept) { 4803
4521 if (!is_paging(vcpu)) { 4804 if (!enable_unrestricted_guest) {
4522 hw_cr4 &= ~X86_CR4_PAE; 4805 if (enable_ept) {
4523 hw_cr4 |= X86_CR4_PSE; 4806 if (!is_paging(vcpu)) {
4524 } else if (!(cr4 & X86_CR4_PAE)) { 4807 hw_cr4 &= ~X86_CR4_PAE;
4525 hw_cr4 &= ~X86_CR4_PAE; 4808 hw_cr4 |= X86_CR4_PSE;
4809 } else if (!(cr4 & X86_CR4_PAE)) {
4810 hw_cr4 &= ~X86_CR4_PAE;
4811 }
4526 } 4812 }
4527 }
4528 4813
4529 if (!enable_unrestricted_guest && !is_paging(vcpu))
4530 /* 4814 /*
4531 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 4815 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
4532 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 4816 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
@@ -4538,7 +4822,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
4538 * If enable_unrestricted_guest, the CPU automatically 4822 * If enable_unrestricted_guest, the CPU automatically
4539 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 4823 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
4540 */ 4824 */
4541 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 4825 if (!is_paging(vcpu))
4826 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
4827 }
4542 4828
4543 vmcs_writel(CR4_READ_SHADOW, cr4); 4829 vmcs_writel(CR4_READ_SHADOW, cr4);
4544 vmcs_writel(GUEST_CR4, hw_cr4); 4830 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -4906,7 +5192,7 @@ static int init_rmode_tss(struct kvm *kvm)
4906 int idx, r; 5192 int idx, r;
4907 5193
4908 idx = srcu_read_lock(&kvm->srcu); 5194 idx = srcu_read_lock(&kvm->srcu);
4909 fn = kvm->arch.tss_addr >> PAGE_SHIFT; 5195 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
4910 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 5196 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4911 if (r < 0) 5197 if (r < 0)
4912 goto out; 5198 goto out;
@@ -4932,22 +5218,23 @@ out:
4932 5218
4933static int init_rmode_identity_map(struct kvm *kvm) 5219static int init_rmode_identity_map(struct kvm *kvm)
4934{ 5220{
5221 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4935 int i, idx, r = 0; 5222 int i, idx, r = 0;
4936 kvm_pfn_t identity_map_pfn; 5223 kvm_pfn_t identity_map_pfn;
4937 u32 tmp; 5224 u32 tmp;
4938 5225
4939 /* Protect kvm->arch.ept_identity_pagetable_done. */ 5226 /* Protect kvm_vmx->ept_identity_pagetable_done. */
4940 mutex_lock(&kvm->slots_lock); 5227 mutex_lock(&kvm->slots_lock);
4941 5228
4942 if (likely(kvm->arch.ept_identity_pagetable_done)) 5229 if (likely(kvm_vmx->ept_identity_pagetable_done))
4943 goto out2; 5230 goto out2;
4944 5231
4945 if (!kvm->arch.ept_identity_map_addr) 5232 if (!kvm_vmx->ept_identity_map_addr)
4946 kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 5233 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4947 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 5234 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
4948 5235
4949 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 5236 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4950 kvm->arch.ept_identity_map_addr, PAGE_SIZE); 5237 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
4951 if (r < 0) 5238 if (r < 0)
4952 goto out2; 5239 goto out2;
4953 5240
@@ -4964,7 +5251,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
4964 if (r < 0) 5251 if (r < 0)
4965 goto out; 5252 goto out;
4966 } 5253 }
4967 kvm->arch.ept_identity_pagetable_done = true; 5254 kvm_vmx->ept_identity_pagetable_done = true;
4968 5255
4969out: 5256out:
4970 srcu_read_unlock(&kvm->srcu, idx); 5257 srcu_read_unlock(&kvm->srcu, idx);
@@ -5500,6 +5787,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
5500 exec_control |= CPU_BASED_CR3_STORE_EXITING | 5787 exec_control |= CPU_BASED_CR3_STORE_EXITING |
5501 CPU_BASED_CR3_LOAD_EXITING | 5788 CPU_BASED_CR3_LOAD_EXITING |
5502 CPU_BASED_INVLPG_EXITING; 5789 CPU_BASED_INVLPG_EXITING;
5790 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
5791 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
5792 CPU_BASED_MONITOR_EXITING);
5793 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
5794 exec_control &= ~CPU_BASED_HLT_EXITING;
5503 return exec_control; 5795 return exec_control;
5504} 5796}
5505 5797
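
MWAIT/MONITOR exiting is no longer forced on globally in setup_vmcs_config() (the kvm_mwait_in_guest() block removed earlier in this diff); instead vmx_exec_control() clears the MWAIT/MONITOR and HLT exiting bits per VM when userspace has opted in. A hedged sketch of that opt-in from the userspace side, using the capability and flag names introduced by the disable-MWAIT/HLT/PAUSE-vmexits series in this pull; verify them against your uapi headers:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Call on the VM fd before any vCPU is created. */
static int example_disable_idle_exits(int vm_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_X86_DISABLE_EXITS;
        cap.args[0] = KVM_X86_DISABLE_EXITS_MWAIT |
                      KVM_X86_DISABLE_EXITS_HLT |
                      KVM_X86_DISABLE_EXITS_PAUSE;

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
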
@@ -5533,7 +5825,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5533 } 5825 }
5534 if (!enable_unrestricted_guest) 5826 if (!enable_unrestricted_guest)
5535 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 5827 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
5536 if (!ple_gap) 5828 if (kvm_pause_in_guest(vmx->vcpu.kvm))
5537 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 5829 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
5538 if (!kvm_vcpu_apicv_active(vcpu)) 5830 if (!kvm_vcpu_apicv_active(vcpu))
5539 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 5831 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -5565,10 +5857,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5565 5857
5566 if (nested) { 5858 if (nested) {
5567 if (xsaves_enabled) 5859 if (xsaves_enabled)
5568 vmx->nested.nested_vmx_secondary_ctls_high |= 5860 vmx->nested.msrs.secondary_ctls_high |=
5569 SECONDARY_EXEC_XSAVES; 5861 SECONDARY_EXEC_XSAVES;
5570 else 5862 else
5571 vmx->nested.nested_vmx_secondary_ctls_high &= 5863 vmx->nested.msrs.secondary_ctls_high &=
5572 ~SECONDARY_EXEC_XSAVES; 5864 ~SECONDARY_EXEC_XSAVES;
5573 } 5865 }
5574 } 5866 }
@@ -5580,10 +5872,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5580 5872
5581 if (nested) { 5873 if (nested) {
5582 if (rdtscp_enabled) 5874 if (rdtscp_enabled)
5583 vmx->nested.nested_vmx_secondary_ctls_high |= 5875 vmx->nested.msrs.secondary_ctls_high |=
5584 SECONDARY_EXEC_RDTSCP; 5876 SECONDARY_EXEC_RDTSCP;
5585 else 5877 else
5586 vmx->nested.nested_vmx_secondary_ctls_high &= 5878 vmx->nested.msrs.secondary_ctls_high &=
5587 ~SECONDARY_EXEC_RDTSCP; 5879 ~SECONDARY_EXEC_RDTSCP;
5588 } 5880 }
5589 } 5881 }
@@ -5601,10 +5893,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5601 5893
5602 if (nested) { 5894 if (nested) {
5603 if (invpcid_enabled) 5895 if (invpcid_enabled)
5604 vmx->nested.nested_vmx_secondary_ctls_high |= 5896 vmx->nested.msrs.secondary_ctls_high |=
5605 SECONDARY_EXEC_ENABLE_INVPCID; 5897 SECONDARY_EXEC_ENABLE_INVPCID;
5606 else 5898 else
5607 vmx->nested.nested_vmx_secondary_ctls_high &= 5899 vmx->nested.msrs.secondary_ctls_high &=
5608 ~SECONDARY_EXEC_ENABLE_INVPCID; 5900 ~SECONDARY_EXEC_ENABLE_INVPCID;
5609 } 5901 }
5610 } 5902 }
@@ -5616,10 +5908,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5616 5908
5617 if (nested) { 5909 if (nested) {
5618 if (rdrand_enabled) 5910 if (rdrand_enabled)
5619 vmx->nested.nested_vmx_secondary_ctls_high |= 5911 vmx->nested.msrs.secondary_ctls_high |=
5620 SECONDARY_EXEC_RDRAND_EXITING; 5912 SECONDARY_EXEC_RDRAND_EXITING;
5621 else 5913 else
5622 vmx->nested.nested_vmx_secondary_ctls_high &= 5914 vmx->nested.msrs.secondary_ctls_high &=
5623 ~SECONDARY_EXEC_RDRAND_EXITING; 5915 ~SECONDARY_EXEC_RDRAND_EXITING;
5624 } 5916 }
5625 } 5917 }
@@ -5631,10 +5923,10 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
5631 5923
5632 if (nested) { 5924 if (nested) {
5633 if (rdseed_enabled) 5925 if (rdseed_enabled)
5634 vmx->nested.nested_vmx_secondary_ctls_high |= 5926 vmx->nested.msrs.secondary_ctls_high |=
5635 SECONDARY_EXEC_RDSEED_EXITING; 5927 SECONDARY_EXEC_RDSEED_EXITING;
5636 else 5928 else
5637 vmx->nested.nested_vmx_secondary_ctls_high &= 5929 vmx->nested.msrs.secondary_ctls_high &=
5638 ~SECONDARY_EXEC_RDSEED_EXITING; 5930 ~SECONDARY_EXEC_RDSEED_EXITING;
5639 } 5931 }
5640 } 5932 }
@@ -5696,7 +5988,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
5696 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 5988 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5697 } 5989 }
5698 5990
5699 if (ple_gap) { 5991 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
5700 vmcs_write32(PLE_GAP, ple_gap); 5992 vmcs_write32(PLE_GAP, ple_gap);
5701 vmx->ple_window = ple_window; 5993 vmx->ple_window = ple_window;
5702 vmx->ple_window_dirty = true; 5994 vmx->ple_window_dirty = true;
@@ -5861,6 +6153,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5861 update_exception_bitmap(vcpu); 6153 update_exception_bitmap(vcpu);
5862 6154
5863 vpid_sync_context(vmx->vpid); 6155 vpid_sync_context(vmx->vpid);
6156 if (init_event)
6157 vmx_clear_hlt(vcpu);
5864} 6158}
5865 6159
5866/* 6160/*
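
The vmx_clear_hlt() call added to vmx_vcpu_reset() above, like the ones added at the interrupt and NMI injection sites below, goes with HLT-in-guest mode: with HLT exiting disabled the processor parks a halted vCPU in the HLT activity state, so KVM has to flip it back to active when it injects an event or handles INIT, just as a real interrupt would wake the CPU. A hedged sketch of what the helper presumably does:

static void example_clear_hlt(struct kvm_vcpu *vcpu)
{
        /* Only relevant when HLT exits are disabled for this VM. */
        if (!kvm_hlt_in_guest(vcpu->kvm))
                return;

        if (vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
                vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
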
@@ -5885,8 +6179,7 @@ static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
5885 6179
5886static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 6180static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
5887{ 6181{
5888 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 6182 return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
5889 PIN_BASED_NMI_EXITING;
5890} 6183}
5891 6184
5892static void enable_irq_window(struct kvm_vcpu *vcpu) 6185static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -5932,6 +6225,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5932 } else 6225 } else
5933 intr |= INTR_TYPE_EXT_INTR; 6226 intr |= INTR_TYPE_EXT_INTR;
5934 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 6227 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
6228
6229 vmx_clear_hlt(vcpu);
5935} 6230}
5936 6231
5937static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 6232static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -5962,6 +6257,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5962 6257
5963 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 6258 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5964 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 6259 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
6260
6261 vmx_clear_hlt(vcpu);
5965} 6262}
5966 6263
5967static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 6264static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -6024,14 +6321,23 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
6024{ 6321{
6025 int ret; 6322 int ret;
6026 6323
6324 if (enable_unrestricted_guest)
6325 return 0;
6326
6027 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 6327 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
6028 PAGE_SIZE * 3); 6328 PAGE_SIZE * 3);
6029 if (ret) 6329 if (ret)
6030 return ret; 6330 return ret;
6031 kvm->arch.tss_addr = addr; 6331 to_kvm_vmx(kvm)->tss_addr = addr;
6032 return init_rmode_tss(kvm); 6332 return init_rmode_tss(kvm);
6033} 6333}
6034 6334
6335static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
6336{
6337 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
6338 return 0;
6339}
6340
6035static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 6341static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6036{ 6342{
6037 switch (vec) { 6343 switch (vec) {
@@ -6134,19 +6440,24 @@ static int handle_exception(struct kvm_vcpu *vcpu)
6134 if (is_nmi(intr_info)) 6440 if (is_nmi(intr_info))
6135 return 1; /* already handled by vmx_vcpu_run() */ 6441 return 1; /* already handled by vmx_vcpu_run() */
6136 6442
6137 if (is_invalid_opcode(intr_info)) { 6443 if (is_invalid_opcode(intr_info))
6138 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 6444 return handle_ud(vcpu);
6139 if (er == EMULATE_USER_EXIT)
6140 return 0;
6141 if (er != EMULATE_DONE)
6142 kvm_queue_exception(vcpu, UD_VECTOR);
6143 return 1;
6144 }
6145 6445
6146 error_code = 0; 6446 error_code = 0;
6147 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 6447 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6148 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6448 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6149 6449
6450 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
6451 WARN_ON_ONCE(!enable_vmware_backdoor);
6452 er = emulate_instruction(vcpu,
6453 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
6454 if (er == EMULATE_USER_EXIT)
6455 return 0;
6456 else if (er != EMULATE_DONE)
6457 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
6458 return 1;
6459 }
6460
6150 /* 6461 /*
6151 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 6462 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
6152 * MMIO, it is better to report an internal error. 6463 * MMIO, it is better to report an internal error.
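
Two behavioural changes land in handle_exception(): #UD is routed through the new common handle_ud() helper, which also implements the force-emulation prefix used to exercise the emulator from a guest, and a #GP taken outside vm86 mode is fed to the emulator when the VMware backdoor is enabled, since VMware-style guests expect privileged I/O port accesses to raise #GP and be emulated rather than exit. A standalone illustration of the prefix idea; the byte sequence is taken from that series' description and is not quoted from this file:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* ud2 followed by the ASCII bytes "kvm". */
static const unsigned char kvm_emul_prefix[] = { 0x0f, 0x0b, 'k', 'v', 'm' };

static bool has_forced_emulation_prefix(const unsigned char *insn, size_t len)
{
        return len >= sizeof(kvm_emul_prefix) &&
               !memcmp(insn, kvm_emul_prefix, sizeof(kvm_emul_prefix));
}

int main(void)
{
        /* prefix + cpuid: the hypervisor skips five bytes, then emulates. */
        unsigned char stream[] = { 0x0f, 0x0b, 'k', 'v', 'm', 0x0f, 0xa2 };

        if (has_forced_emulation_prefix(stream, sizeof(stream)))
                printf("emulate the instruction after the %zu-byte prefix\n",
                       sizeof(kvm_emul_prefix));
        return 0;
}
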
@@ -6232,28 +6543,22 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu)
6232static int handle_io(struct kvm_vcpu *vcpu) 6543static int handle_io(struct kvm_vcpu *vcpu)
6233{ 6544{
6234 unsigned long exit_qualification; 6545 unsigned long exit_qualification;
6235 int size, in, string, ret; 6546 int size, in, string;
6236 unsigned port; 6547 unsigned port;
6237 6548
6238 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6549 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6239 string = (exit_qualification & 16) != 0; 6550 string = (exit_qualification & 16) != 0;
6240 in = (exit_qualification & 8) != 0;
6241 6551
6242 ++vcpu->stat.io_exits; 6552 ++vcpu->stat.io_exits;
6243 6553
6244 if (string || in) 6554 if (string)
6245 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 6555 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
6246 6556
6247 port = exit_qualification >> 16; 6557 port = exit_qualification >> 16;
6248 size = (exit_qualification & 7) + 1; 6558 size = (exit_qualification & 7) + 1;
6559 in = (exit_qualification & 8) != 0;
6249 6560
6250 ret = kvm_skip_emulated_instruction(vcpu); 6561 return kvm_fast_pio(vcpu, size, port, in);
6251
6252 /*
6253 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
6254 * KVM_EXIT_DEBUG here.
6255 */
6256 return kvm_fast_pio_out(vcpu, size, port) && ret;
6257} 6562}
6258 6563
6259static void 6564static void
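
handle_io() now sends non-string IN as well as OUT through the fast PIO path, with the direction passed to kvm_fast_pio(), which also takes over skipping the instruction (and whatever single-step debug handling goes with it) instead of bouncing IN through the full emulator. For reference, the exit-qualification decode used above, restated as a standalone snippet; the bit layout matches the masks in the code (Intel SDM, I/O instruction exit qualification):

#include <stdio.h>

struct io_exit_info {
        unsigned int size;      /* 1, 2 or 4 bytes */
        int in;                 /* 1 = IN, 0 = OUT */
        int string;             /* INS/OUTS        */
        unsigned int port;      /* I/O port number */
};

static struct io_exit_info decode_io_exit(unsigned long qual)
{
        struct io_exit_info info = {
                .size   = (unsigned int)(qual & 7) + 1,        /* bits 2:0: size - 1 */
                .in     = (qual & 8) != 0,                     /* bit 3: direction   */
                .string = (qual & 16) != 0,                    /* bit 4: string op   */
                .port   = (unsigned int)(qual >> 16) & 0xffff, /* bits 31:16: port   */
        };
        return info;
}

int main(void)
{
        struct io_exit_info i = decode_io_exit(0x00790001);    /* made-up value */

        printf("%s port %#x, %u byte(s)\n", i.in ? "in" : "out", i.port, i.size);
        return 0;
}
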
@@ -6344,6 +6649,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
6344 err = handle_set_cr0(vcpu, val); 6649 err = handle_set_cr0(vcpu, val);
6345 return kvm_complete_insn_gp(vcpu, err); 6650 return kvm_complete_insn_gp(vcpu, err);
6346 case 3: 6651 case 3:
6652 WARN_ON_ONCE(enable_unrestricted_guest);
6347 err = kvm_set_cr3(vcpu, val); 6653 err = kvm_set_cr3(vcpu, val);
6348 return kvm_complete_insn_gp(vcpu, err); 6654 return kvm_complete_insn_gp(vcpu, err);
6349 case 4: 6655 case 4:
@@ -6376,6 +6682,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
6376 case 1: /*mov from cr*/ 6682 case 1: /*mov from cr*/
6377 switch (cr) { 6683 switch (cr) {
6378 case 3: 6684 case 3:
6685 WARN_ON_ONCE(enable_unrestricted_guest);
6379 val = kvm_read_cr3(vcpu); 6686 val = kvm_read_cr3(vcpu);
6380 kvm_register_write(vcpu, reg, val); 6687 kvm_register_write(vcpu, reg, val);
6381 trace_kvm_cr_read(cr, val); 6688 trace_kvm_cr_read(cr, val);
@@ -6769,7 +7076,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6769 7076
6770static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 7077static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6771{ 7078{
6772 int ret;
6773 gpa_t gpa; 7079 gpa_t gpa;
6774 7080
6775 /* 7081 /*
@@ -6797,17 +7103,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6797 NULL, 0) == EMULATE_DONE; 7103 NULL, 0) == EMULATE_DONE;
6798 } 7104 }
6799 7105
6800 ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 7106 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
6801 if (ret >= 0)
6802 return ret;
6803
6804 /* It is the real ept misconfig */
6805 WARN_ON(1);
6806
6807 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
6808 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
6809
6810 return 0;
6811} 7107}
6812 7108
6813static int handle_nmi_window(struct kvm_vcpu *vcpu) 7109static int handle_nmi_window(struct kvm_vcpu *vcpu)
@@ -6830,6 +7126,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6830 bool intr_window_requested; 7126 bool intr_window_requested;
6831 unsigned count = 130; 7127 unsigned count = 130;
6832 7128
7129 /*
7130 * We should never reach the point where we are emulating L2
7131 * due to invalid guest state as that means we incorrectly
7132 * allowed a nested VMEntry with an invalid vmcs12.
7133 */
7134 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
7135
6833 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 7136 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6834 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 7137 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
6835 7138
@@ -6848,12 +7151,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6848 goto out; 7151 goto out;
6849 } 7152 }
6850 7153
6851 if (err != EMULATE_DONE) { 7154 if (err != EMULATE_DONE)
6852 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 7155 goto emulation_error;
6853 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 7156
6854 vcpu->run->internal.ndata = 0; 7157 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
6855 return 0; 7158 vcpu->arch.exception.pending)
6856 } 7159 goto emulation_error;
6857 7160
6858 if (vcpu->arch.halt_request) { 7161 if (vcpu->arch.halt_request) {
6859 vcpu->arch.halt_request = 0; 7162 vcpu->arch.halt_request = 0;
@@ -6869,34 +7172,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6869 7172
6870out: 7173out:
6871 return ret; 7174 return ret;
6872}
6873
6874static int __grow_ple_window(int val)
6875{
6876 if (ple_window_grow < 1)
6877 return ple_window;
6878
6879 val = min(val, ple_window_actual_max);
6880
6881 if (ple_window_grow < ple_window)
6882 val *= ple_window_grow;
6883 else
6884 val += ple_window_grow;
6885
6886 return val;
6887}
6888 7175
6889static int __shrink_ple_window(int val, int modifier, int minimum) 7176emulation_error:
6890{ 7177 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6891 if (modifier < 1) 7178 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
6892 return ple_window; 7179 vcpu->run->internal.ndata = 0;
6893 7180 return 0;
6894 if (modifier < ple_window)
6895 val /= modifier;
6896 else
6897 val -= modifier;
6898
6899 return max(val, minimum);
6900} 7181}
6901 7182
6902static void grow_ple_window(struct kvm_vcpu *vcpu) 7183static void grow_ple_window(struct kvm_vcpu *vcpu)
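
handle_invalid_guest_state() now funnels its two failure cases, the emulator giving up and an exception left pending while emulation is still required outside vm86 mode, through a single emulation_error label reporting KVM_INTERNAL_ERROR_EMULATION, and the new WARN_ON_ONCE documents that this loop must never run on behalf of L2, because a nested VMEntry with invalid state should have been rejected earlier. Seen from userspace this is an ordinary internal-error exit; a minimal check using long-standing kvm_run fields:

#include <stdio.h>
#include <linux/kvm.h>

static int example_check_run_exit(const struct kvm_run *run)
{
        if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
            run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
                fprintf(stderr, "kvm: instruction emulation failed (ndata=%u)\n",
                        run->internal.ndata);
                return -1;
        }
        return 0;
}
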
@@ -6904,7 +7185,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
6904 struct vcpu_vmx *vmx = to_vmx(vcpu); 7185 struct vcpu_vmx *vmx = to_vmx(vcpu);
6905 int old = vmx->ple_window; 7186 int old = vmx->ple_window;
6906 7187
6907 vmx->ple_window = __grow_ple_window(old); 7188 vmx->ple_window = __grow_ple_window(old, ple_window,
7189 ple_window_grow,
7190 ple_window_max);
6908 7191
6909 if (vmx->ple_window != old) 7192 if (vmx->ple_window != old)
6910 vmx->ple_window_dirty = true; 7193 vmx->ple_window_dirty = true;
@@ -6917,8 +7200,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
6917 struct vcpu_vmx *vmx = to_vmx(vcpu); 7200 struct vcpu_vmx *vmx = to_vmx(vcpu);
6918 int old = vmx->ple_window; 7201 int old = vmx->ple_window;
6919 7202
6920 vmx->ple_window = __shrink_ple_window(old, 7203 vmx->ple_window = __shrink_ple_window(old, ple_window,
6921 ple_window_shrink, ple_window); 7204 ple_window_shrink,
7205 ple_window);
6922 7206
6923 if (vmx->ple_window != old) 7207 if (vmx->ple_window != old)
6924 vmx->ple_window_dirty = true; 7208 vmx->ple_window_dirty = true;
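
The VMX-local __grow_ple_window()/__shrink_ple_window() helpers (removed in the hunks above and below, along with update_ple_window_actual_max()) give way to shared variants that take the base value, modifier and bound as explicit parameters, presumably so the SVM pause-loop-exiting support in this same pull can reuse them. A hedged reconstruction of the grow helper, consistent with the removed code and the new four-argument call sites; the real shared implementation lives outside this file:

static unsigned int example_grow_ple_window(unsigned int old, unsigned int base,
                                            unsigned int modifier, unsigned int max)
{
        unsigned long long ret = old;   /* widen before clamping to dodge overflow */

        if (modifier < 1)
                return base;

        if (modifier < base)
                ret *= modifier;        /* small modifier: multiplicative growth */
        else
                ret += modifier;        /* large modifier: additive growth */

        /* Clamping here replaces the old ple_window_actual_max precomputation. */
        return ret > max ? max : (unsigned int)ret;
}
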
@@ -6927,21 +7211,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
6927} 7211}
6928 7212
6929/* 7213/*
6930 * ple_window_actual_max is computed to be one grow_ple_window() below
6931 * ple_window_max. (See __grow_ple_window for the reason.)
6932 * This prevents overflows, because ple_window_max is int.
6933 * ple_window_max effectively rounded down to a multiple of ple_window_grow in
6934 * this process.
6935 * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
6936 */
6937static void update_ple_window_actual_max(void)
6938{
6939 ple_window_actual_max =
6940 __shrink_ple_window(max(ple_window_max, ple_window),
6941 ple_window_grow, INT_MIN);
6942}
6943
6944/*
6945 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 7214 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6946 */ 7215 */
6947static void wakeup_handler(void) 7216static void wakeup_handler(void)
@@ -6960,7 +7229,7 @@ static void wakeup_handler(void)
6960 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 7229 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6961} 7230}
6962 7231
6963void vmx_enable_tdp(void) 7232static void vmx_enable_tdp(void)
6964{ 7233{
6965 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, 7234 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6966 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, 7235 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
@@ -7061,8 +7330,6 @@ static __init int hardware_setup(void)
7061 else 7330 else
7062 kvm_disable_tdp(); 7331 kvm_disable_tdp();
7063 7332
7064 update_ple_window_actual_max();
7065
7066 /* 7333 /*
7067 * Only enable PML when hardware supports PML feature, and both EPT 7334 * Only enable PML when hardware supports PML feature, and both EPT
7068 * and EPT A/D bit features are enabled -- PML depends on them to work. 7335 * and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -7094,6 +7361,7 @@ static __init int hardware_setup(void)
7094 init_vmcs_shadow_fields(); 7361 init_vmcs_shadow_fields();
7095 7362
7096 kvm_set_posted_intr_wakeup_handler(wakeup_handler); 7363 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
7364 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
7097 7365
7098 kvm_mce_cap_supported |= MCG_LMCE_P; 7366 kvm_mce_cap_supported |= MCG_LMCE_P;
7099 7367
@@ -7122,7 +7390,7 @@ static __exit void hardware_unsetup(void)
7122 */ 7390 */
7123static int handle_pause(struct kvm_vcpu *vcpu) 7391static int handle_pause(struct kvm_vcpu *vcpu)
7124{ 7392{
7125 if (ple_gap) 7393 if (!kvm_pause_in_guest(vcpu->kvm))
7126 grow_ple_window(vcpu); 7394 grow_ple_window(vcpu);
7127 7395
7128 /* 7396 /*
@@ -7954,9 +8222,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
7954 u64 eptp, gpa; 8222 u64 eptp, gpa;
7955 } operand; 8223 } operand;
7956 8224
7957 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 8225 if (!(vmx->nested.msrs.secondary_ctls_high &
7958 SECONDARY_EXEC_ENABLE_EPT) || 8226 SECONDARY_EXEC_ENABLE_EPT) ||
7959 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 8227 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
7960 kvm_queue_exception(vcpu, UD_VECTOR); 8228 kvm_queue_exception(vcpu, UD_VECTOR);
7961 return 1; 8229 return 1;
7962 } 8230 }
@@ -7967,7 +8235,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
7967 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 8235 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7968 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 8236 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7969 8237
7970 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 8238 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
7971 8239
7972 if (type >= 32 || !(types & (1 << type))) { 8240 if (type >= 32 || !(types & (1 << type))) {
7973 nested_vmx_failValid(vcpu, 8241 nested_vmx_failValid(vcpu,
@@ -8018,9 +8286,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
8018 u64 gla; 8286 u64 gla;
8019 } operand; 8287 } operand;
8020 8288
8021 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 8289 if (!(vmx->nested.msrs.secondary_ctls_high &
8022 SECONDARY_EXEC_ENABLE_VPID) || 8290 SECONDARY_EXEC_ENABLE_VPID) ||
8023 !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { 8291 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
8024 kvm_queue_exception(vcpu, UD_VECTOR); 8292 kvm_queue_exception(vcpu, UD_VECTOR);
8025 return 1; 8293 return 1;
8026 } 8294 }
@@ -8031,7 +8299,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
8031 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 8299 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
8032 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 8300 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
8033 8301
8034 types = (vmx->nested.nested_vmx_vpid_caps & 8302 types = (vmx->nested.msrs.vpid_caps &
8035 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 8303 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
8036 8304
8037 if (type >= 32 || !(types & (1 << type))) { 8305 if (type >= 32 || !(types & (1 << type))) {
@@ -8125,11 +8393,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
8125 /* Check for memory type validity */ 8393 /* Check for memory type validity */
8126 switch (address & VMX_EPTP_MT_MASK) { 8394 switch (address & VMX_EPTP_MT_MASK) {
8127 case VMX_EPTP_MT_UC: 8395 case VMX_EPTP_MT_UC:
8128 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)) 8396 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
8129 return false; 8397 return false;
8130 break; 8398 break;
8131 case VMX_EPTP_MT_WB: 8399 case VMX_EPTP_MT_WB:
8132 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT)) 8400 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
8133 return false; 8401 return false;
8134 break; 8402 break;
8135 default: 8403 default:
@@ -8146,7 +8414,7 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
8146 8414
8147 /* AD, if set, should be supported */ 8415 /* AD, if set, should be supported */
8148 if (address & VMX_EPTP_AD_ENABLE_BIT) { 8416 if (address & VMX_EPTP_AD_ENABLE_BIT) {
8149 if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT)) 8417 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
8150 return false; 8418 return false;
8151 } 8419 }
8152 8420
@@ -8790,7 +9058,8 @@ static void dump_vmcs(void)
8790 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 9058 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
8791 vmcs_read64(GUEST_IA32_DEBUGCTL), 9059 vmcs_read64(GUEST_IA32_DEBUGCTL),
8792 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 9060 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
8793 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 9061 if (cpu_has_load_perf_global_ctrl &&
9062 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
8794 pr_err("PerfGlobCtl = 0x%016llx\n", 9063 pr_err("PerfGlobCtl = 0x%016llx\n",
8795 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 9064 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
8796 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 9065 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
@@ -8826,7 +9095,8 @@ static void dump_vmcs(void)
8826 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 9095 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
8827 vmcs_read64(HOST_IA32_EFER), 9096 vmcs_read64(HOST_IA32_EFER),
8828 vmcs_read64(HOST_IA32_PAT)); 9097 vmcs_read64(HOST_IA32_PAT));
8829 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 9098 if (cpu_has_load_perf_global_ctrl &&
9099 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
8830 pr_err("PerfGlobCtl = 0x%016llx\n", 9100 pr_err("PerfGlobCtl = 0x%016llx\n",
8831 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 9101 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
8832 9102
@@ -9178,9 +9448,9 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
9178 9448
9179 /* We need to handle NMIs before interrupts are enabled */ 9449 /* We need to handle NMIs before interrupts are enabled */
9180 if (is_nmi(exit_intr_info)) { 9450 if (is_nmi(exit_intr_info)) {
9181 kvm_before_handle_nmi(&vmx->vcpu); 9451 kvm_before_interrupt(&vmx->vcpu);
9182 asm("int $2"); 9452 asm("int $2");
9183 kvm_after_handle_nmi(&vmx->vcpu); 9453 kvm_after_interrupt(&vmx->vcpu);
9184 } 9454 }
9185} 9455}
9186 9456
@@ -9403,7 +9673,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
9403static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 9673static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9404{ 9674{
9405 struct vcpu_vmx *vmx = to_vmx(vcpu); 9675 struct vcpu_vmx *vmx = to_vmx(vcpu);
9406 unsigned long cr3, cr4; 9676 unsigned long cr3, cr4, evmcs_rsp;
9407 9677
9408 /* Record the guest's net vcpu time for enforced NMI injections. */ 9678 /* Record the guest's net vcpu time for enforced NMI injections. */
9409 if (unlikely(!enable_vnmi && 9679 if (unlikely(!enable_vnmi &&
@@ -9469,6 +9739,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9469 native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); 9739 native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
9470 9740
9471 vmx->__launched = vmx->loaded_vmcs->launched; 9741 vmx->__launched = vmx->loaded_vmcs->launched;
9742
9743 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
9744 (unsigned long)&current_evmcs->host_rsp : 0;
9745
9472 asm( 9746 asm(
9473 /* Store host registers */ 9747 /* Store host registers */
9474 "push %%" _ASM_DX "; push %%" _ASM_BP ";" 9748 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9477,15 +9751,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9477 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 9751 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
9478 "je 1f \n\t" 9752 "je 1f \n\t"
9479 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 9753 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
9754 /* Avoid VMWRITE when Enlightened VMCS is in use */
9755 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
9756 "jz 2f \n\t"
9757 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
9758 "jmp 1f \n\t"
9759 "2: \n\t"
9480 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 9760 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
9481 "1: \n\t" 9761 "1: \n\t"
9482 /* Reload cr2 if changed */ 9762 /* Reload cr2 if changed */
9483 "mov %c[cr2](%0), %%" _ASM_AX " \n\t" 9763 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
9484 "mov %%cr2, %%" _ASM_DX " \n\t" 9764 "mov %%cr2, %%" _ASM_DX " \n\t"
9485 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" 9765 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
9486 "je 2f \n\t" 9766 "je 3f \n\t"
9487 "mov %%" _ASM_AX", %%cr2 \n\t" 9767 "mov %%" _ASM_AX", %%cr2 \n\t"
9488 "2: \n\t" 9768 "3: \n\t"
9489 /* Check if vmlaunch or vmresume is needed */ 9769
9490 "cmpl $0, %c[launched](%0) \n\t" 9770 "cmpl $0, %c[launched](%0) \n\t"
9491 /* Load guest registers. Don't clobber flags. */ 9771 /* Load guest registers. Don't clobber flags. */
@@ -9554,7 +9834,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9554 ".global vmx_return \n\t" 9834 ".global vmx_return \n\t"
9555 "vmx_return: " _ASM_PTR " 2b \n\t" 9835 "vmx_return: " _ASM_PTR " 2b \n\t"
9556 ".popsection" 9836 ".popsection"
9557 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 9837 : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
9558 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 9838 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
9559 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 9839 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
9560 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 9840 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
@@ -9579,10 +9859,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9579 [wordsize]"i"(sizeof(ulong)) 9859 [wordsize]"i"(sizeof(ulong))
9580 : "cc", "memory" 9860 : "cc", "memory"
9581#ifdef CONFIG_X86_64 9861#ifdef CONFIG_X86_64
9582 , "rax", "rbx", "rdi", "rsi" 9862 , "rax", "rbx", "rdi"
9583 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 9863 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
9584#else 9864#else
9585 , "eax", "ebx", "edi", "esi" 9865 , "eax", "ebx", "edi"
9586#endif 9866#endif
9587 ); 9867 );
9588 9868
@@ -9610,6 +9890,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9610 /* Eliminate branch target predictions from guest mode */ 9890 /* Eliminate branch target predictions from guest mode */
9611 vmexit_fill_RSB(); 9891 vmexit_fill_RSB();
9612 9892
9893 /* All fields are clean at this point */
9894 if (static_branch_unlikely(&enable_evmcs))
9895 current_evmcs->hv_clean_fields |=
9896 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
9897
9613 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 9898 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
9614 if (vmx->host_debugctlmsr) 9899 if (vmx->host_debugctlmsr)
9615 update_debugctlmsr(vmx->host_debugctlmsr); 9900 update_debugctlmsr(vmx->host_debugctlmsr);
@@ -9646,14 +9931,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9646 __write_pkru(vmx->host_pkru); 9931 __write_pkru(vmx->host_pkru);
9647 } 9932 }
9648 9933
9649 /*
9650 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
9651 * we did not inject a still-pending event to L1 now because of
9652 * nested_run_pending, we need to re-enable this bit.
9653 */
9654 if (vmx->nested.nested_run_pending)
9655 kvm_make_request(KVM_REQ_EVENT, vcpu);
9656
9657 vmx->nested.nested_run_pending = 0; 9934 vmx->nested.nested_run_pending = 0;
9658 vmx->idt_vectoring_info = 0; 9935 vmx->idt_vectoring_info = 0;
9659 9936
@@ -9670,6 +9947,17 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
9670} 9947}
9671STACK_FRAME_NON_STANDARD(vmx_vcpu_run); 9948STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
9672 9949
9950static struct kvm *vmx_vm_alloc(void)
9951{
9952 struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
9953 return &kvm_vmx->kvm;
9954}
9955
9956static void vmx_vm_free(struct kvm *kvm)
9957{
9958 kfree(to_kvm_vmx(kvm));
9959}
9960
9673static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 9961static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
9674{ 9962{
9675 struct vcpu_vmx *vmx = to_vmx(vcpu); 9963 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -9777,14 +10065,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
9777 goto free_vmcs; 10065 goto free_vmcs;
9778 } 10066 }
9779 10067
9780 if (enable_ept) { 10068 if (enable_ept && !enable_unrestricted_guest) {
9781 err = init_rmode_identity_map(kvm); 10069 err = init_rmode_identity_map(kvm);
9782 if (err) 10070 if (err)
9783 goto free_vmcs; 10071 goto free_vmcs;
9784 } 10072 }
9785 10073
9786 if (nested) { 10074 if (nested) {
9787 nested_vmx_setup_ctls_msrs(vmx); 10075 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
10076 kvm_vcpu_apicv_active(&vmx->vcpu));
9788 vmx->nested.vpid02 = allocate_vpid(); 10077 vmx->nested.vpid02 = allocate_vpid();
9789 } 10078 }
9790 10079
@@ -9817,6 +10106,13 @@ free_vcpu:
9817 return ERR_PTR(err); 10106 return ERR_PTR(err);
9818} 10107}
9819 10108
10109static int vmx_vm_init(struct kvm *kvm)
10110{
10111 if (!ple_gap)
10112 kvm->arch.pause_in_guest = true;
10113 return 0;
10114}
10115
9820static void __init vmx_check_processor_compat(void *rtn) 10116static void __init vmx_check_processor_compat(void *rtn)
9821{ 10117{
9822 struct vmcs_config vmcs_conf; 10118 struct vmcs_config vmcs_conf;
@@ -9824,6 +10120,7 @@ static void __init vmx_check_processor_compat(void *rtn)
9824 *(int *)rtn = 0; 10120 *(int *)rtn = 0;
9825 if (setup_vmcs_config(&vmcs_conf) < 0) 10121 if (setup_vmcs_config(&vmcs_conf) < 0)
9826 *(int *)rtn = -EIO; 10122 *(int *)rtn = -EIO;
10123 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
9827 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 10124 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
9828 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 10125 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
9829 smp_processor_id()); 10126 smp_processor_id());
@@ -9911,12 +10208,12 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
9911 struct vcpu_vmx *vmx = to_vmx(vcpu); 10208 struct vcpu_vmx *vmx = to_vmx(vcpu);
9912 struct kvm_cpuid_entry2 *entry; 10209 struct kvm_cpuid_entry2 *entry;
9913 10210
9914 vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; 10211 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
9915 vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; 10212 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
9916 10213
9917#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 10214#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
9918 if (entry && (entry->_reg & (_cpuid_mask))) \ 10215 if (entry && (entry->_reg & (_cpuid_mask))) \
9919 vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ 10216 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
9920} while (0) 10217} while (0)
9921 10218
9922 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); 10219 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
@@ -10013,7 +10310,7 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
10013 10310
10014 kvm_mmu_unload(vcpu); 10311 kvm_mmu_unload(vcpu);
10015 kvm_init_shadow_ept_mmu(vcpu, 10312 kvm_init_shadow_ept_mmu(vcpu,
10016 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 10313 to_vmx(vcpu)->nested.msrs.ept_caps &
10017 VMX_EPT_EXECUTE_ONLY_BIT, 10314 VMX_EPT_EXECUTE_ONLY_BIT,
10018 nested_ept_ad_enabled(vcpu)); 10315 nested_ept_ad_enabled(vcpu));
10019 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 10316 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
@@ -10952,6 +11249,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10952 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 11249 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
10953 vmx_set_efer(vcpu, vcpu->arch.efer); 11250 vmx_set_efer(vcpu, vcpu->arch.efer);
10954 11251
11252 /*
11253 * Guest state is invalid and unrestricted guest is disabled,
11254 * which means L1 attempted VMEntry to L2 with invalid state.
11255 * Fail the VMEntry.
11256 */
11257 if (vmx->emulation_required) {
11258 *entry_failure_code = ENTRY_FAIL_DEFAULT;
11259 return 1;
11260 }
11261
10955 /* Shadow page tables on either EPT or shadow page tables. */ 11262 /* Shadow page tables on either EPT or shadow page tables. */
10956 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 11263 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
10957 entry_failure_code)) 11264 entry_failure_code))
@@ -10965,6 +11272,19 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10965 return 0; 11272 return 0;
10966} 11273}
10967 11274
11275static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
11276{
11277 if (!nested_cpu_has_nmi_exiting(vmcs12) &&
11278 nested_cpu_has_virtual_nmis(vmcs12))
11279 return -EINVAL;
11280
11281 if (!nested_cpu_has_virtual_nmis(vmcs12) &&
11282 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
11283 return -EINVAL;
11284
11285 return 0;
11286}
11287
10968static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 11288static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10969{ 11289{
10970 struct vcpu_vmx *vmx = to_vmx(vcpu); 11290 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -10992,26 +11312,29 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10992 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 11312 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10993 11313
10994 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 11314 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
10995 vmx->nested.nested_vmx_procbased_ctls_low, 11315 vmx->nested.msrs.procbased_ctls_low,
10996 vmx->nested.nested_vmx_procbased_ctls_high) || 11316 vmx->nested.msrs.procbased_ctls_high) ||
10997 (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 11317 (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
10998 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 11318 !vmx_control_verify(vmcs12->secondary_vm_exec_control,
10999 vmx->nested.nested_vmx_secondary_ctls_low, 11319 vmx->nested.msrs.secondary_ctls_low,
11000 vmx->nested.nested_vmx_secondary_ctls_high)) || 11320 vmx->nested.msrs.secondary_ctls_high)) ||
11001 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 11321 !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
11002 vmx->nested.nested_vmx_pinbased_ctls_low, 11322 vmx->nested.msrs.pinbased_ctls_low,
11003 vmx->nested.nested_vmx_pinbased_ctls_high) || 11323 vmx->nested.msrs.pinbased_ctls_high) ||
11004 !vmx_control_verify(vmcs12->vm_exit_controls, 11324 !vmx_control_verify(vmcs12->vm_exit_controls,
11005 vmx->nested.nested_vmx_exit_ctls_low, 11325 vmx->nested.msrs.exit_ctls_low,
11006 vmx->nested.nested_vmx_exit_ctls_high) || 11326 vmx->nested.msrs.exit_ctls_high) ||
11007 !vmx_control_verify(vmcs12->vm_entry_controls, 11327 !vmx_control_verify(vmcs12->vm_entry_controls,
11008 vmx->nested.nested_vmx_entry_ctls_low, 11328 vmx->nested.msrs.entry_ctls_low,
11009 vmx->nested.nested_vmx_entry_ctls_high)) 11329 vmx->nested.msrs.entry_ctls_high))
11330 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11331
11332 if (nested_vmx_check_nmi_controls(vmcs12))
11010 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 11333 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11011 11334
11012 if (nested_cpu_has_vmfunc(vmcs12)) { 11335 if (nested_cpu_has_vmfunc(vmcs12)) {
11013 if (vmcs12->vm_function_control & 11336 if (vmcs12->vm_function_control &
11014 ~vmx->nested.nested_vmx_vmfunc_controls) 11337 ~vmx->nested.msrs.vmfunc_controls)
11015 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 11338 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11016 11339
11017 if (nested_cpu_has_eptp_switching(vmcs12)) { 11340 if (nested_cpu_has_eptp_switching(vmcs12)) {
@@ -11293,7 +11616,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
11293 } else if (vcpu->arch.nmi_injected) { 11616 } else if (vcpu->arch.nmi_injected) {
11294 vmcs12->idt_vectoring_info_field = 11617 vmcs12->idt_vectoring_info_field =
11295 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 11618 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
11296 } else if (vcpu->arch.interrupt.pending) { 11619 } else if (vcpu->arch.interrupt.injected) {
11297 nr = vcpu->arch.interrupt.nr; 11620 nr = vcpu->arch.interrupt.nr;
11298 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 11621 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
11299 11622
@@ -11941,7 +12264,7 @@ static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
11941 12264
11942static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 12265static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
11943{ 12266{
11944 if (ple_gap) 12267 if (!kvm_pause_in_guest(vcpu->kvm))
11945 shrink_ple_window(vcpu); 12268 shrink_ple_window(vcpu);
11946} 12269}
11947 12270
@@ -12259,6 +12582,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
12259 12582
12260 vmx->nested.smm.vmxon = vmx->nested.vmxon; 12583 vmx->nested.smm.vmxon = vmx->nested.vmxon;
12261 vmx->nested.vmxon = false; 12584 vmx->nested.vmxon = false;
12585 vmx_clear_hlt(vcpu);
12262 return 0; 12586 return 0;
12263} 12587}
12264 12588
@@ -12300,6 +12624,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12300 .cpu_has_accelerated_tpr = report_flexpriority, 12624 .cpu_has_accelerated_tpr = report_flexpriority,
12301 .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, 12625 .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
12302 12626
12627 .vm_init = vmx_vm_init,
12628 .vm_alloc = vmx_vm_alloc,
12629 .vm_free = vmx_vm_free,
12630
12303 .vcpu_create = vmx_create_vcpu, 12631 .vcpu_create = vmx_create_vcpu,
12304 .vcpu_free = vmx_free_vcpu, 12632 .vcpu_free = vmx_free_vcpu,
12305 .vcpu_reset = vmx_vcpu_reset, 12633 .vcpu_reset = vmx_vcpu_reset,
@@ -12367,6 +12695,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12367 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 12695 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
12368 12696
12369 .set_tss_addr = vmx_set_tss_addr, 12697 .set_tss_addr = vmx_set_tss_addr,
12698 .set_identity_map_addr = vmx_set_identity_map_addr,
12370 .get_tdp_level = get_ept_level, 12699 .get_tdp_level = get_ept_level,
12371 .get_mt_mask = vmx_get_mt_mask, 12700 .get_mt_mask = vmx_get_mt_mask,
12372 12701
@@ -12425,7 +12754,38 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12425 12754
12426static int __init vmx_init(void) 12755static int __init vmx_init(void)
12427{ 12756{
12428 int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 12757 int r;
12758
12759#if IS_ENABLED(CONFIG_HYPERV)
12760 /*
12761 * Enlightened VMCS usage should be recommended and the host needs
12762 * to support eVMCS v1 or above. We can also disable eVMCS support
12763 * with module parameter.
12764 */
12765 if (enlightened_vmcs &&
12766 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
12767 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
12768 KVM_EVMCS_VERSION) {
12769 int cpu;
12770
12771 /* Check that we have assist pages on all online CPUs */
12772 for_each_online_cpu(cpu) {
12773 if (!hv_get_vp_assist_page(cpu)) {
12774 enlightened_vmcs = false;
12775 break;
12776 }
12777 }
12778
12779 if (enlightened_vmcs) {
12780 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
12781 static_branch_enable(&enable_evmcs);
12782 }
12783 } else {
12784 enlightened_vmcs = false;
12785 }
12786#endif
12787
12788 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
12429 __alignof__(struct vcpu_vmx), THIS_MODULE); 12789 __alignof__(struct vcpu_vmx), THIS_MODULE);
12430 if (r) 12790 if (r)
12431 return r; 12791 return r;
@@ -12446,6 +12806,29 @@ static void __exit vmx_exit(void)
12446#endif 12806#endif
12447 12807
12448 kvm_exit(); 12808 kvm_exit();
12809
12810#if IS_ENABLED(CONFIG_HYPERV)
12811 if (static_branch_unlikely(&enable_evmcs)) {
12812 int cpu;
12813 struct hv_vp_assist_page *vp_ap;
12814 /*
12815 * Reset everything to support using non-enlightened VMCS
12816 * access later (e.g. when we reload the module with
12817 * enlightened_vmcs=0)
12818 */
12819 for_each_online_cpu(cpu) {
12820 vp_ap = hv_get_vp_assist_page(cpu);
12821
12822 if (!vp_ap)
12823 continue;
12824
12825 vp_ap->current_nested_vmcs = 0;
12826 vp_ap->enlighten_vmentry = 0;
12827 }
12828
12829 static_branch_disable(&enable_evmcs);
12830 }
12831#endif
12449} 12832}
12450 12833
12451module_init(vmx_init) 12834module_init(vmx_init)
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h
new file mode 100644
index 000000000000..210a884090ad
--- /dev/null
+++ b/arch/x86/kvm/vmx_evmcs.h
@@ -0,0 +1,324 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __KVM_X86_VMX_EVMCS_H
3#define __KVM_X86_VMX_EVMCS_H
4
5#include <asm/hyperv-tlfs.h>
6
7#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
8#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
9#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
10 {EVMCS1_OFFSET(name), clean_field}
11
12struct evmcs_field {
13 u16 offset;
14 u16 clean_field;
15};
16
17static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
18 /* 64 bit rw */
19 EVMCS1_FIELD(GUEST_RIP, guest_rip,
20 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
21 EVMCS1_FIELD(GUEST_RSP, guest_rsp,
22 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
23 EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
24 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
25 EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
26 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
27 EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
28 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
29 EVMCS1_FIELD(HOST_CR0, host_cr0,
30 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
31 EVMCS1_FIELD(HOST_CR3, host_cr3,
32 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
33 EVMCS1_FIELD(HOST_CR4, host_cr4,
34 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
35 EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
36 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
37 EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
38 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
39 EVMCS1_FIELD(HOST_RIP, host_rip,
40 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
41 EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
42 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
43 EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
44 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
45 EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
46 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
47 EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
48 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
49 EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
50 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
51 EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
52 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
53 EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
54 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
55 EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
56 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
57 EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
58 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
59 EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
60 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
61 EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
62 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
63 EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
64 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
65 EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
66 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
67 EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
68 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
69 EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
70 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
71 EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
72 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
73 EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
74 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
75 EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
76 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
77 EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
78 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
79 EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
80 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
81 EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
82 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
83 EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
84 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
85 EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
86 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
87 EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
88 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
89 EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
90 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
91 EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
92 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
93 EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
94 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
95 EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
96 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
97 EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
98 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
99 EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
100 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
101 EVMCS1_FIELD(GUEST_CR0, guest_cr0,
102 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
103 EVMCS1_FIELD(GUEST_CR3, guest_cr3,
104 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
105 EVMCS1_FIELD(GUEST_CR4, guest_cr4,
106 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
107 EVMCS1_FIELD(GUEST_DR7, guest_dr7,
108 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
109 EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
110 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
111 EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
112 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
113 EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
114 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
115 EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
116 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
117 EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
118 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
119 EVMCS1_FIELD(HOST_RSP, host_rsp,
120 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
121 EVMCS1_FIELD(EPT_POINTER, ept_pointer,
122 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
123 EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
124 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
125 EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
126 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
127
128 /* 64 bit read only */
129 EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
130 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
131 EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
132 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
133 /*
134 * Not defined in KVM:
135 *
136 * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
137 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
138 * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
139 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
140 * EVMCS1_FIELD(0x00006406, exit_io_instruction_edi,
141 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
142 * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
143 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
144 */
145 EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
146 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
147
148 /*
149 * No mask defined in the spec as Hyper-V doesn't currently support
150 * these. Future proof by resetting the whole clean field mask on
151 * access.
152 */
153 EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
154 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
155 EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
156 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
157 EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
158 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
159 EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
160 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
161 EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
162 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
163 EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
164 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
165 EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
166 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
167
168 /* 32 bit rw */
169 EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
170 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
171 EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
172 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
173 EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
174 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
175 EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
176 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
177 EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
178 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
179 EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
180 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
181 EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
182 vm_entry_exception_error_code,
183 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
184 EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
185 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
186 EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
187 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
188 EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
189 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
190 EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
191 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
192 EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
193 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
194 EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
195 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
196 EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
197 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
198 EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
199 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
200 EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
201 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
202 EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
203 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
204 EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
205 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
206 EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
207 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
208 EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
209 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
210 EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
211 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
212 EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
213 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
214 EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
215 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
216 EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
217 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
218 EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
219 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
220 EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
221 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
222 EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
223 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
224 EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
225 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
226 EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
227 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
228 EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
229 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
230 EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
231 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
232 EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
233 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
234
235 /* 32 bit read only */
236 EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
237 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
238 EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
239 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
240 EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
241 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
242 EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
243 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
244 EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
245 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
246 EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
247 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
248 EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
249 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
250 EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
251 HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
252
253 /* No mask defined in the spec (not used) */
254 EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
255 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
256 EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
257 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
258 EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
259 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
260 EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
261 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
262 EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
263 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
264 EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
265 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
266
267 /* 16 bit rw */
268 EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
269 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
270 EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
271 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
272 EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
273 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
274 EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
275 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
276 EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
277 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
278 EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
279 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
280 EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
281 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
282 EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
283 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
284 EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
285 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
286 EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
287 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
288 EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
289 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
290 EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
291 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
292 EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
293 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
294 EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
295 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
296 EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
297 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
298 EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
299 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
300};
301
302static __always_inline int get_evmcs_offset(unsigned long field,
303 u16 *clean_field)
304{
305 unsigned int index = ROL16(field, 6);
306 const struct evmcs_field *evmcs_field;
307
308 if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) {
309 WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
310 field);
311 return -ENOENT;
312 }
313
314 evmcs_field = &vmcs_field_to_evmcs_1[index];
315
316 if (clean_field)
317 *clean_field = evmcs_field->clean_field;
318
319 return evmcs_field->offset;
320}
321
322#undef ROL16
323
324#endif /* __KVM_X86_VMX_EVMCS_H */
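
The table above drives every enlightened-VMCS access: a VMCS field encoding is rotated left by six bits to form the array index, and the entry yields both the byte offset of that field inside struct hv_enlightened_vmcs and the clean-field group that must be marked dirty when the field is written. A minimal sketch of a write helper layered on get_evmcs_offset() — the helper name is illustrative; the real accessors are defined elsewhere in the series:

/* Illustrative only: write a 64-bit field through an enlightened VMCS. */
static inline void example_evmcs_write64(struct hv_enlightened_vmcs *evmcs,
					 unsigned long field, u64 value)
{
	u16 clean_field;
	int offset = get_evmcs_offset(field, &clean_field);

	if (offset < 0)
		return;		/* field not mapped by eVMCS v1 */

	*(u64 *)((char *)evmcs + offset) = value;
	/* Tell the hypervisor that this group of fields has changed. */
	evmcs->hv_clean_fields &= ~clean_field;
}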
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 18b5ca7a3197..b2ff74b12ec4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -102,6 +102,8 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
102static void process_nmi(struct kvm_vcpu *vcpu); 102static void process_nmi(struct kvm_vcpu *vcpu);
103static void enter_smm(struct kvm_vcpu *vcpu); 103static void enter_smm(struct kvm_vcpu *vcpu);
104static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); 104static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
105static void store_regs(struct kvm_vcpu *vcpu);
106static int sync_regs(struct kvm_vcpu *vcpu);
105 107
106struct kvm_x86_ops *kvm_x86_ops __read_mostly; 108struct kvm_x86_ops *kvm_x86_ops __read_mostly;
107EXPORT_SYMBOL_GPL(kvm_x86_ops); 109EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -140,6 +142,13 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
140static bool __read_mostly vector_hashing = true; 142static bool __read_mostly vector_hashing = true;
141module_param(vector_hashing, bool, S_IRUGO); 143module_param(vector_hashing, bool, S_IRUGO);
142 144
145bool __read_mostly enable_vmware_backdoor = false;
146module_param(enable_vmware_backdoor, bool, S_IRUGO);
147EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
148
149static bool __read_mostly force_emulation_prefix = false;
150module_param(force_emulation_prefix, bool, S_IRUGO);
151
143#define KVM_NR_SHARED_MSRS 16 152#define KVM_NR_SHARED_MSRS 16
144 153
145struct kvm_shared_msrs_global { 154struct kvm_shared_msrs_global {
@@ -1032,7 +1041,11 @@ static u32 emulated_msrs[] = {
1032 HV_X64_MSR_VP_RUNTIME, 1041 HV_X64_MSR_VP_RUNTIME,
1033 HV_X64_MSR_SCONTROL, 1042 HV_X64_MSR_SCONTROL,
1034 HV_X64_MSR_STIMER0_CONFIG, 1043 HV_X64_MSR_STIMER0_CONFIG,
1035 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1044 HV_X64_MSR_VP_ASSIST_PAGE,
1045 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1046 HV_X64_MSR_TSC_EMULATION_STATUS,
1047
1048 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1036 MSR_KVM_PV_EOI_EN, 1049 MSR_KVM_PV_EOI_EN,
1037 1050
1038 MSR_IA32_TSC_ADJUST, 1051 MSR_IA32_TSC_ADJUST,
@@ -1054,6 +1067,25 @@ static unsigned num_emulated_msrs;
1054 * can be used by a hypervisor to validate requested CPU features. 1067 * can be used by a hypervisor to validate requested CPU features.
1055 */ 1068 */
1056static u32 msr_based_features[] = { 1069static u32 msr_based_features[] = {
1070 MSR_IA32_VMX_BASIC,
1071 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1072 MSR_IA32_VMX_PINBASED_CTLS,
1073 MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1074 MSR_IA32_VMX_PROCBASED_CTLS,
1075 MSR_IA32_VMX_TRUE_EXIT_CTLS,
1076 MSR_IA32_VMX_EXIT_CTLS,
1077 MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1078 MSR_IA32_VMX_ENTRY_CTLS,
1079 MSR_IA32_VMX_MISC,
1080 MSR_IA32_VMX_CR0_FIXED0,
1081 MSR_IA32_VMX_CR0_FIXED1,
1082 MSR_IA32_VMX_CR4_FIXED0,
1083 MSR_IA32_VMX_CR4_FIXED1,
1084 MSR_IA32_VMX_VMCS_ENUM,
1085 MSR_IA32_VMX_PROCBASED_CTLS2,
1086 MSR_IA32_VMX_EPT_VPID_CAP,
1087 MSR_IA32_VMX_VMFUNC,
1088
1057 MSR_F10H_DECFG, 1089 MSR_F10H_DECFG,
1058 MSR_IA32_UCODE_REV, 1090 MSR_IA32_UCODE_REV,
1059}; 1091};
@@ -2432,6 +2464,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2432 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 2464 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2433 case HV_X64_MSR_CRASH_CTL: 2465 case HV_X64_MSR_CRASH_CTL:
2434 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 2466 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2467 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2468 case HV_X64_MSR_TSC_EMULATION_CONTROL:
2469 case HV_X64_MSR_TSC_EMULATION_STATUS:
2435 return kvm_hv_set_msr_common(vcpu, msr, data, 2470 return kvm_hv_set_msr_common(vcpu, msr, data,
2436 msr_info->host_initiated); 2471 msr_info->host_initiated);
2437 case MSR_IA32_BBL_CR_CTL3: 2472 case MSR_IA32_BBL_CR_CTL3:
@@ -2558,6 +2593,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2558 case MSR_AMD64_DC_CFG: 2593 case MSR_AMD64_DC_CFG:
2559 msr_info->data = 0; 2594 msr_info->data = 0;
2560 break; 2595 break;
2596 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
2561 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: 2597 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
2562 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 2598 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
2563 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 2599 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -2661,6 +2697,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2661 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 2697 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
2662 case HV_X64_MSR_CRASH_CTL: 2698 case HV_X64_MSR_CRASH_CTL:
2663 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: 2699 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
2700 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2701 case HV_X64_MSR_TSC_EMULATION_CONTROL:
2702 case HV_X64_MSR_TSC_EMULATION_STATUS:
2664 return kvm_hv_get_msr_common(vcpu, 2703 return kvm_hv_get_msr_common(vcpu,
2665 msr_info->index, &msr_info->data); 2704 msr_info->index, &msr_info->data);
2666 break; 2705 break;
@@ -2777,9 +2816,15 @@ out:
2777 return r; 2816 return r;
2778} 2817}
2779 2818
2819static inline bool kvm_can_mwait_in_guest(void)
2820{
2821 return boot_cpu_has(X86_FEATURE_MWAIT) &&
2822 !boot_cpu_has_bug(X86_BUG_MONITOR);
2823}
2824
2780int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) 2825int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2781{ 2826{
2782 int r; 2827 int r = 0;
2783 2828
2784 switch (ext) { 2829 switch (ext) {
2785 case KVM_CAP_IRQCHIP: 2830 case KVM_CAP_IRQCHIP:
@@ -2809,6 +2854,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2809 case KVM_CAP_HYPERV_SYNIC: 2854 case KVM_CAP_HYPERV_SYNIC:
2810 case KVM_CAP_HYPERV_SYNIC2: 2855 case KVM_CAP_HYPERV_SYNIC2:
2811 case KVM_CAP_HYPERV_VP_INDEX: 2856 case KVM_CAP_HYPERV_VP_INDEX:
2857 case KVM_CAP_HYPERV_EVENTFD:
2812 case KVM_CAP_PCI_SEGMENT: 2858 case KVM_CAP_PCI_SEGMENT:
2813 case KVM_CAP_DEBUGREGS: 2859 case KVM_CAP_DEBUGREGS:
2814 case KVM_CAP_X86_ROBUST_SINGLESTEP: 2860 case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2828,11 +2874,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2828 case KVM_CAP_GET_MSR_FEATURES: 2874 case KVM_CAP_GET_MSR_FEATURES:
2829 r = 1; 2875 r = 1;
2830 break; 2876 break;
2877 case KVM_CAP_SYNC_REGS:
2878 r = KVM_SYNC_X86_VALID_FIELDS;
2879 break;
2831 case KVM_CAP_ADJUST_CLOCK: 2880 case KVM_CAP_ADJUST_CLOCK:
2832 r = KVM_CLOCK_TSC_STABLE; 2881 r = KVM_CLOCK_TSC_STABLE;
2833 break; 2882 break;
2834 case KVM_CAP_X86_GUEST_MWAIT: 2883 case KVM_CAP_X86_DISABLE_EXITS:
2835 r = kvm_mwait_in_guest(); 2884 r |= KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
2885 if(kvm_can_mwait_in_guest())
2886 r |= KVM_X86_DISABLE_EXITS_MWAIT;
2836 break; 2887 break;
2837 case KVM_CAP_X86_SMM: 2888 case KVM_CAP_X86_SMM:
2838 /* SMBASE is usually relocated above 1M on modern chipsets, 2889 /* SMBASE is usually relocated above 1M on modern chipsets,
@@ -2873,7 +2924,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2873 r = KVM_X2APIC_API_VALID_FLAGS; 2924 r = KVM_X2APIC_API_VALID_FLAGS;
2874 break; 2925 break;
2875 default: 2926 default:
2876 r = 0;
2877 break; 2927 break;
2878 } 2928 }
2879 return r; 2929 return r;
@@ -3265,7 +3315,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3265 events->exception.error_code = vcpu->arch.exception.error_code; 3315 events->exception.error_code = vcpu->arch.exception.error_code;
3266 3316
3267 events->interrupt.injected = 3317 events->interrupt.injected =
3268 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; 3318 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
3269 events->interrupt.nr = vcpu->arch.interrupt.nr; 3319 events->interrupt.nr = vcpu->arch.interrupt.nr;
3270 events->interrupt.soft = 0; 3320 events->interrupt.soft = 0;
3271 events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); 3321 events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
@@ -3318,7 +3368,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3318 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 3368 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3319 vcpu->arch.exception.error_code = events->exception.error_code; 3369 vcpu->arch.exception.error_code = events->exception.error_code;
3320 3370
3321 vcpu->arch.interrupt.pending = events->interrupt.injected; 3371 vcpu->arch.interrupt.injected = events->interrupt.injected;
3322 vcpu->arch.interrupt.nr = events->interrupt.nr; 3372 vcpu->arch.interrupt.nr = events->interrupt.nr;
3323 vcpu->arch.interrupt.soft = events->interrupt.soft; 3373 vcpu->arch.interrupt.soft = events->interrupt.soft;
3324 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 3374 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
@@ -3917,8 +3967,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3917static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, 3967static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3918 u64 ident_addr) 3968 u64 ident_addr)
3919{ 3969{
3920 kvm->arch.ept_identity_map_addr = ident_addr; 3970 return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
3921 return 0;
3922} 3971}
3923 3972
3924static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 3973static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -4178,6 +4227,20 @@ split_irqchip_unlock:
4178 4227
4179 r = 0; 4228 r = 0;
4180 break; 4229 break;
4230 case KVM_CAP_X86_DISABLE_EXITS:
4231 r = -EINVAL;
4232 if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
4233 break;
4234
4235 if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
4236 kvm_can_mwait_in_guest())
4237 kvm->arch.mwait_in_guest = true;
4238 if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL)
4239 kvm->arch.hlt_in_guest = true;
4240 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
4241 kvm->arch.pause_in_guest = true;
4242 r = 0;
4243 break;
4181 default: 4244 default:
4182 r = -EINVAL; 4245 r = -EINVAL;
4183 break; 4246 break;
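
KVM_CAP_X86_DISABLE_EXITS is a per-VM capability: KVM_CHECK_EXTENSION reports which intercepts this host can drop (MWAIT only when kvm_can_mwait_in_guest() above is satisfied), and KVM_ENABLE_CAP then disables them for the VM's vCPUs. A minimal userspace sketch, assuming an open VM fd and the matching uapi headers:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disable_idle_exits(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_DISABLE_EXITS };
	int allowed = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);

	if (allowed <= 0)
		return -1;

	/* Request only what the host advertises; _HTL is the uapi
	 * spelling for disabling HLT exits. */
	cap.args[0] = allowed & (KVM_X86_DISABLE_EXITS_MWAIT |
				 KVM_X86_DISABLE_EXITS_HTL |
				 KVM_X86_DISABLE_EXITS_PAUSE);
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}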
@@ -4482,6 +4545,15 @@ set_identity_unlock:
4482 r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region); 4545 r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
4483 break; 4546 break;
4484 } 4547 }
4548 case KVM_HYPERV_EVENTFD: {
4549 struct kvm_hyperv_eventfd hvevfd;
4550
4551 r = -EFAULT;
4552 if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
4553 goto out;
4554 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
4555 break;
4556 }
4485 default: 4557 default:
4486 r = -ENOTTY; 4558 r = -ENOTTY;
4487 } 4559 }
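
KVM_HYPERV_EVENTFD binds a Hyper-V SIGNAL_EVENT connection id to an eventfd so guest signaling does not require a userspace exit. A hedged sketch of the call, assuming an open VM fd and that struct kvm_hyperv_eventfd carries conn_id/fd/flags as in the matching uapi header:

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative helper: returns the eventfd on success, -1 on failure. */
static int hv_signal_to_eventfd(int vm_fd, __u32 conn_id)
{
	struct kvm_hyperv_eventfd hvevfd = {
		.conn_id = conn_id,
		.fd = eventfd(0, EFD_CLOEXEC),
	};

	if (hvevfd.fd < 0)
		return -1;
	return ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd) ? -1 : hvevfd.fd;
}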
@@ -4771,6 +4843,30 @@ out:
4771} 4843}
4772EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); 4844EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4773 4845
4846int handle_ud(struct kvm_vcpu *vcpu)
4847{
4848 int emul_type = EMULTYPE_TRAP_UD;
4849 enum emulation_result er;
4850 char sig[5]; /* ud2; .ascii "kvm" */
4851 struct x86_exception e;
4852
4853 if (force_emulation_prefix &&
4854 kvm_read_guest_virt(&vcpu->arch.emulate_ctxt,
4855 kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 &&
4856 memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
4857 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
4858 emul_type = 0;
4859 }
4860
4861 er = emulate_instruction(vcpu, emul_type);
4862 if (er == EMULATE_USER_EXIT)
4863 return 0;
4864 if (er != EMULATE_DONE)
4865 kvm_queue_exception(vcpu, UD_VECTOR);
4866 return 1;
4867}
4868EXPORT_SYMBOL_GPL(handle_ud);
4869
4774static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 4870static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4775 gpa_t gpa, bool write) 4871 gpa_t gpa, bool write)
4776{ 4872{
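
handle_ud() is also the hook for the new force_emulation_prefix testing aid: with kvm.force_emulation_prefix=1, a guest may prepend the 5-byte signature ud2; .ascii "kvm" (0f 0b 6b 76 6d) to an instruction, and KVM strips the signature and runs the following instruction through its emulator. A guest-side sketch, mirroring how a selftest might use it (the helper name is illustrative):

/* Force KVM to emulate CPUID through its x86 emulator.  Requires the
 * kvm.force_emulation_prefix module parameter added above. */
#define KVM_FEP "ud2; .ascii \"kvm\";"

static inline void fep_cpuid(unsigned int *eax, unsigned int *ebx,
			     unsigned int *ecx, unsigned int *edx)
{
	asm volatile(KVM_FEP "cpuid"
		     : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
		     : "0"(*eax), "2"(*ecx));
}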
@@ -5612,27 +5708,27 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
5612 kvm_rip_write(vcpu, ctxt->eip); 5708 kvm_rip_write(vcpu, ctxt->eip);
5613 kvm_set_rflags(vcpu, ctxt->eflags); 5709 kvm_set_rflags(vcpu, ctxt->eflags);
5614 5710
5615 if (irq == NMI_VECTOR)
5616 vcpu->arch.nmi_pending = 0;
5617 else
5618 vcpu->arch.interrupt.pending = false;
5619
5620 return EMULATE_DONE; 5711 return EMULATE_DONE;
5621} 5712}
5622EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); 5713EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
5623 5714
5624static int handle_emulation_failure(struct kvm_vcpu *vcpu) 5715static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
5625{ 5716{
5626 int r = EMULATE_DONE; 5717 int r = EMULATE_DONE;
5627 5718
5628 ++vcpu->stat.insn_emulation_fail; 5719 ++vcpu->stat.insn_emulation_fail;
5629 trace_kvm_emulate_insn_failed(vcpu); 5720 trace_kvm_emulate_insn_failed(vcpu);
5721
5722 if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
5723 return EMULATE_FAIL;
5724
5630 if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { 5725 if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
5631 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5726 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5632 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5727 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5633 vcpu->run->internal.ndata = 0; 5728 vcpu->run->internal.ndata = 0;
5634 r = EMULATE_USER_EXIT; 5729 r = EMULATE_USER_EXIT;
5635 } 5730 }
5731
5636 kvm_queue_exception(vcpu, UD_VECTOR); 5732 kvm_queue_exception(vcpu, UD_VECTOR);
5637 5733
5638 return r; 5734 return r;
@@ -5876,6 +5972,37 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5876 return false; 5972 return false;
5877} 5973}
5878 5974
5975static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
5976{
5977 switch (ctxt->opcode_len) {
5978 case 1:
5979 switch (ctxt->b) {
5980 case 0xe4: /* IN */
5981 case 0xe5:
5982 case 0xec:
5983 case 0xed:
5984 case 0xe6: /* OUT */
5985 case 0xe7:
5986 case 0xee:
5987 case 0xef:
5988 case 0x6c: /* INS */
5989 case 0x6d:
5990 case 0x6e: /* OUTS */
5991 case 0x6f:
5992 return true;
5993 }
5994 break;
5995 case 2:
5996 switch (ctxt->b) {
5997 case 0x33: /* RDPMC */
5998 return true;
5999 }
6000 break;
6001 }
6002
6003 return false;
6004}
6005
5879int x86_emulate_instruction(struct kvm_vcpu *vcpu, 6006int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5880 unsigned long cr2, 6007 unsigned long cr2,
5881 int emulation_type, 6008 int emulation_type,
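
The whitelist above restricts EMULTYPE_VMWARE emulation to the IN/OUT/INS/OUTS forms and RDPMC, i.e. the instructions the VMware guest/host protocol issues from CPL3 and expects to be handled instead of faulting. For context, a guest-side sketch of the classic backdoor probe; the magic number, port and command value come from the publicly documented protocol (open-vm-tools), not from this patch:

#define VMW_BDOOR_MAGIC		0x564D5868u	/* "VMXh" */
#define VMW_BDOOR_PORT		0x5658
#define VMW_BDOOR_CMD_GETVERSION 10

/* Illustrative guest-side helper: returns the backdoor version in EAX. */
static inline unsigned int vmw_backdoor_version(void)
{
	unsigned int eax, ebx, ecx, edx;

	asm volatile("inl %%dx, %%eax"
		     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
		     : "a"(VMW_BDOOR_MAGIC), "c"(VMW_BDOOR_CMD_GETVERSION),
		       "d"(VMW_BDOOR_PORT));
	return eax;
}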
@@ -5928,10 +6055,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5928 return EMULATE_DONE; 6055 return EMULATE_DONE;
5929 if (emulation_type & EMULTYPE_SKIP) 6056 if (emulation_type & EMULTYPE_SKIP)
5930 return EMULATE_FAIL; 6057 return EMULATE_FAIL;
5931 return handle_emulation_failure(vcpu); 6058 return handle_emulation_failure(vcpu, emulation_type);
5932 } 6059 }
5933 } 6060 }
5934 6061
6062 if ((emulation_type & EMULTYPE_VMWARE) &&
6063 !is_vmware_backdoor_opcode(ctxt))
6064 return EMULATE_FAIL;
6065
5935 if (emulation_type & EMULTYPE_SKIP) { 6066 if (emulation_type & EMULTYPE_SKIP) {
5936 kvm_rip_write(vcpu, ctxt->_eip); 6067 kvm_rip_write(vcpu, ctxt->_eip);
5937 if (ctxt->eflags & X86_EFLAGS_RF) 6068 if (ctxt->eflags & X86_EFLAGS_RF)
@@ -5963,7 +6094,7 @@ restart:
5963 emulation_type)) 6094 emulation_type))
5964 return EMULATE_DONE; 6095 return EMULATE_DONE;
5965 6096
5966 return handle_emulation_failure(vcpu); 6097 return handle_emulation_failure(vcpu, emulation_type);
5967 } 6098 }
5968 6099
5969 if (ctxt->have_exception) { 6100 if (ctxt->have_exception) {
@@ -6016,7 +6147,8 @@ restart:
6016} 6147}
6017EXPORT_SYMBOL_GPL(x86_emulate_instruction); 6148EXPORT_SYMBOL_GPL(x86_emulate_instruction);
6018 6149
6019int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 6150static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
6151 unsigned short port)
6020{ 6152{
6021 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 6153 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
6022 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, 6154 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
@@ -6025,7 +6157,6 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
6025 vcpu->arch.pio.count = 0; 6157 vcpu->arch.pio.count = 0;
6026 return ret; 6158 return ret;
6027} 6159}
6028EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
6029 6160
6030static int complete_fast_pio_in(struct kvm_vcpu *vcpu) 6161static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
6031{ 6162{
@@ -6049,7 +6180,8 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
6049 return 1; 6180 return 1;
6050} 6181}
6051 6182
6052int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) 6183static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
6184 unsigned short port)
6053{ 6185{
6054 unsigned long val; 6186 unsigned long val;
6055 int ret; 6187 int ret;
@@ -6068,7 +6200,21 @@ int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
6068 6200
6069 return 0; 6201 return 0;
6070} 6202}
6071EXPORT_SYMBOL_GPL(kvm_fast_pio_in); 6203
6204int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
6205{
6206 int ret = kvm_skip_emulated_instruction(vcpu);
6207
6208 /*
6209 * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
6210 * KVM_EXIT_DEBUG here.
6211 */
6212 if (in)
6213 return kvm_fast_pio_in(vcpu, size, port) && ret;
6214 else
6215 return kvm_fast_pio_out(vcpu, size, port) && ret;
6216}
6217EXPORT_SYMBOL_GPL(kvm_fast_pio);
6072 6218
6073static int kvmclock_cpu_down_prep(unsigned int cpu) 6219static int kvmclock_cpu_down_prep(unsigned int cpu)
6074{ 6220{
@@ -6246,7 +6392,8 @@ static void kvm_timer_init(void)
6246 kvmclock_cpu_online, kvmclock_cpu_down_prep); 6392 kvmclock_cpu_online, kvmclock_cpu_down_prep);
6247} 6393}
6248 6394
6249static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 6395DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
6396EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
6250 6397
6251int kvm_is_in_guest(void) 6398int kvm_is_in_guest(void)
6252{ 6399{
@@ -6279,18 +6426,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
6279 .get_guest_ip = kvm_get_guest_ip, 6426 .get_guest_ip = kvm_get_guest_ip,
6280}; 6427};
6281 6428
6282void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
6283{
6284 __this_cpu_write(current_vcpu, vcpu);
6285}
6286EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
6287
6288void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
6289{
6290 __this_cpu_write(current_vcpu, NULL);
6291}
6292EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
6293
6294static void kvm_set_mmio_spte_mask(void) 6429static void kvm_set_mmio_spte_mask(void)
6295{ 6430{
6296 u64 mask; 6431 u64 mask;
@@ -6644,27 +6779,36 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6644 int r; 6779 int r;
6645 6780
6646 /* try to reinject previous events if any */ 6781 /* try to reinject previous events if any */
6647 if (vcpu->arch.exception.injected) {
6648 kvm_x86_ops->queue_exception(vcpu);
6649 return 0;
6650 }
6651 6782
6783 if (vcpu->arch.exception.injected)
6784 kvm_x86_ops->queue_exception(vcpu);
6652 /* 6785 /*
6653 * Exceptions must be injected immediately, or the exception 6786 * Do not inject an NMI or interrupt if there is a pending
6654 * frame will have the address of the NMI or interrupt handler. 6787 * exception. Exceptions and interrupts are recognized at
6788 * instruction boundaries, i.e. the start of an instruction.
6789 * Trap-like exceptions, e.g. #DB, have higher priority than
6790 * NMIs and interrupts, i.e. traps are recognized before an
6791 * NMI/interrupt that's pending on the same instruction.
6792 * Fault-like exceptions, e.g. #GP and #PF, are the lowest
6793 * priority, but are only generated (pended) during instruction
6794 * execution, i.e. a pending fault-like exception means the
6795 * fault occurred on the *previous* instruction and must be
6796 * serviced prior to recognizing any new events in order to
6797 * fully complete the previous instruction.
6655 */ 6798 */
6656 if (!vcpu->arch.exception.pending) { 6799 else if (!vcpu->arch.exception.pending) {
6657 if (vcpu->arch.nmi_injected) { 6800 if (vcpu->arch.nmi_injected)
6658 kvm_x86_ops->set_nmi(vcpu); 6801 kvm_x86_ops->set_nmi(vcpu);
6659 return 0; 6802 else if (vcpu->arch.interrupt.injected)
6660 }
6661
6662 if (vcpu->arch.interrupt.pending) {
6663 kvm_x86_ops->set_irq(vcpu); 6803 kvm_x86_ops->set_irq(vcpu);
6664 return 0;
6665 }
6666 } 6804 }
6667 6805
6806 /*
6807 * Call check_nested_events() even if we reinjected a previous event
6808 * in order for caller to determine if it should require immediate-exit
6809 * from L2 to L1 due to pending L1 events which require exit
6810 * from L2 to L1.
6811 */
6668 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { 6812 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
6669 r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); 6813 r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
6670 if (r != 0) 6814 if (r != 0)
@@ -6677,6 +6821,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6677 vcpu->arch.exception.has_error_code, 6821 vcpu->arch.exception.has_error_code,
6678 vcpu->arch.exception.error_code); 6822 vcpu->arch.exception.error_code);
6679 6823
6824 WARN_ON_ONCE(vcpu->arch.exception.injected);
6680 vcpu->arch.exception.pending = false; 6825 vcpu->arch.exception.pending = false;
6681 vcpu->arch.exception.injected = true; 6826 vcpu->arch.exception.injected = true;
6682 6827
@@ -6691,7 +6836,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6691 } 6836 }
6692 6837
6693 kvm_x86_ops->queue_exception(vcpu); 6838 kvm_x86_ops->queue_exception(vcpu);
6694 } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { 6839 }
6840
6841 /* Don't consider new event if we re-injected an event */
6842 if (kvm_event_needs_reinjection(vcpu))
6843 return 0;
6844
6845 if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
6846 kvm_x86_ops->smi_allowed(vcpu)) {
6695 vcpu->arch.smi_pending = false; 6847 vcpu->arch.smi_pending = false;
6696 ++vcpu->arch.smi_count; 6848 ++vcpu->arch.smi_count;
6697 enter_smm(vcpu); 6849 enter_smm(vcpu);
@@ -6985,8 +7137,6 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
6985 7137
6986static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) 7138static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6987{ 7139{
6988 u64 eoi_exit_bitmap[4];
6989
6990 if (!kvm_apic_hw_enabled(vcpu->arch.apic)) 7140 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
6991 return; 7141 return;
6992 7142
@@ -6999,6 +7149,20 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6999 kvm_x86_ops->sync_pir_to_irr(vcpu); 7149 kvm_x86_ops->sync_pir_to_irr(vcpu);
7000 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); 7150 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
7001 } 7151 }
7152
7153 if (is_guest_mode(vcpu))
7154 vcpu->arch.load_eoi_exitmap_pending = true;
7155 else
7156 kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
7157}
7158
7159static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
7160{
7161 u64 eoi_exit_bitmap[4];
7162
7163 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
7164 return;
7165
7002 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, 7166 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
7003 vcpu_to_synic(vcpu)->vec_bitmap, 256); 7167 vcpu_to_synic(vcpu)->vec_bitmap, 256);
7004 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); 7168 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
@@ -7113,6 +7277,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
7113 } 7277 }
7114 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) 7278 if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
7115 vcpu_scan_ioapic(vcpu); 7279 vcpu_scan_ioapic(vcpu);
7280 if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
7281 vcpu_load_eoi_exitmap(vcpu);
7116 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) 7282 if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
7117 kvm_vcpu_reload_apic_access_page(vcpu); 7283 kvm_vcpu_reload_apic_access_page(vcpu);
7118 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { 7284 if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
@@ -7291,7 +7457,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
7291 7457
7292 kvm_put_guest_xcr0(vcpu); 7458 kvm_put_guest_xcr0(vcpu);
7293 7459
7460 kvm_before_interrupt(vcpu);
7294 kvm_x86_ops->handle_external_intr(vcpu); 7461 kvm_x86_ops->handle_external_intr(vcpu);
7462 kvm_after_interrupt(vcpu);
7295 7463
7296 ++vcpu->stat.exits; 7464 ++vcpu->stat.exits;
7297 7465
@@ -7500,7 +7668,6 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
7500 return 0; 7668 return 0;
7501} 7669}
7502 7670
7503
7504int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 7671int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
7505{ 7672{
7506 int r; 7673 int r;
@@ -7526,6 +7693,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
7526 goto out; 7693 goto out;
7527 } 7694 }
7528 7695
7696 if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
7697 r = -EINVAL;
7698 goto out;
7699 }
7700
7701 if (vcpu->run->kvm_dirty_regs) {
7702 r = sync_regs(vcpu);
7703 if (r != 0)
7704 goto out;
7705 }
7706
7529 /* re-sync apic's tpr */ 7707 /* re-sync apic's tpr */
7530 if (!lapic_in_kernel(vcpu)) { 7708 if (!lapic_in_kernel(vcpu)) {
7531 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { 7709 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
@@ -7550,6 +7728,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
7550 7728
7551out: 7729out:
7552 kvm_put_guest_fpu(vcpu); 7730 kvm_put_guest_fpu(vcpu);
7731 if (vcpu->run->kvm_valid_regs)
7732 store_regs(vcpu);
7553 post_kvm_run_save(vcpu); 7733 post_kvm_run_save(vcpu);
7554 kvm_sigset_deactivate(vcpu); 7734 kvm_sigset_deactivate(vcpu);
7555 7735
@@ -7557,10 +7737,8 @@ out:
7557 return r; 7737 return r;
7558} 7738}
7559 7739
7560int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 7740static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7561{ 7741{
7562 vcpu_load(vcpu);
7563
7564 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { 7742 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
7565 /* 7743 /*
7566 * We are here if userspace calls get_regs() in the middle of 7744 * We are here if userspace calls get_regs() in the middle of
@@ -7593,15 +7771,18 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7593 7771
7594 regs->rip = kvm_rip_read(vcpu); 7772 regs->rip = kvm_rip_read(vcpu);
7595 regs->rflags = kvm_get_rflags(vcpu); 7773 regs->rflags = kvm_get_rflags(vcpu);
7774}
7596 7775
7776int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7777{
7778 vcpu_load(vcpu);
7779 __get_regs(vcpu, regs);
7597 vcpu_put(vcpu); 7780 vcpu_put(vcpu);
7598 return 0; 7781 return 0;
7599} 7782}
7600 7783
7601int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 7784static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7602{ 7785{
7603 vcpu_load(vcpu);
7604
7605 vcpu->arch.emulate_regs_need_sync_from_vcpu = true; 7786 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
7606 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 7787 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7607 7788
@@ -7630,7 +7811,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7630 vcpu->arch.exception.pending = false; 7811 vcpu->arch.exception.pending = false;
7631 7812
7632 kvm_make_request(KVM_REQ_EVENT, vcpu); 7813 kvm_make_request(KVM_REQ_EVENT, vcpu);
7814}
7633 7815
7816int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
7817{
7818 vcpu_load(vcpu);
7819 __set_regs(vcpu, regs);
7634 vcpu_put(vcpu); 7820 vcpu_put(vcpu);
7635 return 0; 7821 return 0;
7636} 7822}
@@ -7645,13 +7831,10 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
7645} 7831}
7646EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); 7832EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
7647 7833
7648int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 7834static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
7649 struct kvm_sregs *sregs)
7650{ 7835{
7651 struct desc_ptr dt; 7836 struct desc_ptr dt;
7652 7837
7653 vcpu_load(vcpu);
7654
7655 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 7838 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
7656 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 7839 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
7657 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 7840 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7679,10 +7862,16 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
7679 7862
7680 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 7863 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
7681 7864
7682 if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) 7865 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
7683 set_bit(vcpu->arch.interrupt.nr, 7866 set_bit(vcpu->arch.interrupt.nr,
7684 (unsigned long *)sregs->interrupt_bitmap); 7867 (unsigned long *)sregs->interrupt_bitmap);
7868}
7685 7869
7870int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
7871 struct kvm_sregs *sregs)
7872{
7873 vcpu_load(vcpu);
7874 __get_sregs(vcpu, sregs);
7686 vcpu_put(vcpu); 7875 vcpu_put(vcpu);
7687 return 0; 7876 return 0;
7688} 7877}
@@ -7754,7 +7943,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
7754} 7943}
7755EXPORT_SYMBOL_GPL(kvm_task_switch); 7944EXPORT_SYMBOL_GPL(kvm_task_switch);
7756 7945
7757int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) 7946static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
7758{ 7947{
7759 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { 7948 if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
7760 /* 7949 /*
@@ -7777,8 +7966,7 @@ int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
7777 return 0; 7966 return 0;
7778} 7967}
7779 7968
7780int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 7969static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
7781 struct kvm_sregs *sregs)
7782{ 7970{
7783 struct msr_data apic_base_msr; 7971 struct msr_data apic_base_msr;
7784 int mmu_reset_needed = 0; 7972 int mmu_reset_needed = 0;
@@ -7786,8 +7974,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
7786 struct desc_ptr dt; 7974 struct desc_ptr dt;
7787 int ret = -EINVAL; 7975 int ret = -EINVAL;
7788 7976
7789 vcpu_load(vcpu);
7790
7791 if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 7977 if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
7792 (sregs->cr4 & X86_CR4_OSXSAVE)) 7978 (sregs->cr4 & X86_CR4_OSXSAVE))
7793 goto out; 7979 goto out;
@@ -7866,6 +8052,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
7866 8052
7867 ret = 0; 8053 ret = 0;
7868out: 8054out:
8055 return ret;
8056}
8057
8058int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
8059 struct kvm_sregs *sregs)
8060{
8061 int ret;
8062
8063 vcpu_load(vcpu);
8064 ret = __set_sregs(vcpu, sregs);
7869 vcpu_put(vcpu); 8065 vcpu_put(vcpu);
7870 return ret; 8066 return ret;
7871} 8067}
@@ -7992,6 +8188,45 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
7992 return 0; 8188 return 0;
7993} 8189}
7994 8190
8191static void store_regs(struct kvm_vcpu *vcpu)
8192{
8193 BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
8194
8195 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
8196 __get_regs(vcpu, &vcpu->run->s.regs.regs);
8197
8198 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
8199 __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
8200
8201 if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
8202 kvm_vcpu_ioctl_x86_get_vcpu_events(
8203 vcpu, &vcpu->run->s.regs.events);
8204}
8205
8206static int sync_regs(struct kvm_vcpu *vcpu)
8207{
8208 if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
8209 return -EINVAL;
8210
8211 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
8212 __set_regs(vcpu, &vcpu->run->s.regs.regs);
8213 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
8214 }
8215 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
8216 if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
8217 return -EINVAL;
8218 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
8219 }
8220 if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
8221 if (kvm_vcpu_ioctl_x86_set_vcpu_events(
8222 vcpu, &vcpu->run->s.regs.events))
8223 return -EINVAL;
8224 vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
8225 }
8226
8227 return 0;
8228}
8229
7995static void fx_init(struct kvm_vcpu *vcpu) 8230static void fx_init(struct kvm_vcpu *vcpu)
7996{ 8231{
7997 fpstate_init(&vcpu->arch.guest_fpu.state); 8232 fpstate_init(&vcpu->arch.guest_fpu.state);
@@ -8447,7 +8682,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
8447 8682
8448 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 8683 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
8449 mutex_init(&kvm->arch.apic_map_lock); 8684 mutex_init(&kvm->arch.apic_map_lock);
8450 mutex_init(&kvm->arch.hyperv.hv_lock);
8451 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); 8685 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
8452 8686
8453 kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); 8687 kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
@@ -8456,6 +8690,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
8456 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 8690 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
8457 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 8691 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
8458 8692
8693 kvm_hv_init_vm(kvm);
8459 kvm_page_track_init(kvm); 8694 kvm_page_track_init(kvm);
8460 kvm_mmu_init_vm(kvm); 8695 kvm_mmu_init_vm(kvm);
8461 8696
@@ -8586,6 +8821,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
8586 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 8821 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
8587 kvm_mmu_uninit_vm(kvm); 8822 kvm_mmu_uninit_vm(kvm);
8588 kvm_page_track_cleanup(kvm); 8823 kvm_page_track_cleanup(kvm);
8824 kvm_hv_destroy_vm(kvm);
8589} 8825}
8590 8826
8591void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 8827void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
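The x86.c hunks above add the sync_regs()/store_regs() plumbing: userspace can now pass register state through the mmap'ed kvm_run area instead of separate KVM_GET_REGS/KVM_SET_REGS round trips. A minimal userspace-side sketch of that flow follows; vcpu_fd and the mmap'ed run pointer are assumed to exist already, and the snippet assumes installed headers from this series (which define KVM_SYNC_X86_REGS and friends and the x86 struct kvm_sync_regs). Only the field usage comes from the patch.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/* Hedged sketch: drive one KVM_RUN iteration through the new sync_regs
 * mechanism instead of GET_REGS/SET_REGS ioctls. */
static int run_with_synced_regs(int vcpu_fd, struct kvm_run *run)
{
	/* Ask KVM to copy GPRs and sregs back into run->s.regs on exit. */
	run->kvm_valid_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;

	/* Tweak a register in place and mark only that class as dirty, so
	 * kvm_arch_vcpu_ioctl_run() calls sync_regs() before entering the guest. */
	run->s.regs.regs.rax = 0;
	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;

	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	/* store_regs() has refreshed run->s.regs by the time KVM_RUN returns. */
	printf("exit %u, rip=0x%llx\n", run->exit_reason, run->s.regs.regs.rip);
	return 0;
}

For a VMM that touches register state on most exits, this saves two ioctls per run loop iteration, which is the point of the new capability.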
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b91215d1fd80..7d35ce672989 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,12 +2,48 @@
2#ifndef ARCH_X86_KVM_X86_H 2#ifndef ARCH_X86_KVM_X86_H
3#define ARCH_X86_KVM_X86_H 3#define ARCH_X86_KVM_X86_H
4 4
5#include <asm/processor.h>
6#include <asm/mwait.h>
7#include <linux/kvm_host.h> 5#include <linux/kvm_host.h>
8#include <asm/pvclock.h> 6#include <asm/pvclock.h>
9#include "kvm_cache_regs.h" 7#include "kvm_cache_regs.h"
10 8
9#define KVM_DEFAULT_PLE_GAP 128
10#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
11#define KVM_DEFAULT_PLE_WINDOW_GROW 2
12#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
13#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
14#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
15#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
16
17static inline unsigned int __grow_ple_window(unsigned int val,
18 unsigned int base, unsigned int modifier, unsigned int max)
19{
20 u64 ret = val;
21
22 if (modifier < 1)
23 return base;
24
25 if (modifier < base)
26 ret *= modifier;
27 else
28 ret += modifier;
29
30 return min(ret, (u64)max);
31}
32
33static inline unsigned int __shrink_ple_window(unsigned int val,
34 unsigned int base, unsigned int modifier, unsigned int min)
35{
36 if (modifier < 1)
37 return base;
38
39 if (modifier < base)
40 val /= modifier;
41 else
42 val -= modifier;
43
44 return max(val, min);
45}
46
11#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL 47#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
12 48
13static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 49static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -19,19 +55,19 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
19static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, 55static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
20 bool soft) 56 bool soft)
21{ 57{
22 vcpu->arch.interrupt.pending = true; 58 vcpu->arch.interrupt.injected = true;
23 vcpu->arch.interrupt.soft = soft; 59 vcpu->arch.interrupt.soft = soft;
24 vcpu->arch.interrupt.nr = vector; 60 vcpu->arch.interrupt.nr = vector;
25} 61}
26 62
27static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) 63static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
28{ 64{
29 vcpu->arch.interrupt.pending = false; 65 vcpu->arch.interrupt.injected = false;
30} 66}
31 67
32static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) 68static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
33{ 69{
34 return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending || 70 return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
35 vcpu->arch.nmi_injected; 71 vcpu->arch.nmi_injected;
36} 72}
37 73
@@ -205,8 +241,6 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
205 return !(kvm->arch.disabled_quirks & quirk); 241 return !(kvm->arch.disabled_quirks & quirk);
206} 242}
207 243
208void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
209void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
210void kvm_set_pending_timer(struct kvm_vcpu *vcpu); 244void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
211int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 245int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
212 246
@@ -221,6 +255,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
221 gva_t addr, void *val, unsigned int bytes, 255 gva_t addr, void *val, unsigned int bytes,
222 struct x86_exception *exception); 256 struct x86_exception *exception);
223 257
258int handle_ud(struct kvm_vcpu *vcpu);
259
224void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); 260void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
225u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); 261u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
226bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); 262bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -242,6 +278,8 @@ extern unsigned int min_timer_period_us;
242 278
243extern unsigned int lapic_timer_advance_ns; 279extern unsigned int lapic_timer_advance_ns;
244 280
281extern bool enable_vmware_backdoor;
282
245extern struct static_key kvm_no_apic_vcpu; 283extern struct static_key kvm_no_apic_vcpu;
246 284
247static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 285static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
@@ -264,10 +302,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
264 __rem; \ 302 __rem; \
265 }) 303 })
266 304
267static inline bool kvm_mwait_in_guest(void) 305#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
306#define KVM_X86_DISABLE_EXITS_HTL (1 << 1)
307#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
308#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \
309 KVM_X86_DISABLE_EXITS_HTL | \
310 KVM_X86_DISABLE_EXITS_PAUSE)
311
312static inline bool kvm_mwait_in_guest(struct kvm *kvm)
313{
314 return kvm->arch.mwait_in_guest;
315}
316
317static inline bool kvm_hlt_in_guest(struct kvm *kvm)
318{
319 return kvm->arch.hlt_in_guest;
320}
321
322static inline bool kvm_pause_in_guest(struct kvm *kvm)
323{
324 return kvm->arch.pause_in_guest;
325}
326
327DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
328
329static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
330{
331 __this_cpu_write(current_vcpu, vcpu);
332}
333
334static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
268{ 335{
269 return boot_cpu_has(X86_FEATURE_MWAIT) && 336 __this_cpu_write(current_vcpu, NULL);
270 !boot_cpu_has_bug(X86_BUG_MONITOR);
271} 337}
272 338
273#endif 339#endif
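The shared __grow_ple_window()/__shrink_ple_window() helpers above let VMX and SVM reuse the same pause-loop-exiting window arithmetic, with per-vendor defaults supplied by the KVM_*_DEFAULT_PLE_* constants. A small standalone illustration of the growth behaviour follows; grow_ple_window() is a local copy of the inline so the program compiles outside the kernel, and 4096/2/UINT_MAX are simply the VMX defaults from this header.

#include <stdio.h>
#include <stdint.h>

/* Local copy of __grow_ple_window() from arch/x86/kvm/x86.h, kept in user
 * space so the behaviour can be observed as a normal program. */
static unsigned int grow_ple_window(unsigned int val, unsigned int base,
				    unsigned int modifier, unsigned int max)
{
	uint64_t ret = val;

	if (modifier < 1)
		return base;
	if (modifier < base)
		ret *= modifier;	/* small modifier: multiplicative growth */
	else
		ret += modifier;	/* large modifier: additive growth */
	return ret < max ? (unsigned int)ret : max;
}

int main(void)
{
	/* VMX defaults: window 4096, grow factor 2, capped at UINT_MAX. */
	unsigned int window = 4096;
	int i;

	for (i = 0; i < 5; i++) {
		printf("ple_window = %u\n", window);
		window = grow_ple_window(window, 4096, 2, 0xffffffffu);
	}
	return 0;
}

With a modifier smaller than the base the window doubles on every grow, so it backs off quickly from 4096 while the 64-bit intermediate and the cap keep the arithmetic from overflowing.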
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 447371f4de56..72855182b191 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -31,7 +31,6 @@
31#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
32#include <linux/hyperv.h> 32#include <linux/hyperv.h>
33#include <linux/export.h> 33#include <linux/export.h>
34#include <asm/hyperv.h>
35#include <asm/mshyperv.h> 34#include <asm/mshyperv.h>
36 35
37#include "hyperv_vmbus.h" 36#include "hyperv_vmbus.h"
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 8137b3885b99..9b82549cbbc8 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -29,7 +29,6 @@
29#include <linux/version.h> 29#include <linux/version.h>
30#include <linux/random.h> 30#include <linux/random.h>
31#include <linux/clockchips.h> 31#include <linux/clockchips.h>
32#include <asm/hyperv.h>
33#include <asm/mshyperv.h> 32#include <asm/mshyperv.h>
34#include "hyperv_vmbus.h" 33#include "hyperv_vmbus.h"
35 34
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 36d34fe3ccb3..f761bef36e77 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -27,6 +27,7 @@
27 27
28#include <linux/list.h> 28#include <linux/list.h>
29#include <asm/sync_bitops.h> 29#include <asm/sync_bitops.h>
30#include <asm/hyperv-tlfs.h>
30#include <linux/atomic.h> 31#include <linux/atomic.h>
31#include <linux/hyperv.h> 32#include <linux/hyperv.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index bc65c4d79c1f..b10fe26c4891 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,7 +36,6 @@
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/sched/task_stack.h> 37#include <linux/sched/task_stack.h>
38 38
39#include <asm/hyperv.h>
40#include <asm/mshyperv.h> 39#include <asm/mshyperv.h>
41#include <linux/notifier.h> 40#include <linux/notifier.h>
42#include <linux/ptrace.h> 41#include <linux/ptrace.h>
diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 18c6abe81fbd..728e5c5706c4 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -19,6 +19,11 @@ static inline unsigned int kvm_arch_para_features(void)
19 return 0; 19 return 0;
20} 20}
21 21
22static inline unsigned int kvm_arch_para_hints(void)
23{
24 return 0;
25}
26
22static inline bool kvm_para_available(void) 27static inline bool kvm_para_available(void)
23{ 28{
24 return false; 29 return false;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 02924ae2527e..24f03941ada8 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -57,11 +57,15 @@ struct vgic_global {
57 /* Physical address of vgic virtual cpu interface */ 57 /* Physical address of vgic virtual cpu interface */
58 phys_addr_t vcpu_base; 58 phys_addr_t vcpu_base;
59 59
60 /* GICV mapping */ 60 /* GICV mapping, kernel VA */
61 void __iomem *vcpu_base_va; 61 void __iomem *vcpu_base_va;
62 /* GICV mapping, HYP VA */
63 void __iomem *vcpu_hyp_va;
62 64
63 /* virtual control interface mapping */ 65 /* virtual control interface mapping, kernel VA */
64 void __iomem *vctrl_base; 66 void __iomem *vctrl_base;
67 /* virtual control interface mapping, HYP VA */
68 void __iomem *vctrl_hyp;
65 69
66 /* Number of implemented list registers */ 70 /* Number of implemented list registers */
67 int nr_lr; 71 int nr_lr;
@@ -209,10 +213,6 @@ struct vgic_dist {
209 213
210 int nr_spis; 214 int nr_spis;
211 215
212 /* TODO: Consider moving to global state */
213 /* Virtual control interface mapping */
214 void __iomem *vctrl_base;
215
216 /* base addresses in guest physical address space: */ 216 /* base addresses in guest physical address space: */
217 gpa_t vgic_dist_base; /* distributor */ 217 gpa_t vgic_dist_base; /* distributor */
218 union { 218 union {
@@ -263,7 +263,6 @@ struct vgic_dist {
263struct vgic_v2_cpu_if { 263struct vgic_v2_cpu_if {
264 u32 vgic_hcr; 264 u32 vgic_hcr;
265 u32 vgic_vmcr; 265 u32 vgic_vmcr;
266 u64 vgic_elrsr; /* Saved only */
267 u32 vgic_apr; 266 u32 vgic_apr;
268 u32 vgic_lr[VGIC_V2_MAX_LRS]; 267 u32 vgic_lr[VGIC_V2_MAX_LRS];
269}; 268};
@@ -272,7 +271,6 @@ struct vgic_v3_cpu_if {
272 u32 vgic_hcr; 271 u32 vgic_hcr;
273 u32 vgic_vmcr; 272 u32 vgic_vmcr;
274 u32 vgic_sre; /* Restored only, change ignored */ 273 u32 vgic_sre; /* Restored only, change ignored */
275 u32 vgic_elrsr; /* Saved only */
276 u32 vgic_ap0r[4]; 274 u32 vgic_ap0r[4];
277 u32 vgic_ap1r[4]; 275 u32 vgic_ap1r[4];
278 u64 vgic_lr[VGIC_V3_MAX_LRS]; 276 u64 vgic_lr[VGIC_V3_MAX_LRS];
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 2048f3c3b68a..192ed8fbc403 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -26,7 +26,6 @@
26#define _HYPERV_H 26#define _HYPERV_H
27 27
28#include <uapi/linux/hyperv.h> 28#include <uapi/linux/hyperv.h>
29#include <uapi/asm/hyperv.h>
30 29
31#include <linux/types.h> 30#include <linux/types.h>
32#include <linux/scatterlist.h> 31#include <linux/scatterlist.h>
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 51f6ef2c2ff4..f23b90b02898 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -9,4 +9,9 @@ static inline bool kvm_para_has_feature(unsigned int feature)
9{ 9{
10 return !!(kvm_arch_para_features() & (1UL << feature)); 10 return !!(kvm_arch_para_features() & (1UL << feature));
11} 11}
12
13static inline bool kvm_para_has_hint(unsigned int feature)
14{
15 return !!(kvm_arch_para_hints() & (1UL << feature));
16}
12#endif /* __LINUX_KVM_PARA_H */ 17#endif /* __LINUX_KVM_PARA_H */
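kvm_para_has_hint() mirrors kvm_para_has_feature() but reads the new hints CPUID leaf, which advertises properties of the deployment rather than features the guest must enable. A hedged guest-kernel sketch follows; KVM_HINTS_DEDICATED is assumed to be the hint bit defined in the x86 uapi header elsewhere in this merge, and the helper name is illustrative only.

#include <linux/types.h>
#include <linux/kvm_para.h>

/* Illustrative only: a guest-side check for skipping paravirt slow paths
 * when the host hints that vCPUs have dedicated physical CPUs. */
static bool guest_has_dedicated_pcpus(void)
{
	return kvm_para_available() &&
	       kvm_para_has_hint(KVM_HINTS_DEDICATED);
}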
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7b26d4b0b052..1065006c9bf5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -396,6 +396,10 @@ struct kvm_run {
396 char padding[256]; 396 char padding[256];
397 }; 397 };
398 398
399 /* 2048 is the size of the char array used to bound/pad the size
400 * of the union that holds sync regs.
401 */
402 #define SYNC_REGS_SIZE_BYTES 2048
399 /* 403 /*
400 * shared registers between kvm and userspace. 404 * shared registers between kvm and userspace.
401 * kvm_valid_regs specifies the register classes set by the host 405 * kvm_valid_regs specifies the register classes set by the host
@@ -407,7 +411,7 @@ struct kvm_run {
407 __u64 kvm_dirty_regs; 411 __u64 kvm_dirty_regs;
408 union { 412 union {
409 struct kvm_sync_regs regs; 413 struct kvm_sync_regs regs;
410 char padding[2048]; 414 char padding[SYNC_REGS_SIZE_BYTES];
411 } s; 415 } s;
412}; 416};
413 417
@@ -925,7 +929,7 @@ struct kvm_ppc_resize_hpt {
925#define KVM_CAP_S390_GS 140 929#define KVM_CAP_S390_GS 140
926#define KVM_CAP_S390_AIS 141 930#define KVM_CAP_S390_AIS 141
927#define KVM_CAP_SPAPR_TCE_VFIO 142 931#define KVM_CAP_SPAPR_TCE_VFIO 142
928#define KVM_CAP_X86_GUEST_MWAIT 143 932#define KVM_CAP_X86_DISABLE_EXITS 143
929#define KVM_CAP_ARM_USER_IRQ 144 933#define KVM_CAP_ARM_USER_IRQ 144
930#define KVM_CAP_S390_CMMA_MIGRATION 145 934#define KVM_CAP_S390_CMMA_MIGRATION 145
931#define KVM_CAP_PPC_FWNMI 146 935#define KVM_CAP_PPC_FWNMI 146
@@ -936,6 +940,7 @@ struct kvm_ppc_resize_hpt {
936#define KVM_CAP_PPC_GET_CPU_CHAR 151 940#define KVM_CAP_PPC_GET_CPU_CHAR 151
937#define KVM_CAP_S390_BPB 152 941#define KVM_CAP_S390_BPB 152
938#define KVM_CAP_GET_MSR_FEATURES 153 942#define KVM_CAP_GET_MSR_FEATURES 153
943#define KVM_CAP_HYPERV_EVENTFD 154
939 944
940#ifdef KVM_CAP_IRQ_ROUTING 945#ifdef KVM_CAP_IRQ_ROUTING
941 946
@@ -1375,6 +1380,10 @@ struct kvm_enc_region {
1375#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region) 1380#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region)
1376#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region) 1381#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region)
1377 1382
1383/* Available with KVM_CAP_HYPERV_EVENTFD */
1384#define KVM_HYPERV_EVENTFD _IOW(KVMIO, 0xbd, struct kvm_hyperv_eventfd)
1385
1386
1378/* Secure Encrypted Virtualization command */ 1387/* Secure Encrypted Virtualization command */
1379enum sev_cmd_id { 1388enum sev_cmd_id {
1380 /* Guest initialization commands */ 1389 /* Guest initialization commands */
@@ -1515,4 +1524,14 @@ struct kvm_assigned_msix_entry {
1515#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) 1524#define KVM_ARM_DEV_EL1_PTIMER (1 << 1)
1516#define KVM_ARM_DEV_PMU (1 << 2) 1525#define KVM_ARM_DEV_PMU (1 << 2)
1517 1526
1527struct kvm_hyperv_eventfd {
1528 __u32 conn_id;
1529 __s32 fd;
1530 __u32 flags;
1531 __u32 padding[3];
1532};
1533
1534#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
1535#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
1536
1518#endif /* __LINUX_KVM_H */ 1537#endif /* __LINUX_KVM_H */
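struct kvm_hyperv_eventfd and the KVM_HYPERV_EVENTFD ioctl above let userspace bind an eventfd to a Hyper-V connection id, so guest SIGNAL_EVENT hypercalls for that id complete in the kernel and surface as eventfd notifications instead of userspace exits. A hedged userspace sketch; vm_fd is assumed to be an open VM file descriptor and error handling is kept minimal.

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <string.h>

/* Route Hyper-V SIGNAL_EVENT for 'conn_id' to an eventfd the caller can poll. */
static int hyperv_eventfd_assign(int vm_fd, __u32 conn_id)
{
	struct kvm_hyperv_eventfd hvevfd;
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (efd < 0)
		return -1;

	memset(&hvevfd, 0, sizeof(hvevfd));
	hvevfd.conn_id = conn_id & KVM_HYPERV_CONN_ID_MASK;
	hvevfd.fd = efd;
	hvevfd.flags = 0;	/* KVM_HYPERV_EVENTFD_DEASSIGN would detach it again */

	if (ioctl(vm_fd, KVM_HYPERV_EVENTFD, &hvevfd) < 0)
		return -1;

	return efd;	/* caller reads/polls this fd to observe guest signals */
}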
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 7b26d4b0b052..6b89f87db200 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -925,7 +925,7 @@ struct kvm_ppc_resize_hpt {
925#define KVM_CAP_S390_GS 140 925#define KVM_CAP_S390_GS 140
926#define KVM_CAP_S390_AIS 141 926#define KVM_CAP_S390_AIS 141
927#define KVM_CAP_SPAPR_TCE_VFIO 142 927#define KVM_CAP_SPAPR_TCE_VFIO 142
928#define KVM_CAP_X86_GUEST_MWAIT 143 928#define KVM_CAP_X86_DISABLE_EXITS 143
929#define KVM_CAP_ARM_USER_IRQ 144 929#define KVM_CAP_ARM_USER_IRQ 144
930#define KVM_CAP_S390_CMMA_MIGRATION 145 930#define KVM_CAP_S390_CMMA_MIGRATION 145
931#define KVM_CAP_PPC_FWNMI 146 931#define KVM_CAP_PPC_FWNMI 146
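The rename from KVM_CAP_X86_GUEST_MWAIT to KVM_CAP_X86_DISABLE_EXITS (mirrored here into the tools/ copy of the header) turns the old MWAIT-only capability into a per-VM opt-out for MWAIT, HLT and PAUSE intercepts, enabled through KVM_ENABLE_CAP. A hedged sketch of the userspace side; the bit values are spelled out locally because in this series they live in arch/x86/kvm/x86.h rather than a uapi header, and which bits a given host accepts should be probed with KVM_CHECK_EXTENSION first.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

/* Local mirrors of the KVM_X86_DISABLE_EXITS_* values added in this merge. */
#define DISABLE_EXITS_MWAIT (1u << 0)
#define DISABLE_EXITS_HLT   (1u << 1)
#define DISABLE_EXITS_PAUSE (1u << 2)

/* Ask KVM to let the guest execute MWAIT/HLT/PAUSE without vmexits. */
static int disable_intercepts(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_DISABLE_EXITS;
	cap.args[0] = DISABLE_EXITS_MWAIT | DISABLE_EXITS_HLT | DISABLE_EXITS_PAUSE;

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}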
diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat
index 5898c22ba310..56c4b3f8a01b 100755
--- a/tools/kvm/kvm_stat/kvm_stat
+++ b/tools/kvm/kvm_stat/kvm_stat
@@ -1121,9 +1121,6 @@ class Tui(object):
1121 self.screen.refresh() 1121 self.screen.refresh()
1122 1122
1123 def _refresh_body(self, sleeptime): 1123 def _refresh_body(self, sleeptime):
1124 def is_child_field(field):
1125 return field.find('(') != -1
1126
1127 def insert_child(sorted_items, child, values, parent): 1124 def insert_child(sorted_items, child, values, parent):
1128 num = len(sorted_items) 1125 num = len(sorted_items)
1129 for i in range(0, num): 1126 for i in range(0, num):
@@ -1134,12 +1131,14 @@ class Tui(object):
1134 def get_sorted_events(self, stats): 1131 def get_sorted_events(self, stats):
1135 """ separate parent and child events """ 1132 """ separate parent and child events """
1136 if self._sorting == SORT_DEFAULT: 1133 if self._sorting == SORT_DEFAULT:
1137 def sortkey((_k, v)): 1134 def sortkey(pair):
1138 # sort by (delta value, overall value) 1135 # sort by (delta value, overall value)
1136 v = pair[1]
1139 return (v.delta, v.value) 1137 return (v.delta, v.value)
1140 else: 1138 else:
1141 def sortkey((_k, v)): 1139 def sortkey(pair):
1142 # sort by overall value 1140 # sort by overall value
1141 v = pair[1]
1143 return v.value 1142 return v.value
1144 1143
1145 childs = [] 1144 childs = []
@@ -1613,7 +1612,7 @@ def assign_globals():
1613 global PATH_DEBUGFS_TRACING 1612 global PATH_DEBUGFS_TRACING
1614 1613
1615 debugfs = '' 1614 debugfs = ''
1616 for line in file('/proc/mounts'): 1615 for line in open('/proc/mounts'):
1617 if line.split(' ')[0] == 'debugfs': 1616 if line.split(' ')[0] == 'debugfs':
1618 debugfs = line.split(' ')[1] 1617 debugfs = line.split(' ')[1]
1619 break 1618 break
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index bae6a4e9f2ee..2fc410bc4f33 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@ TARGETS += gpio
15TARGETS += intel_pstate 15TARGETS += intel_pstate
16TARGETS += ipc 16TARGETS += ipc
17TARGETS += kcmp 17TARGETS += kcmp
18TARGETS += kvm
18TARGETS += lib 19TARGETS += lib
19TARGETS += membarrier 20TARGETS += membarrier
20TARGETS += memfd 21TARGETS += memfd
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
new file mode 100644
index 000000000000..dc44de904797
--- /dev/null
+++ b/tools/testing/selftests/kvm/Makefile
@@ -0,0 +1,39 @@
1all:
2
3top_srcdir = ../../../../
4UNAME_M := $(shell uname -m)
5
6LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
7LIBKVM_x86_64 = lib/x86.c
8
9TEST_GEN_PROGS_x86_64 = set_sregs_test
10TEST_GEN_PROGS_x86_64 += sync_regs_test
11
12TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
13LIBKVM += $(LIBKVM_$(UNAME_M))
14
15INSTALL_HDR_PATH = $(top_srcdir)/usr
16LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
17CFLAGS += -O2 -g -I$(LINUX_HDR_PATH) -Iinclude -I$(<D)
18
19# After inclusion, $(OUTPUT) is defined and
20# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
21include ../lib.mk
22
23STATIC_LIBS := $(OUTPUT)/libkvm.a
24LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM))
25EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS)
26
27x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ))))
28$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c
29 $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
30
31$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
32 $(AR) crs $@ $^
33
34$(LINUX_HDR_PATH):
35 make -C $(top_srcdir) headers_install
36
37all: $(STATIC_LIBS) $(LINUX_HDR_PATH)
38$(TEST_GEN_PROGS): $(STATIC_LIBS)
39$(TEST_GEN_PROGS) $(LIBKVM_OBJ): | $(LINUX_HDR_PATH)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
new file mode 100644
index 000000000000..57974ad46373
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -0,0 +1,142 @@
1/*
2 * tools/testing/selftests/kvm/include/kvm_util.h
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 */
9#ifndef SELFTEST_KVM_UTIL_H
10#define SELFTEST_KVM_UTIL_H 1
11
12#include "test_util.h"
13
14#include "asm/kvm.h"
15#include "linux/kvm.h"
16#include <sys/ioctl.h>
17
18#include "sparsebit.h"
19
20/*
21 * Memslots can't cover the gfn starting at this gpa otherwise vCPUs can't be
22 * created. Only applies to VMs using EPT.
23 */
24#define KVM_DEFAULT_IDENTITY_MAP_ADDRESS 0xfffbc000ul
25
26
27/* Callers of kvm_util only have an incomplete/opaque description of the
28 * structure kvm_util is using to maintain the state of a VM.
29 */
30struct kvm_vm;
31
32typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
33typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
34
35/* Minimum allocated guest virtual and physical addresses */
36#define KVM_UTIL_MIN_VADDR 0x2000
37
38#define DEFAULT_GUEST_PHY_PAGES 512
39#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000
40#define DEFAULT_STACK_PGS 5
41
42enum vm_guest_mode {
43 VM_MODE_FLAT48PG,
44};
45
46enum vm_mem_backing_src_type {
47 VM_MEM_SRC_ANONYMOUS,
48 VM_MEM_SRC_ANONYMOUS_THP,
49 VM_MEM_SRC_ANONYMOUS_HUGETLB,
50};
51
52int kvm_check_cap(long cap);
53
54struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
55void kvm_vm_free(struct kvm_vm *vmp);
56
57int kvm_memcmp_hva_gva(void *hva,
58 struct kvm_vm *vm, const vm_vaddr_t gva, size_t len);
59
60void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
61 uint32_t data_memslot, uint32_t pgd_memslot);
62
63void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
64void vcpu_dump(FILE *stream, struct kvm_vm *vm,
65 uint32_t vcpuid, uint8_t indent);
66
67void vm_create_irqchip(struct kvm_vm *vm);
68
69void vm_userspace_mem_region_add(struct kvm_vm *vm,
70 enum vm_mem_backing_src_type src_type,
71 uint64_t guest_paddr, uint32_t slot, uint64_t npages,
72 uint32_t flags);
73
74void vcpu_ioctl(struct kvm_vm *vm,
75 uint32_t vcpuid, unsigned long ioctl, void *arg);
76void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
77void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
78void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
79vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
80 uint32_t data_memslot, uint32_t pgd_memslot);
81void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
82void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
83vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
84vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
85
86struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
87void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
88int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
89void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
90 struct kvm_mp_state *mp_state);
91void vcpu_regs_get(struct kvm_vm *vm,
92 uint32_t vcpuid, struct kvm_regs *regs);
93void vcpu_regs_set(struct kvm_vm *vm,
94 uint32_t vcpuid, struct kvm_regs *regs);
95void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
96void vcpu_sregs_get(struct kvm_vm *vm,
97 uint32_t vcpuid, struct kvm_sregs *sregs);
98void vcpu_sregs_set(struct kvm_vm *vm,
99 uint32_t vcpuid, struct kvm_sregs *sregs);
100int _vcpu_sregs_set(struct kvm_vm *vm,
101 uint32_t vcpuid, struct kvm_sregs *sregs);
102void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
103 struct kvm_vcpu_events *events);
104void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
105 struct kvm_vcpu_events *events);
106
107const char *exit_reason_str(unsigned int exit_reason);
108
109void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
110void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
111 uint32_t pgd_memslot);
112vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
113 vm_paddr_t paddr_min, uint32_t memslot);
114
115void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid);
116void vcpu_set_cpuid(
117 struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid);
118
119struct kvm_cpuid2 *allocate_kvm_cpuid2(void);
120struct kvm_cpuid_entry2 *
121find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
122 uint32_t index);
123
124static inline struct kvm_cpuid_entry2 *
125find_cpuid_entry(struct kvm_cpuid2 *cpuid, uint32_t function)
126{
127 return find_cpuid_index_entry(cpuid, function, 0);
128}
129
130struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code);
131void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
132
133struct kvm_userspace_memory_region *
134kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
135 uint64_t end);
136
137struct kvm_dirty_log *
138allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
139
140int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
141
142#endif /* SELFTEST_KVM_UTIL_H */
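kvm_util.h is the user-facing API of the new selftest library (the implementations live in lib/kvm_util.c and lib/x86.c, listed in the Makefile above). A hedged sketch of the call sequence a typical test makes; VCPU_ID and guest_code() are illustrative, and the register check is just an example assertion, not taken from any particular test.

#include "test_util.h"
#include "kvm_util.h"
#include "x86.h"

#define VCPU_ID 5

/* Minimal guest payload: spin so the host side can inspect registers. */
static void guest_code(void)
{
	for (;;)
		;
}

int main(void)
{
	struct kvm_vm *vm;
	struct kvm_regs regs;

	/* Create a VM with the default layout and one vCPU whose entry point
	 * is guest_code, then read its registers back through the library. */
	vm = vm_create_default(VCPU_ID, guest_code);

	vcpu_regs_get(vm, VCPU_ID, &regs);
	TEST_ASSERT(regs.rflags & X86_EFLAGS_FIXED,
		    "RFLAGS bit 1 should always read as set");

	kvm_vm_free(vm);
	return 0;
}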
diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h
new file mode 100644
index 000000000000..54cfeb6568d3
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/sparsebit.h
@@ -0,0 +1,75 @@
1/*
2 * tools/testing/selftests/kvm/include/sparsebit.h
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 *
9 * Header file that describes API to the sparsebit library.
10 * This library provides a memory efficient means of storing
11 * the settings of bits indexed via a uint64_t. Memory usage
12 * is reasonable, significantly less than (2^64 / 8) bytes, as
13 * long as bits that are mostly set or mostly cleared are close
14 * to each other. This library is efficient in memory usage
15 * even in the case where most bits are set.
16 */
17
18#ifndef _TEST_SPARSEBIT_H_
19#define _TEST_SPARSEBIT_H_
20
21#include <stdbool.h>
22#include <stdint.h>
23#include <stdio.h>
24
25#ifdef __cplusplus
26extern "C" {
27#endif
28
29struct sparsebit;
30typedef uint64_t sparsebit_idx_t;
31typedef uint64_t sparsebit_num_t;
32
33struct sparsebit *sparsebit_alloc(void);
34void sparsebit_free(struct sparsebit **sbitp);
35void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src);
36
37bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx);
38bool sparsebit_is_set_num(struct sparsebit *sbit,
39 sparsebit_idx_t idx, sparsebit_num_t num);
40bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx);
41bool sparsebit_is_clear_num(struct sparsebit *sbit,
42 sparsebit_idx_t idx, sparsebit_num_t num);
43sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit);
44bool sparsebit_any_set(struct sparsebit *sbit);
45bool sparsebit_any_clear(struct sparsebit *sbit);
46bool sparsebit_all_set(struct sparsebit *sbit);
47bool sparsebit_all_clear(struct sparsebit *sbit);
48sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit);
49sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit);
50sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev);
51sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev);
52sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit,
53 sparsebit_idx_t start, sparsebit_num_t num);
54sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit,
55 sparsebit_idx_t start, sparsebit_num_t num);
56
57void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx);
58void sparsebit_set_num(struct sparsebit *sbitp, sparsebit_idx_t start,
59 sparsebit_num_t num);
60void sparsebit_set_all(struct sparsebit *sbitp);
61
62void sparsebit_clear(struct sparsebit *sbitp, sparsebit_idx_t idx);
63void sparsebit_clear_num(struct sparsebit *sbitp,
64 sparsebit_idx_t start, sparsebit_num_t num);
65void sparsebit_clear_all(struct sparsebit *sbitp);
66
67void sparsebit_dump(FILE *stream, struct sparsebit *sbit,
68 unsigned int indent);
69void sparsebit_validate_internal(struct sparsebit *sbit);
70
71#ifdef __cplusplus
72}
73#endif
74
75#endif /* _TEST_SPARSEBIT_H_ */
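sparsebit is the set-of-uint64-indices abstraction the VM library uses to track allocated guest physical pages and virtual-address ranges. A small hedged example of the API declared above, assuming it is linked against the lib/sparsebit.c implementation added elsewhere in this series.

#include <stdio.h>
#include "sparsebit.h"

int main(void)
{
	struct sparsebit *sb = sparsebit_alloc();

	/* Mark indices 0x100..0x1ff as set, then probe the map. */
	sparsebit_set_num(sb, 0x100, 0x100);

	printf("0x100 set: %d\n", sparsebit_is_set(sb, 0x100));
	printf("next clear after 0x100: 0x%lx\n",
	       (unsigned long)sparsebit_next_clear(sb, 0x100));
	printf("bits set: %lu\n", (unsigned long)sparsebit_num_set(sb));

	sparsebit_free(&sb);
	return 0;
}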
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
new file mode 100644
index 000000000000..7ab98e41324f
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -0,0 +1,45 @@
1/*
2 * tools/testing/selftests/kvm/include/test_util.h
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 */
9
10#ifndef TEST_UTIL_H
11#define TEST_UTIL_H 1
12
13#include <stdlib.h>
14#include <stdarg.h>
15#include <stdbool.h>
16#include <stdio.h>
17#include <string.h>
18#include <inttypes.h>
19#include <errno.h>
20#include <unistd.h>
21#include <fcntl.h>
22
23ssize_t test_write(int fd, const void *buf, size_t count);
24ssize_t test_read(int fd, void *buf, size_t count);
25int test_seq_read(const char *path, char **bufp, size_t *sizep);
26
27void test_assert(bool exp, const char *exp_str,
28 const char *file, unsigned int line, const char *fmt, ...);
29
30#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
31
32#define TEST_ASSERT(e, fmt, ...) \
33 test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
34
35#define ASSERT_EQ(a, b) do { \
36 typeof(a) __a = (a); \
37 typeof(b) __b = (b); \
38 TEST_ASSERT(__a == __b, \
39 "ASSERT_EQ(%s, %s) failed.\n" \
40 "\t%s is %#lx\n" \
41 "\t%s is %#lx", \
42 #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \
43} while (0)
44
45#endif /* TEST_UTIL_H */
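test_util.h supplies the assertion macros the selftests are written with: TEST_ASSERT() hands the failing expression, file/line and a printf-style message to test_assert(), and ASSERT_EQ() builds on it to report both operands on a mismatch. A tiny hedged usage example (it needs the lib/ sources from this series at link time for test_assert()).

#include "test_util.h"

int main(void)
{
	int nr_vcpus = 1;

	/* The formatted message is only reported when the assertion fails. */
	TEST_ASSERT(nr_vcpus > 0, "expected at least one vcpu, got %d", nr_vcpus);

	/* ASSERT_EQ wraps TEST_ASSERT and prints both values on mismatch. */
	ASSERT_EQ(sizeof(uint64_t), 8);

	return 0;
}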
diff --git a/tools/testing/selftests/kvm/include/x86.h b/tools/testing/selftests/kvm/include/x86.h
new file mode 100644
index 000000000000..4a5b2c4c1a0f
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86.h
@@ -0,0 +1,1043 @@
1/*
2 * tools/testing/selftests/kvm/include/x86.h
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 */
9
10#ifndef SELFTEST_KVM_X86_H
11#define SELFTEST_KVM_X86_H
12
13#include <assert.h>
14#include <stdint.h>
15
16#define X86_EFLAGS_FIXED (1u << 1)
17
18#define X86_CR4_VME (1ul << 0)
19#define X86_CR4_PVI (1ul << 1)
20#define X86_CR4_TSD (1ul << 2)
21#define X86_CR4_DE (1ul << 3)
22#define X86_CR4_PSE (1ul << 4)
23#define X86_CR4_PAE (1ul << 5)
24#define X86_CR4_MCE (1ul << 6)
25#define X86_CR4_PGE (1ul << 7)
26#define X86_CR4_PCE (1ul << 8)
27#define X86_CR4_OSFXSR (1ul << 9)
28#define X86_CR4_OSXMMEXCPT (1ul << 10)
29#define X86_CR4_UMIP (1ul << 11)
30#define X86_CR4_VMXE (1ul << 13)
31#define X86_CR4_SMXE (1ul << 14)
32#define X86_CR4_FSGSBASE (1ul << 16)
33#define X86_CR4_PCIDE (1ul << 17)
34#define X86_CR4_OSXSAVE (1ul << 18)
35#define X86_CR4_SMEP (1ul << 20)
36#define X86_CR4_SMAP (1ul << 21)
37#define X86_CR4_PKE (1ul << 22)
38
39/* The enum values match the instruction encoding of each register */
40enum x86_register {
41 RAX = 0,
42 RCX,
43 RDX,
44 RBX,
45 RSP,
46 RBP,
47 RSI,
48 RDI,
49 R8,
50 R9,
51 R10,
52 R11,
53 R12,
54 R13,
55 R14,
56 R15,
57};
58
59struct desc64 {
60 uint16_t limit0;
61 uint16_t base0;
62 unsigned base1:8, type:5, dpl:2, p:1;
63 unsigned limit1:4, zero0:3, g:1, base2:8;
64 uint32_t base3;
65 uint32_t zero1;
66} __attribute__((packed));
67
68struct desc_ptr {
69 uint16_t size;
70 uint64_t address;
71} __attribute__((packed));
72
73static inline uint64_t get_desc64_base(const struct desc64 *desc)
74{
75 return ((uint64_t)desc->base3 << 32) |
76 (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
77}
78
79static inline uint64_t rdtsc(void)
80{
81 uint32_t eax, edx;
82
83 /*
84 * The lfence is to wait (on Intel CPUs) until all previous
85 * instructions have been executed.
86 */
87 __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
88 return ((uint64_t)edx) << 32 | eax;
89}
90
91static inline uint64_t rdtscp(uint32_t *aux)
92{
93 uint32_t eax, edx;
94
95 __asm__ __volatile__("rdtscp" : "=a"(eax), "=d"(edx), "=c"(*aux));
96 return ((uint64_t)edx) << 32 | eax;
97}
98
99static inline uint64_t rdmsr(uint32_t msr)
100{
101 uint32_t a, d;
102
103 __asm__ __volatile__("rdmsr" : "=a"(a), "=d"(d) : "c"(msr) : "memory");
104
105 return a | ((uint64_t) d << 32);
106}
107
108static inline void wrmsr(uint32_t msr, uint64_t value)
109{
110 uint32_t a = value;
111 uint32_t d = value >> 32;
112
113 __asm__ __volatile__("wrmsr" :: "a"(a), "d"(d), "c"(msr) : "memory");
114}
115
116
117static inline uint16_t inw(uint16_t port)
118{
119 uint16_t tmp;
120
121 __asm__ __volatile__("in %%dx, %%ax"
122 : /* output */ "=a" (tmp)
123 : /* input */ "d" (port));
124
125 return tmp;
126}
127
128static inline uint16_t get_es(void)
129{
130 uint16_t es;
131
132 __asm__ __volatile__("mov %%es, %[es]"
133 : /* output */ [es]"=rm"(es));
134 return es;
135}
136
137static inline uint16_t get_cs(void)
138{
139 uint16_t cs;
140
141 __asm__ __volatile__("mov %%cs, %[cs]"
142 : /* output */ [cs]"=rm"(cs));
143 return cs;
144}
145
146static inline uint16_t get_ss(void)
147{
148 uint16_t ss;
149
150 __asm__ __volatile__("mov %%ss, %[ss]"
151 : /* output */ [ss]"=rm"(ss));
152 return ss;
153}
154
155static inline uint16_t get_ds(void)
156{
157 uint16_t ds;
158
159 __asm__ __volatile__("mov %%ds, %[ds]"
160 : /* output */ [ds]"=rm"(ds));
161 return ds;
162}
163
164static inline uint16_t get_fs(void)
165{
166 uint16_t fs;
167
168 __asm__ __volatile__("mov %%fs, %[fs]"
169 : /* output */ [fs]"=rm"(fs));
170 return fs;
171}
172
173static inline uint16_t get_gs(void)
174{
175 uint16_t gs;
176
177 __asm__ __volatile__("mov %%gs, %[gs]"
178 : /* output */ [gs]"=rm"(gs));
179 return gs;
180}
181
182static inline uint16_t get_tr(void)
183{
184 uint16_t tr;
185
186 __asm__ __volatile__("str %[tr]"
187 : /* output */ [tr]"=rm"(tr));
188 return tr;
189}
190
191static inline uint64_t get_cr0(void)
192{
193 uint64_t cr0;
194
195 __asm__ __volatile__("mov %%cr0, %[cr0]"
196 : /* output */ [cr0]"=r"(cr0));
197 return cr0;
198}
199
200static inline uint64_t get_cr3(void)
201{
202 uint64_t cr3;
203
204 __asm__ __volatile__("mov %%cr3, %[cr3]"
205 : /* output */ [cr3]"=r"(cr3));
206 return cr3;
207}
208
209static inline uint64_t get_cr4(void)
210{
211 uint64_t cr4;
212
213 __asm__ __volatile__("mov %%cr4, %[cr4]"
214 : /* output */ [cr4]"=r"(cr4));
215 return cr4;
216}
217
218static inline void set_cr4(uint64_t val)
219{
220 __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory");
221}
222
223static inline uint64_t get_gdt_base(void)
224{
225 struct desc_ptr gdt;
226 __asm__ __volatile__("sgdt %[gdt]"
227 : /* output */ [gdt]"=m"(gdt));
228 return gdt.address;
229}
230
231static inline uint64_t get_idt_base(void)
232{
233 struct desc_ptr idt;
234 __asm__ __volatile__("sidt %[idt]"
235 : /* output */ [idt]"=m"(idt));
236 return idt.address;
237}
238
239#define SET_XMM(__var, __xmm) \
240 asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
241
242static inline void set_xmm(int n, unsigned long val)
243{
244 switch (n) {
245 case 0:
246 SET_XMM(val, xmm0);
247 break;
248 case 1:
249 SET_XMM(val, xmm1);
250 break;
251 case 2:
252 SET_XMM(val, xmm2);
253 break;
254 case 3:
255 SET_XMM(val, xmm3);
256 break;
257 case 4:
258 SET_XMM(val, xmm4);
259 break;
260 case 5:
261 SET_XMM(val, xmm5);
262 break;
263 case 6:
264 SET_XMM(val, xmm6);
265 break;
266 case 7:
267 SET_XMM(val, xmm7);
268 break;
269 }
270}
271
272typedef unsigned long v1di __attribute__ ((vector_size (8)));
273static inline unsigned long get_xmm(int n)
274{
275 assert(n >= 0 && n <= 7);
276
277 register v1di xmm0 __asm__("%xmm0");
278 register v1di xmm1 __asm__("%xmm1");
279 register v1di xmm2 __asm__("%xmm2");
280 register v1di xmm3 __asm__("%xmm3");
281 register v1di xmm4 __asm__("%xmm4");
282 register v1di xmm5 __asm__("%xmm5");
283 register v1di xmm6 __asm__("%xmm6");
284 register v1di xmm7 __asm__("%xmm7");
285 switch (n) {
286 case 0:
287 return (unsigned long)xmm0;
288 case 1:
289 return (unsigned long)xmm1;
290 case 2:
291 return (unsigned long)xmm2;
292 case 3:
293 return (unsigned long)xmm3;
294 case 4:
295 return (unsigned long)xmm4;
296 case 5:
297 return (unsigned long)xmm5;
298 case 6:
299 return (unsigned long)xmm6;
300 case 7:
301 return (unsigned long)xmm7;
302 }
303 return 0;
304}
305
306/*
307 * Basic CPU control in CR0
308 */
309#define X86_CR0_PE (1UL<<0) /* Protection Enable */
310#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */
311#define X86_CR0_EM (1UL<<2) /* Emulation */
312#define X86_CR0_TS (1UL<<3) /* Task Switched */
313#define X86_CR0_ET (1UL<<4) /* Extension Type */
314#define X86_CR0_NE (1UL<<5) /* Numeric Error */
315#define X86_CR0_WP (1UL<<16) /* Write Protect */
316#define X86_CR0_AM (1UL<<18) /* Alignment Mask */
317#define X86_CR0_NW (1UL<<29) /* Not Write-through */
318#define X86_CR0_CD (1UL<<30) /* Cache Disable */
319#define X86_CR0_PG (1UL<<31) /* Paging */
320
321/*
322 * CPU model specific register (MSR) numbers.
323 */
324
325/* x86-64 specific MSRs */
326#define MSR_EFER 0xc0000080 /* extended feature register */
327#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
328#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
329#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
330#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
331#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
332#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
333#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
334#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
335
336/* EFER bits: */
337#define EFER_SCE (1<<0) /* SYSCALL/SYSRET */
338#define EFER_LME (1<<8) /* Long mode enable */
339#define EFER_LMA (1<<10) /* Long mode active (read-only) */
340#define EFER_NX (1<<11) /* No execute enable */
341#define EFER_SVME (1<<12) /* Enable virtualization */
342#define EFER_LMSLE (1<<13) /* Long Mode Segment Limit Enable */
343#define EFER_FFXSR (1<<14) /* Enable Fast FXSAVE/FXRSTOR */
344
345/* Intel MSRs. Some also available on other CPUs */
346
347#define MSR_PPIN_CTL 0x0000004e
348#define MSR_PPIN 0x0000004f
349
350#define MSR_IA32_PERFCTR0 0x000000c1
351#define MSR_IA32_PERFCTR1 0x000000c2
352#define MSR_FSB_FREQ 0x000000cd
353#define MSR_PLATFORM_INFO 0x000000ce
354#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
355#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
356
357#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
358#define NHM_C3_AUTO_DEMOTE (1UL << 25)
359#define NHM_C1_AUTO_DEMOTE (1UL << 26)
360#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
361#define SNB_C1_AUTO_UNDEMOTE (1UL << 27)
362#define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
363
364#define MSR_MTRRcap 0x000000fe
365#define MSR_IA32_BBL_CR_CTL 0x00000119
366#define MSR_IA32_BBL_CR_CTL3 0x0000011e
367
368#define MSR_IA32_SYSENTER_CS 0x00000174
369#define MSR_IA32_SYSENTER_ESP 0x00000175
370#define MSR_IA32_SYSENTER_EIP 0x00000176
371
372#define MSR_IA32_MCG_CAP 0x00000179
373#define MSR_IA32_MCG_STATUS 0x0000017a
374#define MSR_IA32_MCG_CTL 0x0000017b
375#define MSR_IA32_MCG_EXT_CTL 0x000004d0
376
377#define MSR_OFFCORE_RSP_0 0x000001a6
378#define MSR_OFFCORE_RSP_1 0x000001a7
379#define MSR_TURBO_RATIO_LIMIT 0x000001ad
380#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
381#define MSR_TURBO_RATIO_LIMIT2 0x000001af
382
383#define MSR_LBR_SELECT 0x000001c8
384#define MSR_LBR_TOS 0x000001c9
385#define MSR_LBR_NHM_FROM 0x00000680
386#define MSR_LBR_NHM_TO 0x000006c0
387#define MSR_LBR_CORE_FROM 0x00000040
388#define MSR_LBR_CORE_TO 0x00000060
389
390#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
391#define LBR_INFO_MISPRED BIT_ULL(63)
392#define LBR_INFO_IN_TX BIT_ULL(62)
393#define LBR_INFO_ABORT BIT_ULL(61)
394#define LBR_INFO_CYCLES 0xffff
395
396#define MSR_IA32_PEBS_ENABLE 0x000003f1
397#define MSR_IA32_DS_AREA 0x00000600
398#define MSR_IA32_PERF_CAPABILITIES 0x00000345
399#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
400
401#define MSR_IA32_RTIT_CTL 0x00000570
402#define MSR_IA32_RTIT_STATUS 0x00000571
403#define MSR_IA32_RTIT_ADDR0_A 0x00000580
404#define MSR_IA32_RTIT_ADDR0_B 0x00000581
405#define MSR_IA32_RTIT_ADDR1_A 0x00000582
406#define MSR_IA32_RTIT_ADDR1_B 0x00000583
407#define MSR_IA32_RTIT_ADDR2_A 0x00000584
408#define MSR_IA32_RTIT_ADDR2_B 0x00000585
409#define MSR_IA32_RTIT_ADDR3_A 0x00000586
410#define MSR_IA32_RTIT_ADDR3_B 0x00000587
411#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
412#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
413#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
414
415#define MSR_MTRRfix64K_00000 0x00000250
416#define MSR_MTRRfix16K_80000 0x00000258
417#define MSR_MTRRfix16K_A0000 0x00000259
418#define MSR_MTRRfix4K_C0000 0x00000268
419#define MSR_MTRRfix4K_C8000 0x00000269
420#define MSR_MTRRfix4K_D0000 0x0000026a
421#define MSR_MTRRfix4K_D8000 0x0000026b
422#define MSR_MTRRfix4K_E0000 0x0000026c
423#define MSR_MTRRfix4K_E8000 0x0000026d
424#define MSR_MTRRfix4K_F0000 0x0000026e
425#define MSR_MTRRfix4K_F8000 0x0000026f
426#define MSR_MTRRdefType 0x000002ff
427
428#define MSR_IA32_CR_PAT 0x00000277
429
430#define MSR_IA32_DEBUGCTLMSR 0x000001d9
431#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
432#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
433#define MSR_IA32_LASTINTFROMIP 0x000001dd
434#define MSR_IA32_LASTINTTOIP 0x000001de
435
436/* DEBUGCTLMSR bits (others vary by model): */
437#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
438#define DEBUGCTLMSR_BTF_SHIFT 1
439#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
440#define DEBUGCTLMSR_TR (1UL << 6)
441#define DEBUGCTLMSR_BTS (1UL << 7)
442#define DEBUGCTLMSR_BTINT (1UL << 8)
443#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
444#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
445#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
446#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
447#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
448
449#define MSR_PEBS_FRONTEND 0x000003f7
450
451#define MSR_IA32_POWER_CTL 0x000001fc
452
453#define MSR_IA32_MC0_CTL 0x00000400
454#define MSR_IA32_MC0_STATUS 0x00000401
455#define MSR_IA32_MC0_ADDR 0x00000402
456#define MSR_IA32_MC0_MISC 0x00000403
457
458/* C-state Residency Counters */
459#define MSR_PKG_C3_RESIDENCY 0x000003f8
460#define MSR_PKG_C6_RESIDENCY 0x000003f9
461#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa
462#define MSR_PKG_C7_RESIDENCY 0x000003fa
463#define MSR_CORE_C3_RESIDENCY 0x000003fc
464#define MSR_CORE_C6_RESIDENCY 0x000003fd
465#define MSR_CORE_C7_RESIDENCY 0x000003fe
466#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff
467#define MSR_PKG_C2_RESIDENCY 0x0000060d
468#define MSR_PKG_C8_RESIDENCY 0x00000630
469#define MSR_PKG_C9_RESIDENCY 0x00000631
470#define MSR_PKG_C10_RESIDENCY 0x00000632
471
472/* Interrupt Response Limit */
473#define MSR_PKGC3_IRTL 0x0000060a
474#define MSR_PKGC6_IRTL 0x0000060b
475#define MSR_PKGC7_IRTL 0x0000060c
476#define MSR_PKGC8_IRTL 0x00000633
477#define MSR_PKGC9_IRTL 0x00000634
478#define MSR_PKGC10_IRTL 0x00000635
479
480/* Run Time Average Power Limiting (RAPL) Interface */
481
482#define MSR_RAPL_POWER_UNIT 0x00000606
483
484#define MSR_PKG_POWER_LIMIT 0x00000610
485#define MSR_PKG_ENERGY_STATUS 0x00000611
486#define MSR_PKG_PERF_STATUS 0x00000613
487#define MSR_PKG_POWER_INFO 0x00000614
488
489#define MSR_DRAM_POWER_LIMIT 0x00000618
490#define MSR_DRAM_ENERGY_STATUS 0x00000619
491#define MSR_DRAM_PERF_STATUS 0x0000061b
492#define MSR_DRAM_POWER_INFO 0x0000061c
493
494#define MSR_PP0_POWER_LIMIT 0x00000638
495#define MSR_PP0_ENERGY_STATUS 0x00000639
496#define MSR_PP0_POLICY 0x0000063a
497#define MSR_PP0_PERF_STATUS 0x0000063b
498
499#define MSR_PP1_POWER_LIMIT 0x00000640
500#define MSR_PP1_ENERGY_STATUS 0x00000641
501#define MSR_PP1_POLICY 0x00000642
502
503/* Config TDP MSRs */
504#define MSR_CONFIG_TDP_NOMINAL 0x00000648
505#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
506#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
507#define MSR_CONFIG_TDP_CONTROL 0x0000064B
508#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
509
510#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D
511
512#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
513#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
514#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
515#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B
516
517#define MSR_CORE_C1_RES 0x00000660
518#define MSR_MODULE_C6_RES_MS 0x00000664
519
520#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
521#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
522
523#define MSR_ATOM_CORE_RATIOS 0x0000066a
524#define MSR_ATOM_CORE_VIDS 0x0000066b
525#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c
526#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d
527
528
529#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
530#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
531#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
532
533/* Hardware P state interface */
534#define MSR_PPERF 0x0000064e
535#define MSR_PERF_LIMIT_REASONS 0x0000064f
536#define MSR_PM_ENABLE 0x00000770
537#define MSR_HWP_CAPABILITIES 0x00000771
538#define MSR_HWP_REQUEST_PKG 0x00000772
539#define MSR_HWP_INTERRUPT 0x00000773
540#define MSR_HWP_REQUEST 0x00000774
541#define MSR_HWP_STATUS 0x00000777
542
543/* CPUID.6.EAX */
544#define HWP_BASE_BIT (1<<7)
545#define HWP_NOTIFICATIONS_BIT (1<<8)
546#define HWP_ACTIVITY_WINDOW_BIT (1<<9)
547#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10)
548#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11)
549
550/* IA32_HWP_CAPABILITIES */
551#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff)
552#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff)
553#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff)
554#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff)
555
556/* IA32_HWP_REQUEST */
557#define HWP_MIN_PERF(x) (x & 0xff)
558#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
559#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
560#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24)
561#define HWP_EPP_PERFORMANCE 0x00
562#define HWP_EPP_BALANCE_PERFORMANCE 0x80
563#define HWP_EPP_BALANCE_POWERSAVE 0xC0
564#define HWP_EPP_POWERSAVE 0xFF
565#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32)
566#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42)
567
568/* IA32_HWP_STATUS */
569#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
570#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4)
571
572/* IA32_HWP_INTERRUPT */
573#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1)
574#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2)
575
576#define MSR_AMD64_MC0_MASK 0xc0010044
577
578#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
579#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
580#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
581#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
582
583#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
584
585/* These are consecutive and not in the normal 4er MCE bank block */
586#define MSR_IA32_MC0_CTL2 0x00000280
587#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
588
589#define MSR_P6_PERFCTR0 0x000000c1
590#define MSR_P6_PERFCTR1 0x000000c2
591#define MSR_P6_EVNTSEL0 0x00000186
592#define MSR_P6_EVNTSEL1 0x00000187
593
594#define MSR_KNC_PERFCTR0 0x00000020
595#define MSR_KNC_PERFCTR1 0x00000021
596#define MSR_KNC_EVNTSEL0 0x00000028
597#define MSR_KNC_EVNTSEL1 0x00000029
598
599/* Alternative perfctr range with full access. */
600#define MSR_IA32_PMC0 0x000004c1
601
602/* AMD64 MSRs. Not complete. See the architecture manual for a more
603 complete list. */
604
605#define MSR_AMD64_PATCH_LEVEL 0x0000008b
606#define MSR_AMD64_TSC_RATIO 0xc0000104
607#define MSR_AMD64_NB_CFG 0xc001001f
608#define MSR_AMD64_PATCH_LOADER 0xc0010020
609#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
610#define MSR_AMD64_OSVW_STATUS 0xc0010141
611#define MSR_AMD64_LS_CFG 0xc0011020
612#define MSR_AMD64_DC_CFG 0xc0011022
613#define MSR_AMD64_BU_CFG2 0xc001102a
614#define MSR_AMD64_IBSFETCHCTL 0xc0011030
615#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
616#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
617#define MSR_AMD64_IBSFETCH_REG_COUNT 3
618#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
619#define MSR_AMD64_IBSOPCTL 0xc0011033
620#define MSR_AMD64_IBSOPRIP 0xc0011034
621#define MSR_AMD64_IBSOPDATA 0xc0011035
622#define MSR_AMD64_IBSOPDATA2 0xc0011036
623#define MSR_AMD64_IBSOPDATA3 0xc0011037
624#define MSR_AMD64_IBSDCLINAD 0xc0011038
625#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
626#define MSR_AMD64_IBSOP_REG_COUNT 7
627#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
628#define MSR_AMD64_IBSCTL 0xc001103a
629#define MSR_AMD64_IBSBRTARGET 0xc001103b
630#define MSR_AMD64_IBSOPDATA4 0xc001103d
631#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
632#define MSR_AMD64_SEV 0xc0010131
633#define MSR_AMD64_SEV_ENABLED_BIT 0
634#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
635
636/* Fam 17h MSRs */
637#define MSR_F17H_IRPERF 0xc00000e9
638
639/* Fam 16h MSRs */
640#define MSR_F16H_L2I_PERF_CTL 0xc0010230
641#define MSR_F16H_L2I_PERF_CTR 0xc0010231
642#define MSR_F16H_DR1_ADDR_MASK 0xc0011019
643#define MSR_F16H_DR2_ADDR_MASK 0xc001101a
644#define MSR_F16H_DR3_ADDR_MASK 0xc001101b
645#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
646
647/* Fam 15h MSRs */
648#define MSR_F15H_PERF_CTL 0xc0010200
649#define MSR_F15H_PERF_CTR 0xc0010201
650#define MSR_F15H_NB_PERF_CTL 0xc0010240
651#define MSR_F15H_NB_PERF_CTR 0xc0010241
652#define MSR_F15H_PTSC 0xc0010280
653#define MSR_F15H_IC_CFG 0xc0011021
654
655/* Fam 10h MSRs */
656#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
657#define FAM10H_MMIO_CONF_ENABLE (1<<0)
658#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
659#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
660#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
661#define FAM10H_MMIO_CONF_BASE_SHIFT 20
662#define MSR_FAM10H_NODE_ID 0xc001100c
663#define MSR_F10H_DECFG 0xc0011029
664#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
665#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
666
667/* K8 MSRs */
668#define MSR_K8_TOP_MEM1 0xc001001a
669#define MSR_K8_TOP_MEM2 0xc001001d
670#define MSR_K8_SYSCFG 0xc0010010
671#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
672#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
673#define MSR_K8_INT_PENDING_MSG 0xc0010055
674/* C1E active bits in int pending message */
675#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
676#define MSR_K8_TSEG_ADDR 0xc0010112
677#define MSR_K8_TSEG_MASK 0xc0010113
678#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
679#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
680#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
681
682/* K7 MSRs */
683#define MSR_K7_EVNTSEL0 0xc0010000
684#define MSR_K7_PERFCTR0 0xc0010004
685#define MSR_K7_EVNTSEL1 0xc0010001
686#define MSR_K7_PERFCTR1 0xc0010005
687#define MSR_K7_EVNTSEL2 0xc0010002
688#define MSR_K7_PERFCTR2 0xc0010006
689#define MSR_K7_EVNTSEL3 0xc0010003
690#define MSR_K7_PERFCTR3 0xc0010007
691#define MSR_K7_CLK_CTL 0xc001001b
692#define MSR_K7_HWCR 0xc0010015
693#define MSR_K7_HWCR_SMMLOCK_BIT 0
694#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
695#define MSR_K7_FID_VID_CTL 0xc0010041
696#define MSR_K7_FID_VID_STATUS 0xc0010042
697
698/* K6 MSRs */
699#define MSR_K6_WHCR 0xc0000082
700#define MSR_K6_UWCCR 0xc0000085
701#define MSR_K6_EPMR 0xc0000086
702#define MSR_K6_PSOR 0xc0000087
703#define MSR_K6_PFIR 0xc0000088
704
705/* Centaur-Hauls/IDT defined MSRs. */
706#define MSR_IDT_FCR1 0x00000107
707#define MSR_IDT_FCR2 0x00000108
708#define MSR_IDT_FCR3 0x00000109
709#define MSR_IDT_FCR4 0x0000010a
710
711#define MSR_IDT_MCR0 0x00000110
712#define MSR_IDT_MCR1 0x00000111
713#define MSR_IDT_MCR2 0x00000112
714#define MSR_IDT_MCR3 0x00000113
715#define MSR_IDT_MCR4 0x00000114
716#define MSR_IDT_MCR5 0x00000115
717#define MSR_IDT_MCR6 0x00000116
718#define MSR_IDT_MCR7 0x00000117
719#define MSR_IDT_MCR_CTRL 0x00000120
720
721/* VIA Cyrix defined MSRs*/
722#define MSR_VIA_FCR 0x00001107
723#define MSR_VIA_LONGHAUL 0x0000110a
724#define MSR_VIA_RNG 0x0000110b
725#define MSR_VIA_BCR2 0x00001147
726
727/* Transmeta defined MSRs */
728#define MSR_TMTA_LONGRUN_CTRL 0x80868010
729#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
730#define MSR_TMTA_LRTI_READOUT 0x80868018
731#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
732
733/* Intel defined MSRs. */
734#define MSR_IA32_P5_MC_ADDR 0x00000000
735#define MSR_IA32_P5_MC_TYPE 0x00000001
736#define MSR_IA32_TSC 0x00000010
737#define MSR_IA32_PLATFORM_ID 0x00000017
738#define MSR_IA32_EBL_CR_POWERON 0x0000002a
739#define MSR_EBC_FREQUENCY_ID 0x0000002c
740#define MSR_SMI_COUNT 0x00000034
741#define MSR_IA32_FEATURE_CONTROL 0x0000003a
742#define MSR_IA32_TSC_ADJUST 0x0000003b
743#define MSR_IA32_BNDCFGS 0x00000d90
744
745#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
746
747#define MSR_IA32_XSS 0x00000da0
748
749#define FEATURE_CONTROL_LOCKED (1<<0)
750#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
751#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
752#define FEATURE_CONTROL_LMCE (1<<20)
753
754#define MSR_IA32_APICBASE 0x0000001b
755#define MSR_IA32_APICBASE_BSP (1<<8)
756#define MSR_IA32_APICBASE_ENABLE (1<<11)
757#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
758
759#define MSR_IA32_TSCDEADLINE 0x000006e0
760
761#define MSR_IA32_UCODE_WRITE 0x00000079
762#define MSR_IA32_UCODE_REV 0x0000008b
763
764#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
765#define MSR_IA32_SMBASE 0x0000009e
766
767#define MSR_IA32_PERF_STATUS 0x00000198
768#define MSR_IA32_PERF_CTL 0x00000199
769#define INTEL_PERF_CTL_MASK 0xffff
770#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
771#define MSR_AMD_PERF_STATUS 0xc0010063
772#define MSR_AMD_PERF_CTL 0xc0010062
773
774#define MSR_IA32_MPERF 0x000000e7
775#define MSR_IA32_APERF 0x000000e8
776
777#define MSR_IA32_THERM_CONTROL 0x0000019a
778#define MSR_IA32_THERM_INTERRUPT 0x0000019b
779
780#define THERM_INT_HIGH_ENABLE (1 << 0)
781#define THERM_INT_LOW_ENABLE (1 << 1)
782#define THERM_INT_PLN_ENABLE (1 << 24)
783
784#define MSR_IA32_THERM_STATUS 0x0000019c
785
786#define THERM_STATUS_PROCHOT (1 << 0)
787#define THERM_STATUS_POWER_LIMIT (1 << 10)
788
789#define MSR_THERM2_CTL 0x0000019d
790
791#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
792
793#define MSR_IA32_MISC_ENABLE 0x000001a0
794
795#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
796
797#define MSR_MISC_FEATURE_CONTROL 0x000001a4
798#define MSR_MISC_PWR_MGMT 0x000001aa
799
800#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
801#define ENERGY_PERF_BIAS_PERFORMANCE 0
802#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
803#define ENERGY_PERF_BIAS_NORMAL 6
804#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
805#define ENERGY_PERF_BIAS_POWERSAVE 15
806
807#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
808
809#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
810#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
811
812#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
813
814#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
815#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
816#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
817
818/* Thermal Thresholds Support */
819#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
820#define THERM_SHIFT_THRESHOLD0 8
821#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
822#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
823#define THERM_SHIFT_THRESHOLD1 16
824#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
825#define THERM_STATUS_THRESHOLD0 (1 << 6)
826#define THERM_LOG_THRESHOLD0 (1 << 7)
827#define THERM_STATUS_THRESHOLD1 (1 << 8)
828#define THERM_LOG_THRESHOLD1 (1 << 9)
829
830/* MISC_ENABLE bits: architectural */
831#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
832#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
833#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
834#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
835#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
836#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
837#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
838#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
839#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
840#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
841#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
842#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
843#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
844#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
845#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
846#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT)
847#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
848#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
849#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
850#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
851
852/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
853#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
854#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
855#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
856#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
857#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
858#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
859#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
860#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
861#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
862#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
863#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
864#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
865#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
866#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
867#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
868#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
869#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
870#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
871#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
872#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
873#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
874#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
875#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
876#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
877#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
878#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
879#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
880#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
881#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
882#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
883
884/* MISC_FEATURES_ENABLES non-architectural features */
885#define MSR_MISC_FEATURES_ENABLES 0x00000140
886
887#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0
888#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
889#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1
890
891#define MSR_IA32_TSC_DEADLINE 0x000006E0
892
893/* P4/Xeon+ specific */
894#define MSR_IA32_MCG_EAX 0x00000180
895#define MSR_IA32_MCG_EBX 0x00000181
896#define MSR_IA32_MCG_ECX 0x00000182
897#define MSR_IA32_MCG_EDX 0x00000183
898#define MSR_IA32_MCG_ESI 0x00000184
899#define MSR_IA32_MCG_EDI 0x00000185
900#define MSR_IA32_MCG_EBP 0x00000186
901#define MSR_IA32_MCG_ESP 0x00000187
902#define MSR_IA32_MCG_EFLAGS 0x00000188
903#define MSR_IA32_MCG_EIP 0x00000189
904#define MSR_IA32_MCG_RESERVED 0x0000018a
905
906/* Pentium IV performance counter MSRs */
907#define MSR_P4_BPU_PERFCTR0 0x00000300
908#define MSR_P4_BPU_PERFCTR1 0x00000301
909#define MSR_P4_BPU_PERFCTR2 0x00000302
910#define MSR_P4_BPU_PERFCTR3 0x00000303
911#define MSR_P4_MS_PERFCTR0 0x00000304
912#define MSR_P4_MS_PERFCTR1 0x00000305
913#define MSR_P4_MS_PERFCTR2 0x00000306
914#define MSR_P4_MS_PERFCTR3 0x00000307
915#define MSR_P4_FLAME_PERFCTR0 0x00000308
916#define MSR_P4_FLAME_PERFCTR1 0x00000309
917#define MSR_P4_FLAME_PERFCTR2 0x0000030a
918#define MSR_P4_FLAME_PERFCTR3 0x0000030b
919#define MSR_P4_IQ_PERFCTR0 0x0000030c
920#define MSR_P4_IQ_PERFCTR1 0x0000030d
921#define MSR_P4_IQ_PERFCTR2 0x0000030e
922#define MSR_P4_IQ_PERFCTR3 0x0000030f
923#define MSR_P4_IQ_PERFCTR4 0x00000310
924#define MSR_P4_IQ_PERFCTR5 0x00000311
925#define MSR_P4_BPU_CCCR0 0x00000360
926#define MSR_P4_BPU_CCCR1 0x00000361
927#define MSR_P4_BPU_CCCR2 0x00000362
928#define MSR_P4_BPU_CCCR3 0x00000363
929#define MSR_P4_MS_CCCR0 0x00000364
930#define MSR_P4_MS_CCCR1 0x00000365
931#define MSR_P4_MS_CCCR2 0x00000366
932#define MSR_P4_MS_CCCR3 0x00000367
933#define MSR_P4_FLAME_CCCR0 0x00000368
934#define MSR_P4_FLAME_CCCR1 0x00000369
935#define MSR_P4_FLAME_CCCR2 0x0000036a
936#define MSR_P4_FLAME_CCCR3 0x0000036b
937#define MSR_P4_IQ_CCCR0 0x0000036c
938#define MSR_P4_IQ_CCCR1 0x0000036d
939#define MSR_P4_IQ_CCCR2 0x0000036e
940#define MSR_P4_IQ_CCCR3 0x0000036f
941#define MSR_P4_IQ_CCCR4 0x00000370
942#define MSR_P4_IQ_CCCR5 0x00000371
943#define MSR_P4_ALF_ESCR0 0x000003ca
944#define MSR_P4_ALF_ESCR1 0x000003cb
945#define MSR_P4_BPU_ESCR0 0x000003b2
946#define MSR_P4_BPU_ESCR1 0x000003b3
947#define MSR_P4_BSU_ESCR0 0x000003a0
948#define MSR_P4_BSU_ESCR1 0x000003a1
949#define MSR_P4_CRU_ESCR0 0x000003b8
950#define MSR_P4_CRU_ESCR1 0x000003b9
951#define MSR_P4_CRU_ESCR2 0x000003cc
952#define MSR_P4_CRU_ESCR3 0x000003cd
953#define MSR_P4_CRU_ESCR4 0x000003e0
954#define MSR_P4_CRU_ESCR5 0x000003e1
955#define MSR_P4_DAC_ESCR0 0x000003a8
956#define MSR_P4_DAC_ESCR1 0x000003a9
957#define MSR_P4_FIRM_ESCR0 0x000003a4
958#define MSR_P4_FIRM_ESCR1 0x000003a5
959#define MSR_P4_FLAME_ESCR0 0x000003a6
960#define MSR_P4_FLAME_ESCR1 0x000003a7
961#define MSR_P4_FSB_ESCR0 0x000003a2
962#define MSR_P4_FSB_ESCR1 0x000003a3
963#define MSR_P4_IQ_ESCR0 0x000003ba
964#define MSR_P4_IQ_ESCR1 0x000003bb
965#define MSR_P4_IS_ESCR0 0x000003b4
966#define MSR_P4_IS_ESCR1 0x000003b5
967#define MSR_P4_ITLB_ESCR0 0x000003b6
968#define MSR_P4_ITLB_ESCR1 0x000003b7
969#define MSR_P4_IX_ESCR0 0x000003c8
970#define MSR_P4_IX_ESCR1 0x000003c9
971#define MSR_P4_MOB_ESCR0 0x000003aa
972#define MSR_P4_MOB_ESCR1 0x000003ab
973#define MSR_P4_MS_ESCR0 0x000003c0
974#define MSR_P4_MS_ESCR1 0x000003c1
975#define MSR_P4_PMH_ESCR0 0x000003ac
976#define MSR_P4_PMH_ESCR1 0x000003ad
977#define MSR_P4_RAT_ESCR0 0x000003bc
978#define MSR_P4_RAT_ESCR1 0x000003bd
979#define MSR_P4_SAAT_ESCR0 0x000003ae
980#define MSR_P4_SAAT_ESCR1 0x000003af
981#define MSR_P4_SSU_ESCR0 0x000003be
982#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
983
984#define MSR_P4_TBPU_ESCR0 0x000003c2
985#define MSR_P4_TBPU_ESCR1 0x000003c3
986#define MSR_P4_TC_ESCR0 0x000003c4
987#define MSR_P4_TC_ESCR1 0x000003c5
988#define MSR_P4_U2L_ESCR0 0x000003b0
989#define MSR_P4_U2L_ESCR1 0x000003b1
990
991#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
992
993/* Intel Core-based CPU performance counters */
994#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
995#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
996#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
997#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
998#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
999#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
1000#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
1001
1002/* Geode defined MSRs */
1003#define MSR_GEODE_BUSCONT_CONF0 0x00001900
1004
1005/* Intel VT MSRs */
1006#define MSR_IA32_VMX_BASIC 0x00000480
1007#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
1008#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
1009#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
1010#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
1011#define MSR_IA32_VMX_MISC 0x00000485
1012#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
1013#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
1014#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
1015#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
1016#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
1017#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
1018#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
1019#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
1020#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
1021#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
1022#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
1023#define MSR_IA32_VMX_VMFUNC 0x00000491
1024
1025/* VMX_BASIC bits and bitmasks */
1026#define VMX_BASIC_VMCS_SIZE_SHIFT 32
1027#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
1028#define VMX_BASIC_64 0x0001000000000000LLU
1029#define VMX_BASIC_MEM_TYPE_SHIFT 50
1030#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
1031#define VMX_BASIC_MEM_TYPE_WB 6LLU
1032#define VMX_BASIC_INOUT 0x0040000000000000LLU
1033
1034/* MSR_IA32_VMX_MISC bits */
1035#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
1036#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
1037/* AMD-V MSRs */
1038
1039#define MSR_VM_CR 0xc0010114
1040#define MSR_VM_IGNNE 0xc0010115
1041#define MSR_VM_HSAVE_PA 0xc0010117
1042
1043#endif /* !SELFTEST_KVM_X86_H */
diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
new file mode 100644
index 000000000000..c9f5b7d4ce38
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -0,0 +1,87 @@
1/*
2 * tools/testing/selftests/kvm/lib/assert.c
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/
10
11#include "test_util.h"
12
13#include <execinfo.h>
14#include <sys/syscall.h>
15
16/* Dumps the current stack trace to stderr. */
17static void __attribute__((noinline)) test_dump_stack(void);
18static void test_dump_stack(void)
19{
20 /*
21 * Build and run this command:
22 *
23 * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \
24 * grep -v test_dump_stack | cat -n 1>&2
25 *
26 * Note that the spacing is different and there's no newline.
27 */
28 size_t i;
29 size_t n = 20;
30 void *stack[n];
31 const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai";
32 const char *pipeline = "|cat -n 1>&2";
33 char cmd[strlen(addr2line) + strlen(pipeline) +
34 /* N bytes per addr * 2 digits per byte + 1 space per addr: */
35 n * (((sizeof(void *)) * 2) + 1) +
36 /* Null terminator: */
37 1];
38 char *c;
39
40 n = backtrace(stack, n);
41 c = &cmd[0];
42 c += sprintf(c, "%s", addr2line);
43 /*
44 * Skip the first 3 frames: backtrace, test_dump_stack, and
45 * test_assert. We hope that backtrace isn't inlined and the other two
46 * we've declared noinline.
47 */
48 for (i = 2; i < n; i++)
49 c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1);
50 c += sprintf(c, "%s", pipeline);
51#pragma GCC diagnostic push
52#pragma GCC diagnostic ignored "-Wunused-result"
53 system(cmd);
54#pragma GCC diagnostic pop
55}
56
57static pid_t gettid(void)
58{
59 return syscall(SYS_gettid);
60}
61
62void __attribute__((noinline))
63test_assert(bool exp, const char *exp_str,
64 const char *file, unsigned int line, const char *fmt, ...)
65{
66 va_list ap;
67
68 if (!(exp)) {
69 va_start(ap, fmt);
70
71 fprintf(stderr, "==== Test Assertion Failure ====\n"
72 " %s:%u: %s\n"
73 " pid=%d tid=%d\n",
74 file, line, exp_str, getpid(), gettid());
75 test_dump_stack();
76 if (fmt) {
77 fputs(" ", stderr);
78 vfprintf(stderr, fmt, ap);
79 fputs("\n", stderr);
80 }
81 va_end(ap);
82
83 exit(254);
84 }
85
86 return;
87}
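In practice test_assert() is reached through a TEST_ASSERT() wrapper, used throughout the other lib files, that stringifies the expression and supplies __FILE__/__LINE__. A minimal sketch of that call pattern, assuming the wrapper comes from test_util.h and using an illustrative open() check:

#include <errno.h>
#include <fcntl.h>

#include "test_util.h"

int main(void)
{
	int fd = open("/dev/kvm", O_RDONLY);

	/* On failure this prints the expression, file:line, pid/tid and a
	 * numbered stack trace, then exits with status 254.
	 */
	TEST_ASSERT(fd >= 0, "open /dev/kvm failed, fd: %i errno: %i",
		    fd, errno);
	return 0;
}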
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
new file mode 100644
index 000000000000..5eb857584aa3
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -0,0 +1,197 @@
1/*
2 * tools/testing/selftests/kvm/lib/elf.c
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#include "test_util.h"
10
11#include <bits/endian.h>
12#include <linux/elf.h>
13
14#include "kvm_util.h"
15#include "kvm_util_internal.h"
16
17static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
18{
19 off_t offset_rv;
20
21 /* Open the ELF file. */
22 int fd;
23 fd = open(filename, O_RDONLY);
24 TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
25 " filename: %s\n"
26 " rv: %i errno: %i", filename, fd, errno);
27
28 /* Read in and validate ELF Identification Record.
29 * The ELF Identification record is the first 16 (EI_NIDENT) bytes
30 * of the ELF header, which is at the beginning of the ELF file.
31 * For now it is only safe to read the first EI_NIDENT bytes. Once
32 * read and validated, the value of e_ehsize can be used to determine
33 * the real size of the ELF header.
34 */
35 unsigned char ident[EI_NIDENT];
36 test_read(fd, ident, sizeof(ident));
37 TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1)
38 && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3),
39 "ELF MAGIC Mismatch,\n"
40 " filename: %s\n"
41 " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n"
42 " Expected: %02x %02x %02x %02x",
43 filename,
44 ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3],
45 ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3);
46 TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64,
47 "Current implementation only able to handle ELFCLASS64,\n"
48 " filename: %s\n"
49 " ident[EI_CLASS]: %02x\n"
50 " expected: %02x",
51 filename,
52 ident[EI_CLASS], ELFCLASS64);
53 TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN)
54 && (ident[EI_DATA] == ELFDATA2LSB))
55 || ((BYTE_ORDER == BIG_ENDIAN)
56 && (ident[EI_DATA] == ELFDATA2MSB)), "Current "
57 "implementation only able to handle\n"
58 "cases where the host and ELF file endianness\n"
59 "is the same:\n"
60 " host BYTE_ORDER: %u\n"
61 " host LITTLE_ENDIAN: %u\n"
62 " host BIG_ENDIAN: %u\n"
63 " ident[EI_DATA]: %u\n"
64 " ELFDATA2LSB: %u\n"
65 " ELFDATA2MSB: %u",
66 BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN,
67 ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB);
68 TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT,
69 "Current implementation only able to handle current "
70 "ELF version,\n"
71 " filename: %s\n"
72 " ident[EI_VERSION]: %02x\n"
73 " expected: %02x",
74 filename, ident[EI_VERSION], EV_CURRENT);
75
76 /* Read in the ELF header.
77 * With the ELF Identification portion of the ELF header
78 * validated, especially that the value at EI_VERSION is
79 * as expected, it is now safe to read the entire ELF header.
80 */
81 offset_rv = lseek(fd, 0, SEEK_SET);
82 TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n"
83 " rv: %zi expected: %i", offset_rv, 0);
84 test_read(fd, hdrp, sizeof(*hdrp));
85 TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr),
86 "Unexpected program header size,\n"
87 " hdrp->e_phentsize: %x\n"
88 " expected: %zx",
89 hdrp->e_phentsize, sizeof(Elf64_Phdr));
90 TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr),
91 "Unexpected section header size,\n"
92 " hdrp->e_shentsize: %x\n"
93 " expected: %zx",
94 hdrp->e_shentsize, sizeof(Elf64_Shdr));
95}
96
97/* VM ELF Load
98 *
99 * Input Args:
100 * filename - Path to ELF file
101 *
102 * Output Args: None
103 *
104 * Input/Output Args:
105 * vm - Pointer to opaque type that describes the VM.
106 *
107 * Return: None, TEST_ASSERT failures for all error conditions
108 *
109 * Loads the program image of the ELF file specified by filename,
110 * into the virtual address space of the VM pointed to by vm. On entry
111 * the VM needs to not be using any of the virtual address space used
112 * by the image and it needs to have sufficient available physical pages, to
113 * back the virtual pages used to load the image.
114 */
115void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
116 uint32_t data_memslot, uint32_t pgd_memslot)
117{
118 off_t offset, offset_rv;
119 Elf64_Ehdr hdr;
120
121 /* Open the ELF file. */
122 int fd;
123 fd = open(filename, O_RDONLY);
124 TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
125 " filename: %s\n"
126 " rv: %i errno: %i", filename, fd, errno);
127
128 /* Read in the ELF header. */
129 elfhdr_get(filename, &hdr);
130
131 /* For each program header.
132 * The following ELF header members specify the location
133 * and size of the program headers:
134 *
135 * e_phoff - File offset to start of program headers
136 * e_phentsize - Size of each program header
137 * e_phnum - Number of program header entries
138 */
139 for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) {
140 /* Seek to the beginning of the program header. */
141 offset = hdr.e_phoff + (n1 * hdr.e_phentsize);
142 offset_rv = lseek(fd, offset, SEEK_SET);
143 TEST_ASSERT(offset_rv == offset,
144 "Failed to seek to beginning of program header %u,\n"
145 " filename: %s\n"
146 " rv: %jd errno: %i",
147 n1, filename, (intmax_t) offset_rv, errno);
148
149 /* Read in the program header. */
150 Elf64_Phdr phdr;
151 test_read(fd, &phdr, sizeof(phdr));
152
153 /* Skip if this header doesn't describe a loadable segment. */
154 if (phdr.p_type != PT_LOAD)
155 continue;
156
157 /* Allocate memory for this segment within the VM. */
158 TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment "
159 "memsize of 0,\n"
160 " phdr index: %u p_memsz: 0x%" PRIx64,
161 n1, (uint64_t) phdr.p_memsz);
162 vm_vaddr_t seg_vstart = phdr.p_vaddr;
163 seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1);
164 vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1;
165 seg_vend |= vm->page_size - 1;
166 size_t seg_size = seg_vend - seg_vstart + 1;
167
168 vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart,
169 data_memslot, pgd_memslot);
170 TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
171 "virtual memory for segment at requested min addr,\n"
172 " segment idx: %u\n"
173 " seg_vstart: 0x%lx\n"
174 " vaddr: 0x%lx",
175 n1, seg_vstart, vaddr);
176 memset(addr_gva2hva(vm, vaddr), 0, seg_size);
177 /* TODO(lhuemill): Set permissions of each memory segment
178 * based on the least-significant 3 bits of phdr.p_flags.
179 */
180
181 /* Load portion of initial state that is contained within
182 * the ELF file.
183 */
184 if (phdr.p_filesz) {
185 offset_rv = lseek(fd, phdr.p_offset, SEEK_SET);
186 TEST_ASSERT(offset_rv == phdr.p_offset,
187 "Seek to program segment offset failed,\n"
188 " program header idx: %u errno: %i\n"
189 " offset_rv: 0x%jx\n"
190 " expected: 0x%jx\n",
191 n1, errno, (intmax_t) offset_rv,
192 (intmax_t) phdr.p_offset);
193 test_read(fd, addr_gva2hva(vm, phdr.p_vaddr),
194 phdr.p_filesz);
195 }
196 }
197}
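A typical caller creates the VM before loading the guest image into it. A minimal sketch using the kvm_util.c helpers added below, with an illustrative binary path and memslot 0 for both data pages and page tables:

#include <fcntl.h>

#include "test_util.h"
#include "kvm_util.h"

int main(void)
{
	struct kvm_vm *vm;

	vm = vm_create(VM_MODE_FLAT48PG, 512, O_RDWR);	/* 512 pages of guest memory */
	kvm_vm_elf_load(vm, "./guest_code.elf", 0, 0);	/* data_memslot = pgd_memslot = 0 */
	vm_vcpu_add(vm, 0);				/* VCPU 0, set up and ready to run */
	kvm_vm_free(vm);
	return 0;
}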
diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c
new file mode 100644
index 000000000000..cff869ffe6ee
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/io.c
@@ -0,0 +1,158 @@
1/*
2 * tools/testing/selftests/kvm/lib/io.c
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#include "test_util.h"
10
11/* Test Write
12 *
13 * A wrapper for write(2), that automatically handles the following
14 * special conditions:
15 *
16 * + Interrupted system call (EINTR)
17 * + Write of less than requested amount
18 * + Non-block return (EAGAIN)
19 *
20 * For each of the above, an additional write is performed to automatically
21 * continue writing the requested data.
22 * There are also many cases where write(2) can return an unexpected
23 * error (e.g. EIO). Such errors cause a TEST_ASSERT failure.
24 *
25 * Note, for function signature compatibility with write(2), this function
26 * returns the number of bytes written, but that value will always be equal
27 * to the number of requested bytes. All other conditions in this and
28 * future enhancements to this function either automatically issue another
29 * write(2) or cause a TEST_ASSERT failure.
30 *
31 * Args:
32 * fd - Opened file descriptor to file to be written.
33 * count - Number of bytes to write.
34 *
35 * Output:
36 * buf - Starting address of data to be written.
37 *
38 * Return:
39 * On success, number of bytes written.
40 * On failure, a TEST_ASSERT failure is caused.
41 */
42ssize_t test_write(int fd, const void *buf, size_t count)
43{
44 ssize_t rc;
45 ssize_t num_written = 0;
46 size_t num_left = count;
47 const char *ptr = buf;
48
49 /* Note: Count of zero is allowed (see "RETURN VALUE" portion of
50 * write(2) manpage for details).
51 */
52 TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
53
54 do {
55 rc = write(fd, ptr, num_left);
56
57 switch (rc) {
58 case -1:
59 TEST_ASSERT(errno == EAGAIN || errno == EINTR,
60 "Unexpected write failure,\n"
61 " rc: %zi errno: %i", rc, errno);
62 continue;
63
64 case 0:
65 TEST_ASSERT(false, "Unexpected EOF,\n"
66 " rc: %zi num_written: %zi num_left: %zu",
67 rc, num_written, num_left);
68 break;
69
70 default:
71 TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n"
72 " rc: %zi errno: %i", rc, errno);
73 num_written += rc;
74 num_left -= rc;
75 ptr += rc;
76 break;
77 }
78 } while (num_written < count);
79
80 return num_written;
81}
82
83/* Test Read
84 *
85 * A wrapper for read(2), that automatically handles the following
86 * special conditions:
87 *
88 * + Interrupted system call (EINTR)
89 * + Read of less than requested amount
90 * + Non-block return (EAGAIN)
91 *
92 * For each of the above, an additional read is performed to automatically
93 * continue reading the requested data.
94 * There are also many cases where read(2) can return an unexpected
95 * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note,
96 * it is expected that the file opened by fd at the current file position
97 * contains at least the number of requested bytes to be read. A TEST_ASSERT
98 * failure is produced if an End-Of-File condition occurs, before all the
99 * data is read. It is the caller's responsibility to ensure that sufficient
100 * data exists.
101 *
102 * Note, for function signature compatibility with read(2), this function
103 * returns the number of bytes read, but that value will always be equal
104 * to the number of requested bytes. All other conditions in this and
105 * future enhancements to this function either automatically issue another
106 * read(2) or cause a TEST_ASSERT failure.
107 *
108 * Args:
109 * fd - Opened file descriptor to file to be read.
110 * count - Number of bytes to read.
111 *
112 * Output:
113 * buf - Starting address of where to write the bytes read.
114 *
115 * Return:
116 * On success, number of bytes read.
117 * On failure, a TEST_ASSERT failure is caused.
118 */
119ssize_t test_read(int fd, void *buf, size_t count)
120{
121 ssize_t rc;
122 ssize_t num_read = 0;
123 size_t num_left = count;
124 char *ptr = buf;
125
126 /* Note: Count of zero is allowed (see "If count is zero" portion of
127 * read(2) manpage for details).
128 */
129 TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
130
131 do {
132 rc = read(fd, ptr, num_left);
133
134 switch (rc) {
135 case -1:
136 TEST_ASSERT(errno == EAGAIN || errno == EINTR,
137 "Unexpected read failure,\n"
138 " rc: %zi errno: %i", rc, errno);
139 break;
140
141 case 0:
142 TEST_ASSERT(false, "Unexpected EOF,\n"
143 " rc: %zi num_read: %zi num_left: %zu",
144 rc, num_read, num_left);
145 break;
146
147 default:
148 TEST_ASSERT(rc > 0, "Unexpected ret from read,\n"
149 " rc: %zi errno: %i", rc, errno);
150 num_read += rc;
151 num_left -= rc;
152 ptr += rc;
153 break;
154 }
155 } while (num_read < count);
156
157 return num_read;
158}
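Both wrappers either transfer the full byte count or fail a TEST_ASSERT, so callers never need their own retry loops. A minimal round-trip sketch with an illustrative temporary file name:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

#include "test_util.h"

int main(void)
{
	char out[] = "selftest payload";
	char in[sizeof(out)];
	int fd = open("io_test.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

	TEST_ASSERT(fd >= 0, "open failed, fd: %i errno: %i", fd, errno);
	test_write(fd, out, sizeof(out));	/* writes all bytes or asserts */
	lseek(fd, 0, SEEK_SET);
	test_read(fd, in, sizeof(in));		/* reads all bytes or asserts */
	close(fd);
	return 0;
}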
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
new file mode 100644
index 000000000000..7ca1bb40c498
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -0,0 +1,1480 @@
1/*
2 * tools/testing/selftests/kvm/lib/kvm_util.c
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#include "test_util.h"
10#include "kvm_util.h"
11#include "kvm_util_internal.h"
12
13#include <assert.h>
14#include <sys/mman.h>
15#include <sys/types.h>
16#include <sys/stat.h>
17
18#define KVM_DEV_PATH "/dev/kvm"
19
20#define KVM_UTIL_PGS_PER_HUGEPG 512
21#define KVM_UTIL_MIN_PADDR 0x2000
22
23/* Aligns x up to the next multiple of size. Size must be a power of 2. */
24static void *align(void *x, size_t size)
25{
26 size_t mask = size - 1;
27 TEST_ASSERT(size != 0 && !(size & (size - 1)),
28 "size not a power of 2: %lu", size);
29 return (void *) (((size_t) x + mask) & ~mask);
30}
31
32/* Capability
33 *
34 * Input Args:
35 * cap - Capability
36 *
37 * Output Args: None
38 *
39 * Return:
40 * On success, the Value corresponding to the capability (KVM_CAP_*)
41 * specified by the value of cap. On failure a TEST_ASSERT failure
42 * is produced.
43 *
44 * Looks up and returns the value corresponding to the capability
45 * (KVM_CAP_*) given by cap.
46 */
47int kvm_check_cap(long cap)
48{
49 int ret;
50 int kvm_fd;
51
52 kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
53 TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
54 KVM_DEV_PATH, kvm_fd, errno);
55
56 ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
57 TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n"
58 " rc: %i errno: %i", ret, errno);
59
60 close(kvm_fd);
61
62 return ret;
63}
64
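/*
 * A minimal sketch of how a test gates itself on a capability before
 * creating a VM (KVM_CAP_SYNC_REGS is only an illustrative choice):
 *
 *	if (!kvm_check_cap(KVM_CAP_SYNC_REGS)) {
 *		fprintf(stderr, "KVM_CAP_SYNC_REGS not available, skipping\n");
 *		exit(0);
 *	}
 */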
65/* VM Create
66 *
67 * Input Args:
68 * mode - VM Mode (e.g. VM_MODE_FLAT48PG)
69 * phy_pages - Physical memory pages
70 * perm - permission
71 *
72 * Output Args: None
73 *
74 * Return:
75 * Pointer to opaque structure that describes the created VM.
76 *
77 * Creates a VM with the mode specified by mode (e.g. VM_MODE_FLAT48PG).
78 * When phy_pages is non-zero, a memory region of phy_pages physical pages
79 * is created and mapped starting at guest physical address 0. The file
80 * descriptor to control the created VM is created with the permissions
81 * given by perm (e.g. O_RDWR).
82 */
83struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
84{
85 struct kvm_vm *vm;
86 int kvm_fd;
87
88 /* Allocate memory. */
89 vm = calloc(1, sizeof(*vm));
90 TEST_ASSERT(vm != NULL, "Insufficient Memory");
91
92 vm->mode = mode;
93 kvm_fd = open(KVM_DEV_PATH, perm);
94 TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
95 KVM_DEV_PATH, kvm_fd, errno);
96
97 /* Create VM. */
98 vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL);
99 TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
100 "rc: %i errno: %i", vm->fd, errno);
101
102 close(kvm_fd);
103
104 /* Setup mode specific traits. */
105 switch (vm->mode) {
106 case VM_MODE_FLAT48PG:
107 vm->page_size = 0x1000;
108 vm->page_shift = 12;
109
110 /* Limit to 48-bit canonical virtual addresses. */
111 vm->vpages_valid = sparsebit_alloc();
112 sparsebit_set_num(vm->vpages_valid,
113 0, (1ULL << (48 - 1)) >> vm->page_shift);
114 sparsebit_set_num(vm->vpages_valid,
115 (~((1ULL << (48 - 1)) - 1)) >> vm->page_shift,
116 (1ULL << (48 - 1)) >> vm->page_shift);
117
118 /* Limit physical addresses to 52-bits. */
119 vm->max_gfn = ((1ULL << 52) >> vm->page_shift) - 1;
120 break;
121
122 default:
123 TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
124 }
125
126 /* Allocate and setup memory for guest. */
127 vm->vpages_mapped = sparsebit_alloc();
128 if (phy_pages != 0)
129 vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
130 0, 0, phy_pages, 0);
131
132 return vm;
133}
134
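/*
 * A minimal lifecycle sketch for vm_create(), assuming a test that only
 * needs a small amount of guest memory (the page count is illustrative):
 *
 *	struct kvm_vm *vm;
 *
 *	vm = vm_create(VM_MODE_FLAT48PG, 64, O_RDWR);
 *	... add VCPUs, load guest code, run the test ...
 *	kvm_vm_free(vm);
 */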
135/* Userspace Memory Region Find
136 *
137 * Input Args:
138 * vm - Virtual Machine
139 * start - Starting VM physical address
140 * end - Ending VM physical address, inclusive.
141 *
142 * Output Args: None
143 *
144 * Return:
145 * Pointer to overlapping region, NULL if no such region.
146 *
147 * Searches for a region with any physical memory that overlaps with
148 * any portion of the guest physical addresses from start to end
149 * inclusive. If multiple overlapping regions exist, a pointer to any
150 * of the regions is returned. Null is returned only when no overlapping
151 * region exists.
152 */
153static struct userspace_mem_region *userspace_mem_region_find(
154 struct kvm_vm *vm, uint64_t start, uint64_t end)
155{
156 struct userspace_mem_region *region;
157
158 for (region = vm->userspace_mem_region_head; region;
159 region = region->next) {
160 uint64_t existing_start = region->region.guest_phys_addr;
161 uint64_t existing_end = region->region.guest_phys_addr
162 + region->region.memory_size - 1;
163 if (start <= existing_end && end >= existing_start)
164 return region;
165 }
166
167 return NULL;
168}
169
170/* KVM Userspace Memory Region Find
171 *
172 * Input Args:
173 * vm - Virtual Machine
174 * start - Starting VM physical address
175 * end - Ending VM physical address, inclusive.
176 *
177 * Output Args: None
178 *
179 * Return:
180 * Pointer to overlapping region, NULL if no such region.
181 *
182 * Public interface to userspace_mem_region_find. Allows tests to look up
183 * the memslot datastructure for a given range of guest physical memory.
184 */
185struct kvm_userspace_memory_region *
186kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
187 uint64_t end)
188{
189 struct userspace_mem_region *region;
190
191 region = userspace_mem_region_find(vm, start, end);
192 if (!region)
193 return NULL;
194
195 return &region->region;
196}
197
198/* VCPU Find
199 *
200 * Input Args:
201 * vm - Virtual Machine
202 * vcpuid - VCPU ID
203 *
204 * Output Args: None
205 *
206 * Return:
207 * Pointer to VCPU structure
208 *
209 * Locates a vcpu structure that describes the VCPU specified by vcpuid and
210 * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU
211 * for the specified vcpuid.
212 */
213struct vcpu *vcpu_find(struct kvm_vm *vm,
214 uint32_t vcpuid)
215{
216 struct vcpu *vcpup;
217
218 for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) {
219 if (vcpup->id == vcpuid)
220 return vcpup;
221 }
222
223 return NULL;
224}
225
226/* VM VCPU Remove
227 *
228 * Input Args:
229 * vm - Virtual Machine
230 * vcpuid - VCPU ID
231 *
232 * Output Args: None
233 *
234 * Return: None, TEST_ASSERT failures for all error conditions
235 *
236 * Within the VM specified by vm, removes the VCPU given by vcpuid.
237 */
238static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
239{
240 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
241
242 int ret = close(vcpu->fd);
243 TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
244 "errno: %i", ret, errno);
245
246 if (vcpu->next)
247 vcpu->next->prev = vcpu->prev;
248 if (vcpu->prev)
249 vcpu->prev->next = vcpu->next;
250 else
251 vm->vcpu_head = vcpu->next;
252 free(vcpu);
253}
254
255
256/* Destroys and frees the VM pointed to by vmp.
257 */
258void kvm_vm_free(struct kvm_vm *vmp)
259{
260 int ret;
261
262 if (vmp == NULL)
263 return;
264
265 /* Free userspace_mem_regions. */
266 while (vmp->userspace_mem_region_head) {
267 struct userspace_mem_region *region
268 = vmp->userspace_mem_region_head;
269
270 region->region.memory_size = 0;
271 ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION,
272 &region->region);
273 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
274 "rc: %i errno: %i", ret, errno);
275
276 vmp->userspace_mem_region_head = region->next;
277 sparsebit_free(&region->unused_phy_pages);
278 ret = munmap(region->mmap_start, region->mmap_size);
279 TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i",
280 ret, errno);
281
282 free(region);
283 }
284
285 /* Free VCPUs. */
286 while (vmp->vcpu_head)
287 vm_vcpu_rm(vmp, vmp->vcpu_head->id);
288
289 /* Free sparsebit arrays. */
290 sparsebit_free(&vmp->vpages_valid);
291 sparsebit_free(&vmp->vpages_mapped);
292
293 /* Close file descriptor for the VM. */
294 ret = close(vmp->fd);
295 TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
296 " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
297
298 /* Free the structure describing the VM. */
299 free(vmp);
300}
301
302/* Memory Compare, host virtual to guest virtual
303 *
304 * Input Args:
305 * hva - Starting host virtual address
306 * vm - Virtual Machine
307 * gva - Starting guest virtual address
308 * len - number of bytes to compare
309 *
310 * Output Args: None
311 *
312 * Input/Output Args: None
313 *
314 * Return:
315 * Returns 0 if the bytes starting at hva for a length of len
316 * are equal to the guest virtual bytes starting at gva. Returns
317 * a value < 0, if bytes at hva are less than those at gva.
318 * Otherwise a value > 0 is returned.
319 *
320 * Compares the bytes starting at the host virtual address hva, for
321 * a length of len, to the guest bytes starting at the guest virtual
322 * address given by gva.
323 */
324int kvm_memcmp_hva_gva(void *hva,
325 struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
326{
327 size_t amt;
328
329 /* Compare a batch of bytes until either a mismatch is found
330 * or all the bytes have been compared.
331 */
332 for (uintptr_t offset = 0; offset < len; offset += amt) {
333 uintptr_t ptr1 = (uintptr_t)hva + offset;
334
335 /* Determine host address for guest virtual address
336 * at offset.
337 */
338 uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
339
340 /* Determine amount to compare on this pass.
341 * Don't allow the comparison to cross a page boundary.
342 */
343 amt = len - offset;
344 if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
345 amt = vm->page_size - (ptr1 % vm->page_size);
346 if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
347 amt = vm->page_size - (ptr2 % vm->page_size);
348
349 assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
350 assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
351
352 /* Perform the comparison. If there is a difference
353 * return that result to the caller, otherwise need
354 * to continue on looking for a mismatch.
355 */
356 int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
357 if (ret != 0)
358 return ret;
359 }
360
361 /* No mismatch found. Let the caller know the two memory
362 * areas are equal.
363 */
364 return 0;
365}
366
367/* Allocate an instance of struct kvm_cpuid2
368 *
369 * Input Args: None
370 *
371 * Output Args: None
372 *
373 * Return: A pointer to the allocated struct. The caller is responsible
374 * for freeing this struct.
375 *
376 * Since kvm_cpuid2 uses a 0-length array to allow the size of the
377 * array to be decided at allocation time, allocation is slightly
378 * complicated. This function uses a reasonable default length for
379 * the array and performs the appropriate allocation.
380 */
381struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
382{
383 struct kvm_cpuid2 *cpuid;
384 int nent = 100;
385 size_t size;
386
387 size = sizeof(*cpuid);
388 size += nent * sizeof(struct kvm_cpuid_entry2);
389 cpuid = malloc(size);
390 if (!cpuid) {
391 perror("malloc");
392 abort();
393 }
394
395 cpuid->nent = nent;
396
397 return cpuid;
398}
399
400/* KVM Supported CPUID Get
401 *
402 * Input Args: None
403 *
404 * Output Args:
405 * cpuid - The supported KVM CPUID
406 *
407 * Return: void
408 *
409 * Get the guest CPUID supported by KVM.
410 */
411void kvm_get_supported_cpuid(struct kvm_cpuid2 *cpuid)
412{
413 int ret;
414 int kvm_fd;
415
416 kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
417 TEST_ASSERT(kvm_fd >= 0, "open %s failed, rc: %i errno: %i",
418 KVM_DEV_PATH, kvm_fd, errno);
419
420 ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
421 TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
422 ret, errno);
423
424 close(kvm_fd);
425}
426
427/* Locate a cpuid entry.
428 *
429 * Input Args:
430 * cpuid: The cpuid.
431 * function: The function of the cpuid entry to find.
432 *
433 * Output Args: None
434 *
435 * Return: A pointer to the cpuid entry. Never returns NULL.
436 */
437struct kvm_cpuid_entry2 *
438find_cpuid_index_entry(struct kvm_cpuid2 *cpuid, uint32_t function,
439 uint32_t index)
440{
441 struct kvm_cpuid_entry2 *entry = NULL;
442 int i;
443
444 for (i = 0; i < cpuid->nent; i++) {
445 if (cpuid->entries[i].function == function &&
446 cpuid->entries[i].index == index) {
447 entry = &cpuid->entries[i];
448 break;
449 }
450 }
451
452 TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
453 function, index);
454 return entry;
455}
456
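/*
 * The CPUID helpers above are normally chained; a minimal sketch that
 * looks up leaf 0x1 (the function/index values are illustrative):
 *
 *	struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2();
 *	struct kvm_cpuid_entry2 *entry;
 *
 *	kvm_get_supported_cpuid(cpuid);
 *	entry = find_cpuid_index_entry(cpuid, 0x1, 0);
 *	... inspect or adjust entry->eax/ebx/ecx/edx ...
 *	free(cpuid);
 */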
457/* VM Userspace Memory Region Add
458 *
459 * Input Args:
460 * vm - Virtual Machine
461 * src_type - Storage source for this region
462 * (e.g. VM_MEM_SRC_ANONYMOUS)
463 * guest_paddr - Starting guest physical address
464 * slot - KVM region slot
465 * npages - Number of physical pages
466 * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
467 *
468 * Output Args: None
469 *
470 * Return: None
471 *
472 * Allocates a memory area of the number of pages specified by npages
473 * and maps it to the VM specified by vm, at a starting physical address
474 * given by guest_paddr. The region is created with a KVM region slot
475 * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The
476 * region is created with the flags given by flags.
477 */
478void vm_userspace_mem_region_add(struct kvm_vm *vm,
479 enum vm_mem_backing_src_type src_type,
480 uint64_t guest_paddr, uint32_t slot, uint64_t npages,
481 uint32_t flags)
482{
483 int ret;
484 unsigned long pmem_size = 0;
485 struct userspace_mem_region *region;
486 size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
487
488 TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
489 "address not on a page boundary.\n"
490 " guest_paddr: 0x%lx vm->page_size: 0x%x",
491 guest_paddr, vm->page_size);
492 TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
493 <= vm->max_gfn, "Physical range beyond maximum "
494 "supported physical address,\n"
495 " guest_paddr: 0x%lx npages: 0x%lx\n"
496 " vm->max_gfn: 0x%lx vm->page_size: 0x%x",
497 guest_paddr, npages, vm->max_gfn, vm->page_size);
498
499 /* Confirm a mem region with an overlapping address doesn't
500 * already exist.
501 */
502 region = (struct userspace_mem_region *) userspace_mem_region_find(
503 vm, guest_paddr, guest_paddr + npages * vm->page_size);
504 if (region != NULL)
505 TEST_ASSERT(false, "overlapping userspace_mem_region already "
506 "exists\n"
507 " requested guest_paddr: 0x%lx npages: 0x%lx "
508 "page_size: 0x%x\n"
509 " existing guest_paddr: 0x%lx size: 0x%lx",
510 guest_paddr, npages, vm->page_size,
511 (uint64_t) region->region.guest_phys_addr,
512 (uint64_t) region->region.memory_size);
513
514 /* Confirm no region with the requested slot already exists. */
515 for (region = vm->userspace_mem_region_head; region;
516 region = region->next) {
517 if (region->region.slot == slot)
518 break;
519 if ((guest_paddr <= (region->region.guest_phys_addr
520 + region->region.memory_size))
521 && ((guest_paddr + npages * vm->page_size)
522 >= region->region.guest_phys_addr))
523 break;
524 }
525 if (region != NULL)
526 TEST_ASSERT(false, "A mem region with the requested slot "
527 "or overlapping physical memory range already exists.\n"
528 " requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
529 " existing slot: %u paddr: 0x%lx size: 0x%lx",
530 slot, guest_paddr, npages,
531 region->region.slot,
532 (uint64_t) region->region.guest_phys_addr,
533 (uint64_t) region->region.memory_size);
534
535 /* Allocate and initialize new mem region structure. */
536 region = calloc(1, sizeof(*region));
537 TEST_ASSERT(region != NULL, "Insufficient Memory");
538 region->mmap_size = npages * vm->page_size;
539
540 /* Enough memory to align up to a huge page. */
541 if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
542 region->mmap_size += huge_page_size;
543 region->mmap_start = mmap(NULL, region->mmap_size,
544 PROT_READ | PROT_WRITE,
545 MAP_PRIVATE | MAP_ANONYMOUS
546 | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
547 -1, 0);
548 TEST_ASSERT(region->mmap_start != MAP_FAILED,
549 "test_malloc failed, mmap_start: %p errno: %i",
550 region->mmap_start, errno);
551
552 /* Align THP allocation up to start of a huge page. */
553 region->host_mem = align(region->mmap_start,
554 src_type == VM_MEM_SRC_ANONYMOUS_THP ? huge_page_size : 1);
555
556 /* As needed perform madvise */
557 if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
558 ret = madvise(region->host_mem, npages * vm->page_size,
559 src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
560 TEST_ASSERT(ret == 0, "madvise failed,\n"
561 " addr: %p\n"
562 " length: 0x%lx\n"
563 " src_type: %x",
564 region->host_mem, npages * vm->page_size, src_type);
565 }
566
567 region->unused_phy_pages = sparsebit_alloc();
568 sparsebit_set_num(region->unused_phy_pages,
569 guest_paddr >> vm->page_shift, npages);
570 region->region.slot = slot;
571 region->region.flags = flags;
572 region->region.guest_phys_addr = guest_paddr;
573 region->region.memory_size = npages * vm->page_size;
574 region->region.userspace_addr = (uintptr_t) region->host_mem;
575 ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
576 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
577 " rc: %i errno: %i\n"
578 " slot: %u flags: 0x%x\n"
579 " guest_phys_addr: 0x%lx size: 0x%lx",
580 ret, errno, slot, flags,
581 guest_paddr, (uint64_t) region->region.memory_size);
582
583 /* Add to linked-list of memory regions. */
584 if (vm->userspace_mem_region_head)
585 vm->userspace_mem_region_head->prev = region;
586 region->next = vm->userspace_mem_region_head;
587 vm->userspace_mem_region_head = region;
588}
589
590/* Memslot to region
591 *
592 * Input Args:
593 * vm - Virtual Machine
594 * memslot - KVM memory slot ID
595 *
596 * Output Args: None
597 *
598 * Return:
599 * Pointer to the memory region structure that describes the memory
600 * region using the kvm memory slot ID given by memslot. TEST_ASSERT failure
601 * on error (e.g. currently no memory region using memslot as a KVM
602 * memory slot ID).
603 */
604static struct userspace_mem_region *memslot2region(struct kvm_vm *vm,
605 uint32_t memslot)
606{
607 struct userspace_mem_region *region;
608
609 for (region = vm->userspace_mem_region_head; region;
610 region = region->next) {
611 if (region->region.slot == memslot)
612 break;
613 }
614 if (region == NULL) {
615 fprintf(stderr, "No mem region with the requested slot found,\n"
616 " requested slot: %u\n", memslot);
617 fputs("---- vm dump ----\n", stderr);
618 vm_dump(stderr, vm, 2);
619 TEST_ASSERT(false, "Mem region not found");
620 }
621
622 return region;
623}
624
625/* VM Memory Region Flags Set
626 *
627 * Input Args:
628 * vm - Virtual Machine
629 * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
630 *
631 * Output Args: None
632 *
633 * Return: None
634 *
635 * Sets the flags of the memory region specified by the value of slot,
636 * to the values given by flags.
637 */
638void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
639{
640 int ret;
641 struct userspace_mem_region *region;
642
643 /* Locate memory region. */
644 region = memslot2region(vm, slot);
645
646 region->region.flags = flags;
647
648 ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
649
650 TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
651 " rc: %i errno: %i slot: %u flags: 0x%x",
652 ret, errno, slot, flags);
653}
654
655/* VCPU mmap Size
656 *
657 * Input Args: None
658 *
659 * Output Args: None
660 *
661 * Return:
662 * Size of VCPU state
663 *
664 * Returns the size of the structure pointed to by the return value
665 * of vcpu_state().
666 */
667static int vcpu_mmap_sz(void)
668{
669 int dev_fd, ret;
670
671 dev_fd = open(KVM_DEV_PATH, O_RDONLY);
672 TEST_ASSERT(dev_fd >= 0, "%s open %s failed, rc: %i errno: %i",
673 __func__, KVM_DEV_PATH, dev_fd, errno);
674
675 ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
676 TEST_ASSERT(ret >= sizeof(struct kvm_run),
677 "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i",
678 __func__, ret, errno);
679
680 close(dev_fd);
681
682 return ret;
683}
684
685/* VM VCPU Add
686 *
687 * Input Args:
688 * vm - Virtual Machine
689 * vcpuid - VCPU ID
690 *
691 * Output Args: None
692 *
693 * Return: None
694 *
695 * Creates and adds to the VM specified by vm a virtual CPU with
696 * the ID given by vcpuid.
697 */
698void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
699{
700 struct vcpu *vcpu;
701
702 /* Confirm a vcpu with the specified id doesn't already exist. */
703 vcpu = vcpu_find(vm, vcpuid);
704 if (vcpu != NULL)
705 TEST_ASSERT(false, "vcpu with the specified id "
706 "already exists,\n"
707 " requested vcpuid: %u\n"
708 " existing vcpuid: %u state: %p",
709 vcpuid, vcpu->id, vcpu->state);
710
711 /* Allocate and initialize new vcpu structure. */
712 vcpu = calloc(1, sizeof(*vcpu));
713 TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
714 vcpu->id = vcpuid;
715 vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid);
716 TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i",
717 vcpu->fd, errno);
718
719 TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size "
720 "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
721 vcpu_mmap_sz(), sizeof(*vcpu->state));
722 vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state),
723 PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
724 TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, "
725 "vcpu id: %u errno: %i", vcpuid, errno);
726
727 /* Add to linked-list of VCPUs. */
728 if (vm->vcpu_head)
729 vm->vcpu_head->prev = vcpu;
730 vcpu->next = vm->vcpu_head;
731 vm->vcpu_head = vcpu;
732
733 vcpu_setup(vm, vcpuid);
734}
735
736/* VM Virtual Address Unused Gap
737 *
738 * Input Args:
739 * vm - Virtual Machine
740 * sz - Size (bytes)
741 * vaddr_min - Minimum Virtual Address
742 *
743 * Output Args: None
744 *
745 * Return:
746 * Lowest virtual address at or above vaddr_min, with at least
747 * sz unused bytes. TEST_ASSERT failure if no area of at least
748 * size sz is available.
749 *
750 * Within the VM specified by vm, locates the lowest starting virtual
751 * address >= vaddr_min that has at least sz unallocated bytes. A
752 * TEST_ASSERT failure occurs for invalid input or if no area of at
753 * least sz unallocated bytes >= vaddr_min is available.
754 */
755static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
756 vm_vaddr_t vaddr_min)
757{
758 uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
759
760 /* Determine lowest permitted virtual page index. */
761 uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
762 if ((pgidx_start * vm->page_size) < vaddr_min)
763 goto no_va_found;
764
765 /* Loop over section with enough valid virtual page indexes. */
766 if (!sparsebit_is_set_num(vm->vpages_valid,
767 pgidx_start, pages))
768 pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
769 pgidx_start, pages);
770 do {
771 /*
772 * Are there enough unused virtual pages available at
773 * the currently proposed starting virtual page index.
774 * If not, adjust proposed starting index to next
775 * possible.
776 */
777 if (sparsebit_is_clear_num(vm->vpages_mapped,
778 pgidx_start, pages))
779 goto va_found;
780 pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
781 pgidx_start, pages);
782 if (pgidx_start == 0)
783 goto no_va_found;
784
785 /*
786 * If needed, adjust proposed starting virtual address,
787 * to next range of valid virtual addresses.
788 */
789 if (!sparsebit_is_set_num(vm->vpages_valid,
790 pgidx_start, pages)) {
791 pgidx_start = sparsebit_next_set_num(
792 vm->vpages_valid, pgidx_start, pages);
793 if (pgidx_start == 0)
794 goto no_va_found;
795 }
796 } while (pgidx_start != 0);
797
798no_va_found:
799 TEST_ASSERT(false, "No vaddr of specified pages available, "
800 "pages: 0x%lx", pages);
801
802 /* NOT REACHED */
803 return -1;
804
805va_found:
806 TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
807 pgidx_start, pages),
808 "Unexpected, invalid virtual page index range,\n"
809 " pgidx_start: 0x%lx\n"
810 " pages: 0x%lx",
811 pgidx_start, pages);
812 TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
813 pgidx_start, pages),
814 "Unexpected, pages already mapped,\n"
815 " pgidx_start: 0x%lx\n"
816 " pages: 0x%lx",
817 pgidx_start, pages);
818
819 return pgidx_start * vm->page_size;
820}
821
822/* VM Virtual Address Allocate
823 *
824 * Input Args:
825 * vm - Virtual Machine
826 * sz - Size in bytes
827 * vaddr_min - Minimum starting virtual address
828 * data_memslot - Memory region slot for data pages
829 * pgd_memslot - Memory region slot for new virtual translation tables
830 *
831 * Output Args: None
832 *
833 * Return:
834 * Starting guest virtual address
835 *
836 * Allocates at least sz bytes within the virtual address space of the vm
837 * given by vm. The allocated bytes are mapped to a virtual address >=
838 * the address given by vaddr_min.  Note that each allocation uses
839 * a unique set of pages, with the minimum real allocation being at least
840 * a page.
841 */
842vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
843 uint32_t data_memslot, uint32_t pgd_memslot)
844{
845 uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
846
847 virt_pgd_alloc(vm, pgd_memslot);
848
849	/* Find an unused range of virtual page addresses that is at
850	 * least "pages" pages in length.
851	 */
852 vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
853
854 /* Map the virtual pages. */
855 for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
856 pages--, vaddr += vm->page_size) {
857 vm_paddr_t paddr;
858
859 paddr = vm_phy_page_alloc(vm, KVM_UTIL_MIN_PADDR, data_memslot);
860
861 virt_pg_map(vm, vaddr, paddr, pgd_memslot);
862
863 sparsebit_set(vm->vpages_mapped,
864 vaddr >> vm->page_shift);
865 }
866
867 return vaddr_start;
868}
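
A minimal usage sketch of vm_vaddr_alloc() as a caller might write it; the 0x2000-byte size, the 0x10000 minimum guest virtual address, and memslot 0 are illustrative assumptions rather than values required by the library:

#include <string.h>

#include "kvm_util.h"

/* Allocate and zero a guest-visible scratch buffer (illustrative values). */
static vm_vaddr_t alloc_scratch(struct kvm_vm *vm)
{
	/* At least 0x2000 bytes, mapped no lower than GVA 0x10000; data
	 * pages and any new page-table pages both come from memslot 0.
	 */
	vm_vaddr_t gva = vm_vaddr_alloc(vm, 0x2000, 0x10000, 0, 0);

	/* Writes through the host alias are immediately visible to the guest. */
	memset(addr_gva2hva(vm, gva), 0, 0x2000);

	return gva;
}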
869
870/* Address VM Physical to Host Virtual
871 *
872 * Input Args:
873 * vm - Virtual Machine
874 * gpa - VM physical address
875 *
876 * Output Args: None
877 *
878 * Return:
879 * Equivalent host virtual address
880 *
881 * Locates the memory region containing the VM physical address given
882 * by gpa, within the VM given by vm. When found, the host virtual
883 * address providing the memory to the vm physical address is returned.
884 * A TEST_ASSERT failure occurs if no region containing gpa exists.
885 */
886void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
887{
888 struct userspace_mem_region *region;
889 for (region = vm->userspace_mem_region_head; region;
890 region = region->next) {
891 if ((gpa >= region->region.guest_phys_addr)
892 && (gpa <= (region->region.guest_phys_addr
893 + region->region.memory_size - 1)))
894 return (void *) ((uintptr_t) region->host_mem
895 + (gpa - region->region.guest_phys_addr));
896 }
897
898 TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa);
899 return NULL;
900}
901
902/* Address Host Virtual to VM Physical
903 *
904 * Input Args:
905 * vm - Virtual Machine
906 * hva - Host virtual address
907 *
908 * Output Args: None
909 *
910 * Return:
911 * Equivalent VM physical address
912 *
913 * Locates the memory region containing the host virtual address given
914 * by hva, within the VM given by vm. When found, the equivalent
915 * VM physical address is returned. A TEST_ASSERT failure occurs if no
916 * region containing hva exists.
917 */
918vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
919{
920 struct userspace_mem_region *region;
921 for (region = vm->userspace_mem_region_head; region;
922 region = region->next) {
923 if ((hva >= region->host_mem)
924 && (hva <= (region->host_mem
925 + region->region.memory_size - 1)))
926 return (vm_paddr_t) ((uintptr_t)
927 region->region.guest_phys_addr
928 + (hva - (uintptr_t) region->host_mem));
929 }
930
931 TEST_ASSERT(false, "No mapping to a guest physical address, "
932 "hva: %p", hva);
933 return -1;
934}
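
The two translation helpers above are inverses over any registered memory region. A short round-trip sketch; the 1 MiB guest physical address is assumed to fall inside a region the test has already added:

#include <stdint.h>

#include "kvm_util.h"
#include "test_util.h"

/* Write one byte of guest memory from the host, then recover its gpa. */
static void touch_guest_byte(struct kvm_vm *vm)
{
	vm_paddr_t gpa = 0x100000;	/* assumed to lie in a registered region */
	uint8_t *hva = addr_gpa2hva(vm, gpa);

	hva[0] = 0xaa;			/* the guest sees this store */

	TEST_ASSERT(addr_hva2gpa(vm, hva) == gpa,
		    "gpa -> hva -> gpa translation did not round-trip");
}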
935
936/* VM Create IRQ Chip
937 *
938 * Input Args:
939 * vm - Virtual Machine
940 *
941 * Output Args: None
942 *
943 * Return: None
944 *
945 * Creates an interrupt controller chip for the VM specified by vm.
946 */
947void vm_create_irqchip(struct kvm_vm *vm)
948{
949 int ret;
950
951 ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0);
952 TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, "
953 "rc: %i errno: %i", ret, errno);
954}
955
956/* VM VCPU State
957 *
958 * Input Args:
959 * vm - Virtual Machine
960 * vcpuid - VCPU ID
961 *
962 * Output Args: None
963 *
964 * Return:
965 * Pointer to structure that describes the state of the VCPU.
966 *
967 * Locates and returns a pointer to a structure that describes the
968 * state of the VCPU with the given vcpuid.
969 */
970struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
971{
972 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
973 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
974
975 return vcpu->state;
976}
977
978/* VM VCPU Run
979 *
980 * Input Args:
981 * vm - Virtual Machine
982 * vcpuid - VCPU ID
983 *
984 * Output Args: None
985 *
986 * Return: None
987 *
988 * Switch to executing the code for the VCPU given by vcpuid, within the VM
989 * given by vm.
990 */
991void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
992{
993 int ret = _vcpu_run(vm, vcpuid);
994 TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
995 "rc: %i errno: %i", ret, errno);
996}
997
998int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
999{
1000 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1001 int rc;
1002
1003 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1004 do {
1005 rc = ioctl(vcpu->fd, KVM_RUN, NULL);
1006 } while (rc == -1 && errno == EINTR);
1007 return rc;
1008}
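
vcpu_run() aborts the test if KVM_RUN itself fails, so callers typically follow it by inspecting the mmap'ed kvm_run structure returned by vcpu_state(). A sketch of that pattern; VCPU_ID and the synchronization port are test-local conventions assumed for the example:

#include <stdint.h>
#include <stdio.h>

#include "kvm_util.h"
#include "test_util.h"

#define VCPU_ID 5		/* assumed vcpu id for this sketch */
#define SYNC_PORT 0x80		/* assumed guest/host synchronization port */

static void handle_guest_port_write(struct kvm_vm *vm)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	vcpu_run(vm, VCPU_ID);	/* aborts the test if KVM_RUN fails */

	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO &&
		    run->io.direction == KVM_EXIT_IO_OUT &&
		    run->io.port == SYNC_PORT,
		    "Expected an OUT to the sync port, got exit reason %u",
		    run->exit_reason);

	/* First byte the guest wrote in this batch of I/O. */
	printf("guest reported: 0x%x\n",
	       *((uint8_t *)run + run->io.data_offset));
}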
1009
1010/* VM VCPU Set MP State
1011 *
1012 * Input Args:
1013 * vm - Virtual Machine
1014 * vcpuid - VCPU ID
1015 * mp_state - mp_state to be set
1016 *
1017 * Output Args: None
1018 *
1019 * Return: None
1020 *
1021 * Sets the MP state of the VCPU given by vcpuid, to the state given
1022 * by mp_state.
1023 */
1024void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
1025 struct kvm_mp_state *mp_state)
1026{
1027 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1028 int ret;
1029
1030 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1031
1032 ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
1033 TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, "
1034 "rc: %i errno: %i", ret, errno);
1035}
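
A one-line usage sketch for the helper above; KVM_MP_STATE_RUNNABLE comes from <linux/kvm.h>, and VCPU_ID is an assumed test constant:

#include "kvm_util.h"

#define VCPU_ID 5	/* assumed vcpu id for this sketch */

static void make_runnable(struct kvm_vm *vm)
{
	struct kvm_mp_state mp_state = {
		.mp_state = KVM_MP_STATE_RUNNABLE,
	};

	vcpu_set_mp_state(vm, VCPU_ID, &mp_state);	/* aborts on failure */
}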
1036
1037/* VM VCPU Regs Get
1038 *
1039 * Input Args:
1040 * vm - Virtual Machine
1041 * vcpuid - VCPU ID
1042 *
1043 * Output Args:
1044 * regs - current state of VCPU regs
1045 *
1046 * Return: None
1047 *
1048 * Obtains the current register state for the VCPU specified by vcpuid
1049 * and stores it at the location given by regs.
1050 */
1051void vcpu_regs_get(struct kvm_vm *vm,
1052 uint32_t vcpuid, struct kvm_regs *regs)
1053{
1054 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1055 int ret;
1056
1057 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1058
1059 /* Get the regs. */
1060 ret = ioctl(vcpu->fd, KVM_GET_REGS, regs);
1061 TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i",
1062 ret, errno);
1063}
1064
1065/* VM VCPU Regs Set
1066 *
1067 * Input Args:
1068 * vm - Virtual Machine
1069 * vcpuid - VCPU ID
1070 * regs - Values to set VCPU regs to
1071 *
1072 * Output Args: None
1073 *
1074 * Return: None
1075 *
1076 * Sets the regs of the VCPU specified by vcpuid to the values
1077 * given by regs.
1078 */
1079void vcpu_regs_set(struct kvm_vm *vm,
1080 uint32_t vcpuid, struct kvm_regs *regs)
1081{
1082 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1083 int ret;
1084
1085 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1086
1087 /* Set the regs. */
1088 ret = ioctl(vcpu->fd, KVM_SET_REGS, regs);
1089 TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i",
1090 ret, errno);
1091}
1092
1093void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
1094 struct kvm_vcpu_events *events)
1095{
1096 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1097 int ret;
1098
1099 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1100
1101	/* Get the vcpu events. */
1102 ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events);
1103 TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i",
1104 ret, errno);
1105}
1106
1107void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
1108 struct kvm_vcpu_events *events)
1109{
1110 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1111 int ret;
1112
1113 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1114
1115	/* Set the vcpu events. */
1116 ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events);
1117 TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i",
1118 ret, errno);
1119}
1120
1121/* VM VCPU Args Set
1122 *
1123 * Input Args:
1124 * vm - Virtual Machine
1125 * vcpuid - VCPU ID
1126 * num - number of arguments
1127 * ... - arguments, each of type uint64_t
1128 *
1129 * Output Args: None
1130 *
1131 * Return: None
1132 *
1133 * Sets the first num function input arguments to the values
1134 * given as variable args. Each of the variable args is expected to
1135 * be of type uint64_t.
1136 */
1137void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
1138{
1139 va_list ap;
1140 struct kvm_regs regs;
1141
1142 TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
1143 " num: %u\n",
1144 num);
1145
1146 va_start(ap, num);
1147 vcpu_regs_get(vm, vcpuid, &regs);
1148
1149 if (num >= 1)
1150 regs.rdi = va_arg(ap, uint64_t);
1151
1152 if (num >= 2)
1153 regs.rsi = va_arg(ap, uint64_t);
1154
1155 if (num >= 3)
1156 regs.rdx = va_arg(ap, uint64_t);
1157
1158 if (num >= 4)
1159 regs.rcx = va_arg(ap, uint64_t);
1160
1161 if (num >= 5)
1162 regs.r8 = va_arg(ap, uint64_t);
1163
1164 if (num >= 6)
1165 regs.r9 = va_arg(ap, uint64_t);
1166
1167 vcpu_regs_set(vm, vcpuid, &regs);
1168 va_end(ap);
1169}
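
vcpu_args_set() fills the registers used by the x86-64 SysV calling convention (RDI, RSI, RDX, RCX, R8, R9), so a guest entry point can receive the values as ordinary C parameters. A sketch of the host/guest pairing; guest_code and the argument values are assumptions for the example:

#include <stdint.h>

#include "kvm_util.h"

#define VCPU_ID 5	/* assumed vcpu id for this sketch */

/* Guest side: the two values arrive in RDI and RSI per the convention above. */
static void guest_code(uint64_t token, uint64_t iterations)
{
	(void)token;
	(void)iterations;
	/* ... guest work ... */
}

/* Host side: hand the guest its arguments before the first KVM_RUN. */
static void setup_guest_args(struct kvm_vm *vm)
{
	vcpu_args_set(vm, VCPU_ID, 2, 0xdeadbeefull, 100ull);
}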
1170
1171/* VM VCPU System Regs Get
1172 *
1173 * Input Args:
1174 * vm - Virtual Machine
1175 * vcpuid - VCPU ID
1176 *
1177 * Output Args:
1178 * sregs - current state of VCPU system regs
1179 *
1180 * Return: None
1181 *
1182 * Obtains the current system register state for the VCPU specified by
1183 * vcpuid and stores it at the location given by sregs.
1184 */
1185void vcpu_sregs_get(struct kvm_vm *vm,
1186 uint32_t vcpuid, struct kvm_sregs *sregs)
1187{
1188 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1189 int ret;
1190
1191 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1192
1193	/* Get the sregs. */
1195 ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
1196 TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i",
1197 ret, errno);
1198}
1199
1200/* VM VCPU System Regs Set
1201 *
1202 * Input Args:
1203 * vm - Virtual Machine
1204 * vcpuid - VCPU ID
1205 * sregs - Values to set VCPU system regs to
1206 *
1207 * Output Args: None
1208 *
1209 * Return: None
1210 *
1211 * Sets the system regs of the VCPU specified by vcpuid to the values
1212 * given by sregs.
1213 */
1214void vcpu_sregs_set(struct kvm_vm *vm,
1215 uint32_t vcpuid, struct kvm_sregs *sregs)
1216{
1217 int ret = _vcpu_sregs_set(vm, vcpuid, sregs);
1218	TEST_ASSERT(ret == 0, "KVM_SET_SREGS IOCTL failed, "
1219 "rc: %i errno: %i", ret, errno);
1220}
1221
1222int _vcpu_sregs_set(struct kvm_vm *vm,
1223 uint32_t vcpuid, struct kvm_sregs *sregs)
1224{
1225 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1226 int ret;
1227
1228 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1229
1230	/* Set the sregs. */
1231 return ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
1232}
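
A common read-modify-write sketch built on the system-register helpers above (x86 kvm_sregs shown); the asserting vcpu_sregs_set() is used here, leaving _vcpu_sregs_set() for negative tests that expect KVM_SET_SREGS to fail. The CR4 bit chosen is purely hypothetical:

#include "kvm_util.h"

#define VCPU_ID		5		/* assumed vcpu id for this sketch */
#define CR4_TEST_BIT	(1ul << 3)	/* hypothetical CR4 bit a test wants set */

static void set_cr4_test_bit(struct kvm_vm *vm)
{
	struct kvm_sregs sregs;

	vcpu_sregs_get(vm, VCPU_ID, &sregs);
	sregs.cr4 |= CR4_TEST_BIT;
	vcpu_sregs_set(vm, VCPU_ID, &sregs);	/* aborts if KVM_SET_SREGS fails */
}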
1233
1234/* VCPU Ioctl
1235 *
1236 * Input Args:
1237 * vm - Virtual Machine
1238 * vcpuid - VCPU ID
1239 * cmd - Ioctl number
1240 * arg - Argument to pass to the ioctl
1241 *
1242 * Return: None
1243 *
1244 * Issues an arbitrary ioctl on a VCPU fd.
1245 */
1246void vcpu_ioctl(struct kvm_vm *vm,
1247 uint32_t vcpuid, unsigned long cmd, void *arg)
1248{
1249 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1250 int ret;
1251
1252 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
1253
1254 ret = ioctl(vcpu->fd, cmd, arg);
1255 TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)",
1256 cmd, ret, errno, strerror(errno));
1257}
1258
1259/* VM Ioctl
1260 *
1261 * Input Args:
1262 * vm - Virtual Machine
1263 * cmd - Ioctl number
1264 * arg - Argument to pass to the ioctl
1265 *
1266 * Return: None
1267 *
1268 * Issues an arbitrary ioctl on a VM fd.
1269 */
1270void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
1271{
1272 int ret;
1273
1274 ret = ioctl(vm->fd, cmd, arg);
1275 TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)",
1276 cmd, ret, errno, strerror(errno));
1277}
1278
1279/* VM Dump
1280 *
1281 * Input Args:
1282 * vm - Virtual Machine
1283 * indent - Left margin indent amount
1284 *
1285 * Output Args:
1286 * stream - Output FILE stream
1287 *
1288 * Return: None
1289 *
1290 * Dumps the current state of the VM given by vm, to the FILE stream
1291 * given by stream.
1292 */
1293void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
1294{
1295 struct userspace_mem_region *region;
1296 struct vcpu *vcpu;
1297
1298 fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
1299 fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
1300 fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
1301 fprintf(stream, "%*sMem Regions:\n", indent, "");
1302 for (region = vm->userspace_mem_region_head; region;
1303 region = region->next) {
1304 fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
1305 "host_virt: %p\n", indent + 2, "",
1306 (uint64_t) region->region.guest_phys_addr,
1307 (uint64_t) region->region.memory_size,
1308 region->host_mem);
1309 fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
1310 sparsebit_dump(stream, region->unused_phy_pages, 0);
1311 }
1312 fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
1313 sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
1314 fprintf(stream, "%*spgd_created: %u\n", indent, "",
1315 vm->pgd_created);
1316 if (vm->pgd_created) {
1317 fprintf(stream, "%*sVirtual Translation Tables:\n",
1318 indent + 2, "");
1319 virt_dump(stream, vm, indent + 4);
1320 }
1321 fprintf(stream, "%*sVCPUs:\n", indent, "");
1322 for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next)
1323 vcpu_dump(stream, vm, vcpu->id, indent + 2);
1324}
1325
1326/* VM VCPU Dump
1327 *
1328 * Input Args:
1329 * vm - Virtual Machine
1330 * vcpuid - VCPU ID
1331 * indent - Left margin indent amount
1332 *
1333 * Output Args:
1334 * stream - Output FILE stream
1335 *
1336 * Return: None
1337 *
1338 * Dumps the current state of the VCPU specified by vcpuid, within the VM
1339 * given by vm, to the FILE stream given by stream.
1340 */
1341void vcpu_dump(FILE *stream, struct kvm_vm *vm,
1342 uint32_t vcpuid, uint8_t indent)
1343{
1344 struct kvm_regs regs;
1345 struct kvm_sregs sregs;
1346
1347 fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
1348
1349 fprintf(stream, "%*sregs:\n", indent + 2, "");
1350 vcpu_regs_get(vm, vcpuid, &regs);
1351 regs_dump(stream, &regs, indent + 4);
1352
1353 fprintf(stream, "%*ssregs:\n", indent + 2, "");
1354 vcpu_sregs_get(vm, vcpuid, &sregs);
1355 sregs_dump(stream, &sregs, indent + 4);
1356}
1357
1358/* Known KVM exit reasons */
1359static struct exit_reason {
1360 unsigned int reason;
1361 const char *name;
1362} exit_reasons_known[] = {
1363 {KVM_EXIT_UNKNOWN, "UNKNOWN"},
1364 {KVM_EXIT_EXCEPTION, "EXCEPTION"},
1365 {KVM_EXIT_IO, "IO"},
1366 {KVM_EXIT_HYPERCALL, "HYPERCALL"},
1367 {KVM_EXIT_DEBUG, "DEBUG"},
1368 {KVM_EXIT_HLT, "HLT"},
1369 {KVM_EXIT_MMIO, "MMIO"},
1370 {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"},
1371 {KVM_EXIT_SHUTDOWN, "SHUTDOWN"},
1372 {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"},
1373 {KVM_EXIT_INTR, "INTR"},
1374 {KVM_EXIT_SET_TPR, "SET_TPR"},
1375 {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"},
1376 {KVM_EXIT_S390_SIEIC, "S390_SIEIC"},
1377 {KVM_EXIT_S390_RESET, "S390_RESET"},
1378 {KVM_EXIT_DCR, "DCR"},
1379 {KVM_EXIT_NMI, "NMI"},
1380 {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
1381 {KVM_EXIT_OSI, "OSI"},
1382 {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
1383#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
1384 {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
1385#endif
1386};
1387
1388/* Exit Reason String
1389 *
1390 * Input Args:
1391 * exit_reason - Exit reason
1392 *
1393 * Output Args: None
1394 *
1395 * Return:
1396 * Constant string pointer describing the exit reason.
1397 *
1398 * Locates and returns a constant string that describes the KVM exit
1399 * reason given by exit_reason. If no such string is found, a constant
1400 * string of "Unknown" is returned.
1401 */
1402const char *exit_reason_str(unsigned int exit_reason)
1403{
1404 unsigned int n1;
1405
1406 for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
1407 if (exit_reason == exit_reasons_known[n1].reason)
1408 return exit_reasons_known[n1].name;
1409 }
1410
1411 return "Unknown";
1412}
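
exit_reason_str() exists to make assertion messages readable when a guest exits for an unexpected reason. A typical use, with VCPU_ID as an assumed constant:

#include "kvm_util.h"
#include "test_util.h"

#define VCPU_ID 5	/* assumed vcpu id for this sketch */

static void expect_io_exit(struct kvm_vm *vm)
{
	struct kvm_run *run = vcpu_state(vm, VCPU_ID);

	vcpu_run(vm, VCPU_ID);
	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
		    "Unexpected exit reason: %u (%s)",
		    run->exit_reason, exit_reason_str(run->exit_reason));
}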
1413
1414/* Physical Page Allocate
1415 *
1416 * Input Args:
1417 * vm - Virtual Machine
1418 * paddr_min - Physical address minimum
1419 * memslot - Memory region to allocate page from
1420 *
1421 * Output Args: None
1422 *
1423 * Return:
1424 * Starting physical address
1425 *
1426 * Within the VM specified by vm, locates an available physical page
1427 * at or above paddr_min. If found, the page is marked as in use
1428 * and its address is returned. A TEST_ASSERT failure occurs if no
1429 * page is available at or above paddr_min.
1430 */
1431vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm,
1432 vm_paddr_t paddr_min, uint32_t memslot)
1433{
1434 struct userspace_mem_region *region;
1435 sparsebit_idx_t pg;
1436
1437 TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
1438	    "not divisible by page size.\n"
1439 " paddr_min: 0x%lx page_size: 0x%x",
1440 paddr_min, vm->page_size);
1441
1442 /* Locate memory region. */
1443 region = memslot2region(vm, memslot);
1444
1445 /* Locate next available physical page at or above paddr_min. */
1446 pg = paddr_min >> vm->page_shift;
1447
1448 if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
1449 pg = sparsebit_next_set(region->unused_phy_pages, pg);
1450 if (pg == 0) {
1451 fprintf(stderr, "No guest physical page available, "
1452 "paddr_min: 0x%lx page_size: 0x%x memslot: %u",
1453 paddr_min, vm->page_size, memslot);
1454 fputs("---- vm dump ----\n", stderr);
1455 vm_dump(stderr, vm, 2);
1456 abort();
1457 }
1458 }
1459
1460 /* Specify page as in use and return its address. */
1461 sparsebit_clear(region->unused_phy_pages, pg);
1462
1463 return pg * vm->page_size;
1464}
1465
1466/* Address Guest Virtual to Host Virtual
1467 *
1468 * Input Args:
1469 * vm - Virtual Machine
1470 * gva - VM virtual address
1471 *
1472 * Output Args: None
1473 *
1474 * Return:
1475 * Equivalent host virtual address
1476 */
1477void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
1478{
1479 return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
1480}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
new file mode 100644
index 000000000000..a0bd1980c81c
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -0,0 +1,67 @@
1/*
2 * tools/testing/selftests/kvm/lib/kvm_util_internal.h
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#ifndef KVM_UTIL_INTERNAL_H
10#define KVM_UTIL_INTERNAL_H 1
11
12#include "sparsebit.h"
13
14#ifndef BITS_PER_BYTE
15#define BITS_PER_BYTE 8
16#endif
17
18#ifndef BITS_PER_LONG
19#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long))
20#endif
21
22#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
23#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
24
25/* Concrete definition of struct kvm_vm. */
26struct userspace_mem_region {
27 struct userspace_mem_region *next, *prev;
28 struct kvm_userspace_memory_region region;
29 struct sparsebit *unused_phy_pages;
30 int fd;
31 off_t offset;
32 void *host_mem;
33 void *mmap_start;
34 size_t mmap_size;
35};
36
37struct vcpu {
38 struct vcpu *next, *prev;
39 uint32_t id;
40 int fd;
41 struct kvm_run *state;
42};
43
44struct kvm_vm {
45 int mode;
46 int fd;
47 unsigned int page_size;
48 unsigned int page_shift;
49 uint64_t max_gfn;
50 struct vcpu *vcpu_head;
51 struct userspace_mem_region *userspace_mem_region_head;
52 struct sparsebit *vpages_valid;
53 struct sparsebit *vpages_mapped;
54 bool pgd_created;
55 vm_paddr_t pgd;
56};
57
58struct vcpu *vcpu_find(struct kvm_vm *vm,
59 uint32_t vcpuid);
60void vcpu_setup(struct kvm_vm *vm, int vcpuid);
61void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
62void regs_dump(FILE *stream, struct kvm_regs *regs,
63 uint8_t indent);
64void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
65 uint8_t indent);
66
67#endif
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
new file mode 100644
index 000000000000..0c5cf3e0cb6f
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -0,0 +1,2087 @@
1/*
2 * Sparse bit array
3 *
4 * Copyright (C) 2018, Google LLC.
5 * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver)
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2.
8 *
9 * This library provides functions to support a memory efficient bit array,
10 * with an index size of 2^64. A sparsebit array is allocated through
11 * the use of sparsebit_alloc() and freed via sparsebit_free(),
12 * such as in the following:
13 *
14 * struct sparsebit *s;
15 * s = sparsebit_alloc();
16 * sparsebit_free(&s);
17 *
18 * The internal layout of the struct sparsebit type is opaque to callers.
19 * Note that sparsebit_free() takes a pointer to the sparsebit
20 * pointer.  This is so that sparsebit_free() is able to poison
21 * the caller's pointer (e.g. set it to NULL) to the struct sparsebit
22 * before returning to the caller.
23 *
24 * Between the return of sparsebit_alloc() and the call of
25 * sparsebit_free(), there are multiple query and modifying operations
26 * that can be performed on the allocated sparsebit array. All of
27 * these operations take as a parameter the value returned from
28 * sparsebit_alloc() and most also take a bit index. Frequently
29 * used routines include:
30 *
31 * ---- Query Operations
32 * sparsebit_is_set(s, idx)
33 * sparsebit_is_clear(s, idx)
34 * sparsebit_any_set(s)
35 * sparsebit_first_set(s)
36 * sparsebit_next_set(s, prev_idx)
37 *
38 * ---- Modifying Operations
39 * sparsebit_set(s, idx)
40 * sparsebit_clear(s, idx)
41 * sparsebit_set_num(s, idx, num);
42 * sparsebit_clear_num(s, idx, num);
43 *
44 * A common operation is to iterate over all the bits set in a test
45 * sparsebit array. This can be done via code with the following structure:
46 *
47 * sparsebit_idx_t idx;
48 * if (sparsebit_any_set(s)) {
49 * idx = sparsebit_first_set(s);
50 * do {
51 * ...
52 * idx = sparsebit_next_set(s, idx);
53 * } while (idx != 0);
54 * }
55 *
56 * The index of the first bit set needs to be obtained via
57 * sparsebit_first_set(), because sparsebit_next_set() needs
58 * the index of the previously set bit.  The sparsebit_idx_t type is
59 * unsigned, so there is no previous index before 0 that is available.
60 * Also, the call to sparsebit_first_set() is not made unless there
61 * is at least 1 bit in the array set.  This is because sparsebit_first_set()
62 * aborts if it is called with no bits set.
63 * It is the caller's responsibility to assure that the
64 * sparsebit array has at least a single bit set before calling
65 * sparsebit_first_set().
66 *
67 * ==== Implementation Overview ====
68 * For the most part the internal implementation of sparsebit is
69 * opaque to the caller. One important implementation detail that the
70 * caller may need to be aware of is the spatial complexity of the
71 * implementation. This implementation of a sparsebit array is not
72 * only sparse, in that it uses memory proportional to the number of bits
73 * set. It is also efficient in memory usage when most of the bits are
74 * set.
75 *
76 * At a high-level the state of the bit settings are maintained through
77 * the use of a binary-search tree, where each node contains at least
78 * the following members:
79 *
80 * typedef uint64_t sparsebit_idx_t;
81 * typedef uint64_t sparsebit_num_t;
82 *
83 * sparsebit_idx_t idx;
84 * uint32_t mask;
85 * sparsebit_num_t num_after;
86 *
87 * The idx member contains the bit index of the first bit described by this
88 * node, while the mask member stores the setting of the first 32-bits.
89 * The setting of the bit at idx + n, where 0 <= n < 32, is located in the
90 * mask member at 1 << n.
91 *
92 * Nodes are sorted by idx and the bits described by two nodes will never
93 * overlap. The idx member is always aligned to the mask size, i.e. a
94 * multiple of 32.
95 *
96 * Beyond a typical implementation, the nodes in this implementation also
97 * contain a member named num_after.  The num_after member holds the
98 * number of bits immediately after the mask bits that are contiguously set.
99 * The use of the num_after member allows this implementation to efficiently
100 * represent cases where most bits are set. For example, the case of all
101 * but the last two bits set, is represented by the following two nodes:
102 *
103 * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0
104 * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0
105 *
106 * ==== Invariants ====
107 * This implementation uses the following invariants:
108 *
109 * + Nodes are only used to represent bits that are set.
110 * Nodes with a mask of 0 and num_after of 0 are not allowed.
111 *
112 * + Sum of bits set in all the nodes is equal to the value of
113 * the struct sparsebit num_set member.
114 *
115 * + The setting of at least one bit is always described in a node's
116 * mask (mask >= 1).
117 *
118 * + A node with all mask bits set only occurs when the last bit
119 * described by the previous node is not equal to this node's
120 * starting index - 1.  All such occurrences of this condition are
121 * avoided by moving the setting of the node's mask bits into
122 * the previous node's num_after setting.
123 *
124 * + Node starting index is evenly divisible by the number of bits
125 * within a node's mask member.
126 *
127 * + Nodes never represent a range of bits that wrap around the
128 * highest supported index.
129 *
130 * (idx + MASK_BITS + num_after - 1) <= ((sparsebit_idx_t) 0) - 1
131 *
132 * As a consequence of the above, the num_after member of a node
133 * will always be <=:
134 *
135 * maximum_index - nodes_starting_index - number_of_mask_bits
136 *
137 * + Nodes within the binary search tree are sorted based on each
138 * node's starting index.
139 *
140 * + The ranges of bits described by any two nodes do not overlap.  The
141 * range of bits described by a single node is:
142 *
143 * start: node->idx
144 * end (inclusive): node->idx + MASK_BITS + node->num_after - 1;
145 *
146 * Note, at times these invariants are temporarily violated for a
147 * specific portion of the code. For example, when setting a mask
148 * bit, there is a small delay between when the mask bit is set and the
149 * value in the struct sparsebit num_set member is updated.  Other
150 * temporary violations occur when node_split() is called with a specified
151 * index and assures that a node where its mask represents the bit
152 * at the specified index exists. At times to do this node_split()
153 * must split an existing node into two nodes or create a node that
154 * has no bits set. Such temporary violations must be corrected before
155 * returning to the caller. These corrections are typically performed
156 * by the local function node_reduce().
157 */
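
To complement the header comment above, a compact, self-contained sketch that exercises the documented query and modify calls together with the iteration idiom; the indices chosen are arbitrary:

#include <stdio.h>

#include "sparsebit.h"

int main(void)
{
	struct sparsebit *s = sparsebit_alloc();
	sparsebit_idx_t idx;

	sparsebit_set(s, 0x3);
	sparsebit_set_num(s, 0x1000, 8);	/* sets bits 0x1000..0x1007 */
	sparsebit_clear(s, 0x1004);

	/* Iterate over every set bit, as described above. */
	if (sparsebit_any_set(s)) {
		idx = sparsebit_first_set(s);
		do {
			printf("bit 0x%lx is set\n", (unsigned long)idx);
			idx = sparsebit_next_set(s, idx);
		} while (idx != 0);
	}

	sparsebit_free(&s);	/* also poisons s (sets it to NULL) */
	return 0;
}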
158
159#include "test_util.h"
160#include "sparsebit.h"
161#include <limits.h>
162#include <assert.h>
163
164#define DUMP_LINE_MAX 100 /* Does not include indent amount */
165
166typedef uint32_t mask_t;
167#define MASK_BITS (sizeof(mask_t) * CHAR_BIT)
168
169struct node {
170 struct node *parent;
171 struct node *left;
172 struct node *right;
173 sparsebit_idx_t idx; /* index of least-significant bit in mask */
174 sparsebit_num_t num_after; /* num contiguously set after mask */
175 mask_t mask;
176};
177
178struct sparsebit {
179 /*
180 * Points to root node of the binary search
181 * tree. Equal to NULL when no bits are set in
182 * the entire sparsebit array.
183 */
184 struct node *root;
185
186 /*
187 * A redundant count of the total number of bits set. Used for
188 * diagnostic purposes and to change the time complexity of
189 * sparsebit_num_set() from O(n) to O(1).
190 * Note: Due to overflow, a value of 0 means none or all set.
191 */
192 sparsebit_num_t num_set;
193};
194
195/* Returns the number of set bits described by the settings
196 * of the node pointed to by nodep.
197 */
198static sparsebit_num_t node_num_set(struct node *nodep)
199{
200 return nodep->num_after + __builtin_popcount(nodep->mask);
201}
202
203/* Returns a pointer to the node that describes the
204 * lowest bit index.
205 */
206static struct node *node_first(struct sparsebit *s)
207{
208 struct node *nodep;
209
210 for (nodep = s->root; nodep && nodep->left; nodep = nodep->left)
211 ;
212
213 return nodep;
214}
215
216/* Returns a pointer to the node that describes the
217 * lowest bit index > the index of the node pointed to by np.
218 * Returns NULL if no node with a higher index exists.
219 */
220static struct node *node_next(struct sparsebit *s, struct node *np)
221{
222 struct node *nodep = np;
223
224 /*
225 * If current node has a right child, next node is the left-most
226 * of the right child.
227 */
228 if (nodep->right) {
229 for (nodep = nodep->right; nodep->left; nodep = nodep->left)
230 ;
231 return nodep;
232 }
233
234 /*
235 * No right child. Go up until node is left child of a parent.
236 * That parent is then the next node.
237 */
238 while (nodep->parent && nodep == nodep->parent->right)
239 nodep = nodep->parent;
240
241 return nodep->parent;
242}
243
244/* Searches for and returns a pointer to the node that describes the
245 * highest index < the index of the node pointed to by np.
246 * Returns NULL if no node with a lower index exists.
247 */
248static struct node *node_prev(struct sparsebit *s, struct node *np)
249{
250 struct node *nodep = np;
251
252 /*
253 * If current node has a left child, next node is the right-most
254 * of the left child.
255 */
256 if (nodep->left) {
257 for (nodep = nodep->left; nodep->right; nodep = nodep->right)
258 ;
259 return (struct node *) nodep;
260 }
261
262 /*
263 * No left child. Go up until node is right child of a parent.
264 * That parent is then the next node.
265 */
266 while (nodep->parent && nodep == nodep->parent->left)
267 nodep = nodep->parent;
268
269 return (struct node *) nodep->parent;
270}
271
272
273/* Allocates space to hold a copy of the node sub-tree pointed to by
274 * subtree and duplicates the bit settings to the newly allocated nodes.
275 * Returns the newly allocated copy of subtree.
276 */
277static struct node *node_copy_subtree(struct node *subtree)
278{
279 struct node *root;
280
281 /* Duplicate the node at the root of the subtree */
282 root = calloc(1, sizeof(*root));
283 if (!root) {
284 perror("calloc");
285 abort();
286 }
287
288 root->idx = subtree->idx;
289 root->mask = subtree->mask;
290 root->num_after = subtree->num_after;
291
292 /* As needed, recursively duplicate the left and right subtrees */
293 if (subtree->left) {
294 root->left = node_copy_subtree(subtree->left);
295 root->left->parent = root;
296 }
297
298 if (subtree->right) {
299 root->right = node_copy_subtree(subtree->right);
300 root->right->parent = root;
301 }
302
303 return root;
304}
305
306/* Searches for and returns a pointer to the node that describes the setting
307 * of the bit given by idx. A node describes the setting of a bit if its
308 * index is within the bits described by the mask bits or the number of
309 * contiguous bits set after the mask. Returns NULL if there is no such node.
310 */
311static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx)
312{
313 struct node *nodep;
314
315 /* Find the node that describes the setting of the bit at idx */
316 for (nodep = s->root; nodep;
317 nodep = nodep->idx > idx ? nodep->left : nodep->right) {
318 if (idx >= nodep->idx &&
319 idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
320 break;
321 }
322
323 return nodep;
324}
325
326/* Entry Requirements:
327 * + A node that describes the setting of idx is not already present.
328 *
329 * Adds a new node to describe the setting of the bit at the index given
330 * by idx. Returns a pointer to the newly added node.
331 *
332 * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced.
333 */
334static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx)
335{
336 struct node *nodep, *parentp, *prev;
337
338 /* Allocate and initialize the new node. */
339 nodep = calloc(1, sizeof(*nodep));
340 if (!nodep) {
341 perror("calloc");
342 abort();
343 }
344
345 nodep->idx = idx & -MASK_BITS;
346
347 /* If no nodes, set it up as the root node. */
348 if (!s->root) {
349 s->root = nodep;
350 return nodep;
351 }
352
353 /*
354 * Find the parent where the new node should be attached
355 * and add the node there.
356 */
357 parentp = s->root;
358 while (true) {
359 if (idx < parentp->idx) {
360 if (!parentp->left) {
361 parentp->left = nodep;
362 nodep->parent = parentp;
363 break;
364 }
365 parentp = parentp->left;
366 } else {
367 assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1);
368 if (!parentp->right) {
369 parentp->right = nodep;
370 nodep->parent = parentp;
371 break;
372 }
373 parentp = parentp->right;
374 }
375 }
376
377 /*
378	 * Do the num_after bits of the previous node overlap with the mask
379	 * of the new node?  If so, set the bits in the new node's mask
380	 * and reduce the previous node's num_after.
381 */
382 prev = node_prev(s, nodep);
383 while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) {
384 unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1)
385 - nodep->idx;
386 assert(prev->num_after > 0);
387 assert(n1 < MASK_BITS);
388 assert(!(nodep->mask & (1 << n1)));
389 nodep->mask |= (1 << n1);
390 prev->num_after--;
391 }
392
393 return nodep;
394}
395
396/* Returns whether all the bits in the sparsebit array are set. */
397bool sparsebit_all_set(struct sparsebit *s)
398{
399 /*
400	 * If there are any nodes, there must be at least one bit set.  Only case
401 * where a bit is set and total num set is 0, is when all bits
402 * are set.
403 */
404 return s->root && s->num_set == 0;
405}
406
407/* Clears all bits described by the node pointed to by nodep, then
408 * removes the node.
409 */
410static void node_rm(struct sparsebit *s, struct node *nodep)
411{
412 struct node *tmp;
413 sparsebit_num_t num_set;
414
415 num_set = node_num_set(nodep);
416 assert(s->num_set >= num_set || sparsebit_all_set(s));
417 s->num_set -= node_num_set(nodep);
418
419 /* Have both left and right child */
420 if (nodep->left && nodep->right) {
421 /*
422 * Move left children to the leftmost leaf node
423 * of the right child.
424 */
425 for (tmp = nodep->right; tmp->left; tmp = tmp->left)
426 ;
427 tmp->left = nodep->left;
428 nodep->left = NULL;
429 tmp->left->parent = tmp;
430 }
431
432 /* Left only child */
433 if (nodep->left) {
434 if (!nodep->parent) {
435 s->root = nodep->left;
436 nodep->left->parent = NULL;
437 } else {
438 nodep->left->parent = nodep->parent;
439 if (nodep == nodep->parent->left)
440 nodep->parent->left = nodep->left;
441 else {
442 assert(nodep == nodep->parent->right);
443 nodep->parent->right = nodep->left;
444 }
445 }
446
447 nodep->parent = nodep->left = nodep->right = NULL;
448 free(nodep);
449
450 return;
451 }
452
453
454 /* Right only child */
455 if (nodep->right) {
456 if (!nodep->parent) {
457 s->root = nodep->right;
458 nodep->right->parent = NULL;
459 } else {
460 nodep->right->parent = nodep->parent;
461 if (nodep == nodep->parent->left)
462 nodep->parent->left = nodep->right;
463 else {
464 assert(nodep == nodep->parent->right);
465 nodep->parent->right = nodep->right;
466 }
467 }
468
469 nodep->parent = nodep->left = nodep->right = NULL;
470 free(nodep);
471
472 return;
473 }
474
475 /* Leaf Node */
476 if (!nodep->parent) {
477 s->root = NULL;
478 } else {
479 if (nodep->parent->left == nodep)
480 nodep->parent->left = NULL;
481 else {
482 assert(nodep == nodep->parent->right);
483 nodep->parent->right = NULL;
484 }
485 }
486
487 nodep->parent = nodep->left = nodep->right = NULL;
488 free(nodep);
489
490 return;
491}
492
493/* Splits the node containing the bit at idx so that there is a node
494 * that starts at the specified index. If no such node exists, a new
495 * node at the specified index is created. Returns the new node.
496 *
497 * idx must be on a mask boundary.
498 */
499static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx)
500{
501 struct node *nodep1, *nodep2;
502 sparsebit_idx_t offset;
503 sparsebit_num_t orig_num_after;
504
505 assert(!(idx % MASK_BITS));
506
507 /*
508 * Is there a node that describes the setting of idx?
509 * If not, add it.
510 */
511 nodep1 = node_find(s, idx);
512 if (!nodep1)
513 return node_add(s, idx);
514
515 /*
516 * All done if the starting index of the node is where the
517 * split should occur.
518 */
519 if (nodep1->idx == idx)
520 return nodep1;
521
522 /*
523 * Split point not at start of mask, so it must be part of
524 * bits described by num_after.
525 */
526
527 /*
528 * Calculate offset within num_after for where the split is
529 * to occur.
530 */
531 offset = idx - (nodep1->idx + MASK_BITS);
532 orig_num_after = nodep1->num_after;
533
534 /*
535 * Add a new node to describe the bits starting at
536 * the split point.
537 */
538 nodep1->num_after = offset;
539 nodep2 = node_add(s, idx);
540
541 /* Move bits after the split point into the new node */
542 nodep2->num_after = orig_num_after - offset;
543 if (nodep2->num_after >= MASK_BITS) {
544 nodep2->mask = ~(mask_t) 0;
545 nodep2->num_after -= MASK_BITS;
546 } else {
547 nodep2->mask = (1 << nodep2->num_after) - 1;
548 nodep2->num_after = 0;
549 }
550
551 return nodep2;
552}
553
554/* Iteratively reduces the node pointed to by nodep and its adjacent
555 * nodes into a more compact form. For example, a node with a mask with
556 * all bits set adjacent to a previous node, will get combined into a
557 * single node with an increased num_after setting.
558 *
559 * After each reduction, a further check is made to see if additional
560 * reductions are possible with the new previous and next nodes. Note,
561 * a search for a reduction is only done across the nodes nearest nodep
562 * and those that became part of a reduction. Reductions beyond nodep
563 * and the adjacent nodes that are reduced are not discovered. It is the
564 * responsibility of the caller to pass a nodep that is within one node
565 * of each possible reduction.
566 *
567 * This function does not fix the temporary violation of all invariants.
568 * For example it does not fix the case where the bit settings described
569 * by two or more nodes overlap. Such a violation introduces the potential
570 * complication of a bit setting for a specific index having different settings
571 * in different nodes. This would then introduce the further complication
572 * of which node has the correct setting of the bit and thus such conditions
573 * are not allowed.
574 *
575 * This function is designed to fix invariant violations that are introduced
576 * by node_split() and by changes to a node's mask or num_after members.
577 * For example, when setting a bit within a node's mask, the function that
578 * sets the bit doesn't have to worry about whether the setting of that
579 * bit caused the mask to have leading only or trailing only bits set.
580 * Instead, the function can call node_reduce(), with nodep equal to the
581 * node address that it set a mask bit in, and node_reduce() will notice
582 * the cases of leading or trailing only bits and that there is an
583 * adjacent node that the bit settings could be merged into.
584 *
585 * This implementation specifically detects and corrects violation of the
586 * following invariants:
587 *
588 * + Nodes are only used to represent bits that are set.
589 * Nodes with a mask of 0 and num_after of 0 are not allowed.
590 *
591 * + The setting of at least one bit is always described in a node's
592 * mask (mask >= 1).
593 *
594 * + A node with all mask bits set only occurs when the last bit
595 * described by the previous node is not equal to this node's
596 * starting index - 1.  All such occurrences of this condition are
597 * avoided by moving the setting of the node's mask bits into
598 * the previous node's num_after setting.
599 */
600static void node_reduce(struct sparsebit *s, struct node *nodep)
601{
602 bool reduction_performed;
603
604 do {
605 reduction_performed = false;
606 struct node *prev, *next, *tmp;
607
608 /* 1) Potential reductions within the current node. */
609
610 /* Nodes with all bits cleared may be removed. */
611 if (nodep->mask == 0 && nodep->num_after == 0) {
612 /*
613 * About to remove the node pointed to by
614 * nodep, which normally would cause a problem
615 * for the next pass through the reduction loop,
616 * because the node at the starting point no longer
617 * exists. This potential problem is handled
618 * by first remembering the location of the next
619 * or previous nodes. Doesn't matter which, because
620 * once the node at nodep is removed, there will be
621 * no other nodes between prev and next.
622 *
623			 * Note, the checks performed on nodep against
624			 * both prev and next check for an adjacent
625			 * node that can be reduced into a single node.  As
626			 * such, after removing the node at nodep, it doesn't
627			 * matter whether the nodep for the next pass
628 * through the loop is equal to the previous pass
629 * prev or next node. Either way, on the next pass
630 * the one not selected will become either the
631 * prev or next node.
632 */
633 tmp = node_next(s, nodep);
634 if (!tmp)
635 tmp = node_prev(s, nodep);
636
637 node_rm(s, nodep);
638 nodep = NULL;
639
640 nodep = tmp;
641 reduction_performed = true;
642 continue;
643 }
644
645 /*
646 * When the mask is 0, can reduce the amount of num_after
647 * bits by moving the initial num_after bits into the mask.
648 */
649 if (nodep->mask == 0) {
650 assert(nodep->num_after != 0);
651 assert(nodep->idx + MASK_BITS > nodep->idx);
652
653 nodep->idx += MASK_BITS;
654
655 if (nodep->num_after >= MASK_BITS) {
656 nodep->mask = ~0;
657 nodep->num_after -= MASK_BITS;
658 } else {
659 nodep->mask = (1u << nodep->num_after) - 1;
660 nodep->num_after = 0;
661 }
662
663 reduction_performed = true;
664 continue;
665 }
666
667 /*
668 * 2) Potential reductions between the current and
669 * previous nodes.
670 */
671 prev = node_prev(s, nodep);
672 if (prev) {
673 sparsebit_idx_t prev_highest_bit;
674
675 /* Nodes with no bits set can be removed. */
676 if (prev->mask == 0 && prev->num_after == 0) {
677 node_rm(s, prev);
678
679 reduction_performed = true;
680 continue;
681 }
682
683 /*
684 * All mask bits set and previous node has
685 * adjacent index.
686 */
687 if (nodep->mask + 1 == 0 &&
688 prev->idx + MASK_BITS == nodep->idx) {
689 prev->num_after += MASK_BITS + nodep->num_after;
690 nodep->mask = 0;
691 nodep->num_after = 0;
692
693 reduction_performed = true;
694 continue;
695 }
696
697 /*
698 * Is node adjacent to previous node and the node
699 * contains a single contiguous range of bits
700 * starting from the beginning of the mask?
701 */
702 prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after;
703 if (prev_highest_bit + 1 == nodep->idx &&
704 (nodep->mask | (nodep->mask >> 1)) == nodep->mask) {
705 /*
706 * How many contiguous bits are there?
707 * Is equal to the total number of set
708 * bits, due to an earlier check that
709 * there is a single contiguous range of
710 * set bits.
711 */
712 unsigned int num_contiguous
713 = __builtin_popcount(nodep->mask);
714 assert((num_contiguous > 0) &&
715 ((1ULL << num_contiguous) - 1) == nodep->mask);
716
717 prev->num_after += num_contiguous;
718 nodep->mask = 0;
719
720 /*
721 * For predictable performance, handle special
722 * case where all mask bits are set and there
723 * is a non-zero num_after setting. This code
724 * is functionally correct without the following
725 * conditionalized statements, but without them
726 * the value of num_after is only reduced by
727 * the number of mask bits per pass. There are
728 * cases where num_after can be close to 2^64.
729 * Without this code it could take nearly
730 * (2^64) / 32 passes to perform the full
731 * reduction.
732 */
733 if (num_contiguous == MASK_BITS) {
734 prev->num_after += nodep->num_after;
735 nodep->num_after = 0;
736 }
737
738 reduction_performed = true;
739 continue;
740 }
741 }
742
743 /*
744 * 3) Potential reductions between the current and
745 * next nodes.
746 */
747 next = node_next(s, nodep);
748 if (next) {
749 /* Nodes with no bits set can be removed. */
750 if (next->mask == 0 && next->num_after == 0) {
751 node_rm(s, next);
752 reduction_performed = true;
753 continue;
754 }
755
756 /*
757 * Is next node index adjacent to current node
758 * and has a mask with all bits set?
759 */
760 if (next->idx == nodep->idx + MASK_BITS + nodep->num_after &&
761 next->mask == ~(mask_t) 0) {
762 nodep->num_after += MASK_BITS;
763 next->mask = 0;
764 nodep->num_after += next->num_after;
765 next->num_after = 0;
766
767 node_rm(s, next);
768 next = NULL;
769
770 reduction_performed = true;
771 continue;
772 }
773 }
774 } while (nodep && reduction_performed);
775}
776
777/* Returns whether the bit at the index given by idx, within the
778 * sparsebit array is set or not.
779 */
780bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx)
781{
782 struct node *nodep;
783
784 /* Find the node that describes the setting of the bit at idx */
785 for (nodep = s->root; nodep;
786 nodep = nodep->idx > idx ? nodep->left : nodep->right)
787 if (idx >= nodep->idx &&
788 idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
789 goto have_node;
790
791 return false;
792
793have_node:
794 /* Bit is set if it is any of the bits described by num_after */
795 if (nodep->num_after && idx >= nodep->idx + MASK_BITS)
796 return true;
797
798 /* Is the corresponding mask bit set */
799 assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS);
800 return !!(nodep->mask & (1 << (idx - nodep->idx)));
801}
802
803/* Within the sparsebit array pointed to by s, sets the bit
804 * at the index given by idx.
805 */
806static void bit_set(struct sparsebit *s, sparsebit_idx_t idx)
807{
808 struct node *nodep;
809
810 /* Skip bits that are already set */
811 if (sparsebit_is_set(s, idx))
812 return;
813
814 /*
815 * Get a node where the bit at idx is described by the mask.
816 * The node_split will also create a node, if there isn't
817	 * already a node that describes the setting of the bit.
818 */
819 nodep = node_split(s, idx & -MASK_BITS);
820
821 /* Set the bit within the nodes mask */
822 assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
823 assert(!(nodep->mask & (1 << (idx - nodep->idx))));
824 nodep->mask |= 1 << (idx - nodep->idx);
825 s->num_set++;
826
827 node_reduce(s, nodep);
828}
829
830/* Within the sparsebit array pointed to by s, clears the bit
831 * at the index given by idx.
832 */
833static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx)
834{
835 struct node *nodep;
836
837 /* Skip bits that are already cleared */
838 if (!sparsebit_is_set(s, idx))
839 return;
840
841 /* Is there a node that describes the setting of this bit? */
842 nodep = node_find(s, idx);
843 if (!nodep)
844 return;
845
846 /*
847 * If a num_after bit, split the node, so that the bit is
848 * part of a node mask.
849 */
850 if (idx >= nodep->idx + MASK_BITS)
851 nodep = node_split(s, idx & -MASK_BITS);
852
853 /*
854 * After node_split above, bit at idx should be within the mask.
855 * Clear that bit.
856 */
857 assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
858 assert(nodep->mask & (1 << (idx - nodep->idx)));
859 nodep->mask &= ~(1 << (idx - nodep->idx));
860 assert(s->num_set > 0 || sparsebit_all_set(s));
861 s->num_set--;
862
863 node_reduce(s, nodep);
864}
865
866/* Recursively dumps to the FILE stream given by stream the contents
867 * of the sub-tree of nodes pointed to by nodep. Each line of output
868 * is prefixed by the number of spaces given by indent. On each
869 * recursion, the indent amount is increased by 2. This causes nodes
870 * at each level deeper into the binary search tree to be displayed
871 * with a greater indent.
872 */
873static void dump_nodes(FILE *stream, struct node *nodep,
874 unsigned int indent)
875{
876 char *node_type;
877
878 /* Dump contents of node */
879 if (!nodep->parent)
880 node_type = "root";
881 else if (nodep == nodep->parent->left)
882 node_type = "left";
883 else {
884 assert(nodep == nodep->parent->right);
885 node_type = "right";
886 }
887 fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", node_type, nodep);
888 fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "",
889 nodep->parent, nodep->left, nodep->right);
890 fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n",
891 indent, "", nodep->idx, nodep->mask, nodep->num_after);
892
893 /* If present, dump contents of left child nodes */
894 if (nodep->left)
895 dump_nodes(stream, nodep->left, indent + 2);
896
897 /* If present, dump contents of right child nodes */
898 if (nodep->right)
899 dump_nodes(stream, nodep->right, indent + 2);
900}
901
902static inline sparsebit_idx_t node_first_set(struct node *nodep, int start)
903{
904 mask_t leading = (mask_t)1 << start;
905 int n1 = __builtin_ctz(nodep->mask & -leading);
906
907 return nodep->idx + n1;
908}
909
910static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start)
911{
912 mask_t leading = (mask_t)1 << start;
913 int n1 = __builtin_ctz(~nodep->mask & -leading);
914
915 return nodep->idx + n1;
916}
917
918/* Dumps to the FILE stream specified by stream, the implementation dependent
919 * internal state of s. Each line of output is prefixed with the number
920 * of spaces given by indent. The output is completely implementation
921 * dependent and subject to change. Output from this function should only
922 * be used for diagnostic purposes. For example, this function can be
923 * used by test cases after they detect an unexpected condition, as a means
924 * to capture diagnostic information.
925 */
926static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s,
927 unsigned int indent)
928{
929 /* Dump the contents of s */
930 fprintf(stream, "%*sroot: %p\n", indent, "", s->root);
931 fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set);
932
933 if (s->root)
934 dump_nodes(stream, s->root, indent);
935}
936
937/* Allocates and returns a new sparsebit array. The initial state
938 * of the newly allocated sparsebit array has all bits cleared.
939 */
940struct sparsebit *sparsebit_alloc(void)
941{
942 struct sparsebit *s;
943
944 /* Allocate top level structure. */
945 s = calloc(1, sizeof(*s));
946 if (!s) {
947 perror("calloc");
948 abort();
949 }
950
951 return s;
952}
953
954/* Frees the implementation dependent data for the sparsebit array
955 * pointed to by s and poisons the pointer to that data.
956 */
957void sparsebit_free(struct sparsebit **sbitp)
958{
959 struct sparsebit *s = *sbitp;
960
961 if (!s)
962 return;
963
964 sparsebit_clear_all(s);
965 free(s);
966 *sbitp = NULL;
967}
968
969/* Makes a copy of the sparsebit array given by s, to the sparsebit
970 * array given by d. Note, d must have already been allocated via
971 * sparsebit_alloc().  It may already have bits set, which,
972 * if different from those in s, will be cleared.
973 */
974void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
975{
976 /* First clear any bits already set in the destination */
977 sparsebit_clear_all(d);
978
979 if (s->root) {
980 d->root = node_copy_subtree(s->root);
981 d->num_set = s->num_set;
982 }
983}
984
985/* Returns whether num consecutive bits starting at idx are all set. */
986bool sparsebit_is_set_num(struct sparsebit *s,
987 sparsebit_idx_t idx, sparsebit_num_t num)
988{
989 sparsebit_idx_t next_cleared;
990
991 assert(num > 0);
992 assert(idx + num - 1 >= idx);
993
994 /* With num > 0, the first bit must be set. */
995 if (!sparsebit_is_set(s, idx))
996 return false;
997
998 /* Find the next cleared bit */
999 next_cleared = sparsebit_next_clear(s, idx);
1000
1001 /*
1002 * If no cleared bits beyond idx, then there are at least num
1003 * set bits. idx + num doesn't wrap. Otherwise check if
1004 * there are enough set bits between idx and the next cleared bit.
1005 */
1006 return next_cleared == 0 || next_cleared - idx >= num;
1007}
1008
1009/* Returns whether the bit at the index given by idx is cleared. */
1010bool sparsebit_is_clear(struct sparsebit *s,
1011 sparsebit_idx_t idx)
1012{
1013 return !sparsebit_is_set(s, idx);
1014}
1015
1016/* Returns whether num consecutive bits starting at idx are all cleared. */
1017bool sparsebit_is_clear_num(struct sparsebit *s,
1018 sparsebit_idx_t idx, sparsebit_num_t num)
1019{
1020 sparsebit_idx_t next_set;
1021
1022 assert(num > 0);
1023 assert(idx + num - 1 >= idx);
1024
1025 /* With num > 0, the first bit must be cleared. */
1026 if (!sparsebit_is_clear(s, idx))
1027 return false;
1028
1029 /* Find the next set bit */
1030 next_set = sparsebit_next_set(s, idx);
1031
1032 /*
1033 * If no set bits beyond idx, then there are at least num
1034 * cleared bits. idx + num doesn't wrap. Otherwise check if
1035 * there are enough cleared bits between idx and the next set bit.
1036 */
1037 return next_set == 0 || next_set - idx >= num;
1038}
1039
1040/* Returns the total number of bits set. Note: 0 is also returned for
1041 * the case of all bits set. This is because with all bits set, there
1042 * is 1 additional bit set beyond what can be represented in the return
1043 * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0,
1044 * to determine if the sparsebit array has any bits set.
1045 */
1046sparsebit_num_t sparsebit_num_set(struct sparsebit *s)
1047{
1048 return s->num_set;
1049}
1050
1051/* Returns whether any bit is set in the sparsebit array. */
1052bool sparsebit_any_set(struct sparsebit *s)
1053{
1054 /*
1055	 * Nodes only describe set bits.  If there are any nodes, then there
1056 * is at least 1 bit set.
1057 */
1058 if (!s->root)
1059 return false;
1060
1061 /*
1062	 * Every node should have a non-zero mask.  For now, just
1063	 * assure that the root node has a non-zero mask,
1064 * which is a quick check that at least 1 bit is set.
1065 */
1066 assert(s->root->mask != 0);
1067 assert(s->num_set > 0 ||
1068 (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS &&
1069 s->root->mask == ~(mask_t) 0));
1070
1071 return true;
1072}
1073
1074/* Returns whether all the bits in the sparsebit array are cleared. */
1075bool sparsebit_all_clear(struct sparsebit *s)
1076{
1077 return !sparsebit_any_set(s);
1078}
1079
1080/* Returns whether any bit in the sparsebit array is cleared. */
1081bool sparsebit_any_clear(struct sparsebit *s)
1082{
1083 return !sparsebit_all_set(s);
1084}
1085
1086/* Returns the index of the first set bit. Abort if no bits are set.
1087 */
1088sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
1089{
1090 struct node *nodep;
1091
1092 /* Validate at least 1 bit is set */
1093 assert(sparsebit_any_set(s));
1094
1095 nodep = node_first(s);
1096 return node_first_set(nodep, 0);
1097}
1098
1099/* Returns the index of the first cleared bit. Abort if
1100 * no bits are cleared.
1101 */
1102sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
1103{
1104 struct node *nodep1, *nodep2;
1105
1106 /* Validate at least 1 bit is cleared. */
1107 assert(sparsebit_any_clear(s));
1108
1109 /* If no nodes or first node index > 0 then lowest cleared is 0 */
1110 nodep1 = node_first(s);
1111 if (!nodep1 || nodep1->idx > 0)
1112 return 0;
1113
1114 /* Does the mask in the first node contain any cleared bits. */
1115 if (nodep1->mask != ~(mask_t) 0)
1116 return node_first_clear(nodep1, 0);
1117
1118 /*
1119 * All mask bits set in first node. If there isn't a second node
1120 * then the first cleared bit is the first bit after the bits
1121 * described by the first node.
1122 */
1123 nodep2 = node_next(s, nodep1);
1124 if (!nodep2) {
1125 /*
1126 * No second node. First cleared bit is first bit beyond
1127 * bits described by first node.
1128 */
1129 assert(nodep1->mask == ~(mask_t) 0);
1130 assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0);
1131 return nodep1->idx + MASK_BITS + nodep1->num_after;
1132 }
1133
1134 /*
1135 * There is a second node.
1136 * If it is not adjacent to the first node, then there is a gap
1137 * of cleared bits between the nodes, and the first cleared bit
1138 * is the first bit within the gap.
1139 */
1140 if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
1141 return nodep1->idx + MASK_BITS + nodep1->num_after;
1142
1143 /*
1144 * Second node is adjacent to the first node.
1145 * Because it is adjacent, its mask should be non-zero. If all
1146 * its mask bits are set, then with it being adjacent, it should
1147 * have had the mask bits moved into the num_after setting of the
1148 * previous node.
1149 */
1150 return node_first_clear(nodep2, 0);
1151}
1152
1153/* Returns index of next bit set within s after the index given by prev.
1154 * Returns 0 if there are no bits after prev that are set.
1155 */
1156sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
1157 sparsebit_idx_t prev)
1158{
1159 sparsebit_idx_t lowest_possible = prev + 1;
1160 sparsebit_idx_t start;
1161 struct node *nodep;
1162
1163 /* A bit after the highest index can't be set. */
1164 if (lowest_possible == 0)
1165 return 0;
1166
1167 /*
1168 * Find the leftmost 'candidate' overlapping or to the right
1169 * of lowest_possible.
1170 */
1171 struct node *candidate = NULL;
1172
1173 /* True iff lowest_possible is within candidate */
1174 bool contains = false;
1175
1176 /*
1177 * Find node that describes setting of bit at lowest_possible.
1178 * If such a node doesn't exist, find the node with the lowest
1179 * starting index that is > lowest_possible.
1180 */
1181 for (nodep = s->root; nodep;) {
1182 if ((nodep->idx + MASK_BITS + nodep->num_after - 1)
1183 >= lowest_possible) {
1184 candidate = nodep;
1185 if (candidate->idx <= lowest_possible) {
1186 contains = true;
1187 break;
1188 }
1189 nodep = nodep->left;
1190 } else {
1191 nodep = nodep->right;
1192 }
1193 }
1194 if (!candidate)
1195 return 0;
1196
1197 assert(candidate->mask != 0);
1198
1199 /* Does the candidate node describe the setting of lowest_possible? */
1200 if (!contains) {
1201 /*
1202 * Candidate doesn't describe setting of bit at lowest_possible.
1203 * Candidate points to the first node with a starting index
1204 * > lowest_possible.
1205 */
1206 assert(candidate->idx > lowest_possible);
1207
1208 return node_first_set(candidate, 0);
1209 }
1210
1211 /*
1212 * Candidate describes setting of bit at lowest_possible.
1213 * Note: although the node describes the setting of the bit
 1214	 * at lowest_possible, it's possible that its setting and the
 1215	 * setting of all later bits described by this node are 0.
1216 * For now, just handle the cases where this node describes
1217 * a bit at or after an index of lowest_possible that is set.
1218 */
1219 start = lowest_possible - candidate->idx;
1220
1221 if (start < MASK_BITS && candidate->mask >= (1 << start))
1222 return node_first_set(candidate, start);
1223
1224 if (candidate->num_after) {
1225 sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS;
1226
1227 return lowest_possible < first_num_after_idx
1228 ? first_num_after_idx : lowest_possible;
1229 }
1230
1231 /*
1232 * Although candidate node describes setting of bit at
1233 * the index of lowest_possible, all bits at that index and
 1234	 * later that are described by candidate are cleared.  With
1235 * this, the next bit is the first bit in the next node, if
1236 * such a node exists. If a next node doesn't exist, then
1237 * there is no next set bit.
1238 */
1239 candidate = node_next(s, candidate);
1240 if (!candidate)
1241 return 0;
1242
1243 return node_first_set(candidate, 0);
1244}
1245
1246/* Returns index of next bit cleared within s after the index given by prev.
1247 * Returns 0 if there are no bits after prev that are cleared.
1248 */
1249sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
1250 sparsebit_idx_t prev)
1251{
1252 sparsebit_idx_t lowest_possible = prev + 1;
1253 sparsebit_idx_t idx;
1254 struct node *nodep1, *nodep2;
1255
 1256	/* A bit after the highest index can't be cleared. */
1257 if (lowest_possible == 0)
1258 return 0;
1259
1260 /*
1261 * Does a node describing the setting of lowest_possible exist?
1262 * If not, the bit at lowest_possible is cleared.
1263 */
1264 nodep1 = node_find(s, lowest_possible);
1265 if (!nodep1)
1266 return lowest_possible;
1267
1268 /* Does a mask bit in node 1 describe the next cleared bit. */
1269 for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++)
1270 if (!(nodep1->mask & (1 << idx)))
1271 return nodep1->idx + idx;
1272
1273 /*
1274 * Next cleared bit is not described by node 1. If there
1275 * isn't a next node, then next cleared bit is described
1276 * by bit after the bits described by the first node.
1277 */
1278 nodep2 = node_next(s, nodep1);
1279 if (!nodep2)
1280 return nodep1->idx + MASK_BITS + nodep1->num_after;
1281
1282 /*
1283 * There is a second node.
1284 * If it is not adjacent to the first node, then there is a gap
1285 * of cleared bits between the nodes, and the next cleared bit
1286 * is the first bit within the gap.
1287 */
1288 if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
1289 return nodep1->idx + MASK_BITS + nodep1->num_after;
1290
1291 /*
1292 * Second node is adjacent to the first node.
1293 * Because it is adjacent, its mask should be non-zero. If all
1294 * its mask bits are set, then with it being adjacent, it should
1295 * have had the mask bits moved into the num_after setting of the
1296 * previous node.
1297 */
1298 return node_first_clear(nodep2, 0);
1299}
1300
1301/* Starting with the index 1 greater than the index given by start, finds
1302 * and returns the index of the first sequence of num consecutively set
 1303 * bits.  Returns a value of 0 if no such sequence exists.
1304 */
1305sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
1306 sparsebit_idx_t start, sparsebit_num_t num)
1307{
1308 sparsebit_idx_t idx;
1309
1310 assert(num >= 1);
1311
1312 for (idx = sparsebit_next_set(s, start);
1313 idx != 0 && idx + num - 1 >= idx;
1314 idx = sparsebit_next_set(s, idx)) {
1315 assert(sparsebit_is_set(s, idx));
1316
1317 /*
1318 * Does the sequence of bits starting at idx consist of
1319 * num set bits?
1320 */
1321 if (sparsebit_is_set_num(s, idx, num))
1322 return idx;
1323
1324 /*
1325 * Sequence of set bits at idx isn't large enough.
1326 * Skip this entire sequence of set bits.
1327 */
1328 idx = sparsebit_next_clear(s, idx);
1329 if (idx == 0)
1330 return 0;
1331 }
1332
1333 return 0;
1334}
1335
1336/* Starting with the index 1 greater than the index given by start, finds
1337 * and returns the index of the first sequence of num consecutively cleared
 1338 * bits.  Returns a value of 0 if no such sequence exists.
1339 */
1340sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s,
1341 sparsebit_idx_t start, sparsebit_num_t num)
1342{
1343 sparsebit_idx_t idx;
1344
1345 assert(num >= 1);
1346
1347 for (idx = sparsebit_next_clear(s, start);
1348 idx != 0 && idx + num - 1 >= idx;
1349 idx = sparsebit_next_clear(s, idx)) {
1350 assert(sparsebit_is_clear(s, idx));
1351
1352 /*
1353 * Does the sequence of bits starting at idx consist of
1354 * num cleared bits?
1355 */
1356 if (sparsebit_is_clear_num(s, idx, num))
1357 return idx;
1358
1359 /*
1360 * Sequence of cleared bits at idx isn't large enough.
1361 * Skip this entire sequence of cleared bits.
1362 */
1363 idx = sparsebit_next_set(s, idx);
1364 if (idx == 0)
1365 return 0;
1366 }
1367
1368 return 0;
1369}
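
A sketch of how the two search helpers above pair with the mutators that follow (the helper name sb_alloc_run and the parameter base are illustrative, not part of this file): find a run of num consecutive cleared bits after base, claim it, and return its first index, or 0 if no such run exists.

    static sparsebit_idx_t sb_alloc_run(struct sparsebit *s,
    	sparsebit_idx_t base, sparsebit_num_t num)
    {
    	/* The search starts at the index one greater than base. */
    	sparsebit_idx_t idx = sparsebit_next_clear_num(s, base, num);

    	if (idx != 0)
    		sparsebit_set_num(s, idx, num);	/* mark the run as used */

    	return idx;
    }
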
1370
1371/* Sets the bits in the inclusive range start through start + num - 1. */
1372void sparsebit_set_num(struct sparsebit *s,
1373 sparsebit_idx_t start, sparsebit_num_t num)
1374{
1375 struct node *nodep, *next;
1376 unsigned int n1;
1377 sparsebit_idx_t idx;
1378 sparsebit_num_t n;
1379 sparsebit_idx_t middle_start, middle_end;
1380
1381 assert(num > 0);
1382 assert(start + num - 1 >= start);
1383
1384 /*
1385 * Leading - bits before first mask boundary.
1386 *
1387 * TODO(lhuemill): With some effort it may be possible to
 1388 * replace the following loop with a straight-line sequence
1389 * of statements. High level sequence would be:
1390 *
1391 * 1. Use node_split() to force node that describes setting
1392 * of idx to be within the mask portion of a node.
1393 * 2. Form mask of bits to be set.
1394 * 3. Determine number of mask bits already set in the node
1395 * and store in a local variable named num_already_set.
1396 * 4. Set the appropriate mask bits within the node.
1397 * 5. Increment struct sparsebit_pvt num_set member
1398 * by the number of bits that were actually set.
1399 * Exclude from the counts bits that were already set.
1400 * 6. Before returning to the caller, use node_reduce() to
1401 * handle the multiple corner cases that this method
1402 * introduces.
1403 */
1404 for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
1405 bit_set(s, idx);
1406
1407 /* Middle - bits spanning one or more entire mask */
1408 middle_start = idx;
1409 middle_end = middle_start + (n & -MASK_BITS) - 1;
1410 if (n >= MASK_BITS) {
1411 nodep = node_split(s, middle_start);
1412
1413 /*
1414 * As needed, split just after end of middle bits.
1415 * No split needed if end of middle bits is at highest
1416 * supported bit index.
1417 */
1418 if (middle_end + 1 > middle_end)
1419 (void) node_split(s, middle_end + 1);
1420
1421 /* Delete nodes that only describe bits within the middle. */
1422 for (next = node_next(s, nodep);
1423 next && (next->idx < middle_end);
1424 next = node_next(s, nodep)) {
1425 assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
1426 node_rm(s, next);
1427 next = NULL;
1428 }
1429
1430 /* As needed set each of the mask bits */
1431 for (n1 = 0; n1 < MASK_BITS; n1++) {
1432 if (!(nodep->mask & (1 << n1))) {
1433 nodep->mask |= 1 << n1;
1434 s->num_set++;
1435 }
1436 }
1437
1438 s->num_set -= nodep->num_after;
1439 nodep->num_after = middle_end - middle_start + 1 - MASK_BITS;
1440 s->num_set += nodep->num_after;
1441
1442 node_reduce(s, nodep);
1443 }
1444 idx = middle_end + 1;
1445 n -= middle_end - middle_start + 1;
1446
1447 /* Trailing - bits at and beyond last mask boundary */
1448 assert(n < MASK_BITS);
1449 for (; n > 0; idx++, n--)
1450 bit_set(s, idx);
1451}
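
A worked illustration of the leading/middle/trailing split performed above, assuming a 32-bit mask_t so that MASK_BITS == 32 (the call itself is hypothetical):

    sparsebit_set_num(s, 5, 100);
    	/* leading:  bits 5..31 set one at a time (27 bits, up to the
    	 *           first mask boundary at index 32)
    	 * middle:   bits 32..95 folded into a single node's mask and
    	 *           num_after (64 bits, i.e. n & -MASK_BITS)
    	 * trailing: bits 96..104 set one at a time (the remaining 9 bits)
    	 */
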
1452
1453/* Clears the bits in the inclusive range start through start + num - 1. */
1454void sparsebit_clear_num(struct sparsebit *s,
1455 sparsebit_idx_t start, sparsebit_num_t num)
1456{
1457 struct node *nodep, *next;
1458 unsigned int n1;
1459 sparsebit_idx_t idx;
1460 sparsebit_num_t n;
1461 sparsebit_idx_t middle_start, middle_end;
1462
1463 assert(num > 0);
1464 assert(start + num - 1 >= start);
1465
1466 /* Leading - bits before first mask boundary */
1467 for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
1468 bit_clear(s, idx);
1469
1470 /* Middle - bits spanning one or more entire mask */
1471 middle_start = idx;
1472 middle_end = middle_start + (n & -MASK_BITS) - 1;
1473 if (n >= MASK_BITS) {
1474 nodep = node_split(s, middle_start);
1475
1476 /*
1477 * As needed, split just after end of middle bits.
1478 * No split needed if end of middle bits is at highest
1479 * supported bit index.
1480 */
1481 if (middle_end + 1 > middle_end)
1482 (void) node_split(s, middle_end + 1);
1483
1484 /* Delete nodes that only describe bits within the middle. */
1485 for (next = node_next(s, nodep);
1486 next && (next->idx < middle_end);
1487 next = node_next(s, nodep)) {
1488 assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
1489 node_rm(s, next);
1490 next = NULL;
1491 }
1492
1493 /* As needed clear each of the mask bits */
1494 for (n1 = 0; n1 < MASK_BITS; n1++) {
1495 if (nodep->mask & (1 << n1)) {
1496 nodep->mask &= ~(1 << n1);
1497 s->num_set--;
1498 }
1499 }
1500
1501 /* Clear any bits described by num_after */
1502 s->num_set -= nodep->num_after;
1503 nodep->num_after = 0;
1504
1505 /*
1506 * Delete the node that describes the beginning of
1507 * the middle bits and perform any allowed reductions
1508 * with the nodes prev or next of nodep.
1509 */
1510 node_reduce(s, nodep);
1511 nodep = NULL;
1512 }
1513 idx = middle_end + 1;
1514 n -= middle_end - middle_start + 1;
1515
1516 /* Trailing - bits at and beyond last mask boundary */
1517 assert(n < MASK_BITS);
1518 for (; n > 0; idx++, n--)
1519 bit_clear(s, idx);
1520}
1521
1522/* Sets the bit at the index given by idx. */
1523void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx)
1524{
1525 sparsebit_set_num(s, idx, 1);
1526}
1527
1528/* Clears the bit at the index given by idx. */
1529void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx)
1530{
1531 sparsebit_clear_num(s, idx, 1);
1532}
1533
1534/* Sets the bits in the entire addressable range of the sparsebit array. */
1535void sparsebit_set_all(struct sparsebit *s)
1536{
1537 sparsebit_set(s, 0);
1538 sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0);
1539 assert(sparsebit_all_set(s));
1540}
1541
1542/* Clears the bits in the entire addressable range of the sparsebit array. */
1543void sparsebit_clear_all(struct sparsebit *s)
1544{
1545 sparsebit_clear(s, 0);
1546 sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0);
1547 assert(!sparsebit_any_set(s));
1548}
1549
1550static size_t display_range(FILE *stream, sparsebit_idx_t low,
1551 sparsebit_idx_t high, bool prepend_comma_space)
1552{
1553 char *fmt_str;
1554 size_t sz;
1555
1556 /* Determine the printf format string */
1557 if (low == high)
1558 fmt_str = prepend_comma_space ? ", 0x%lx" : "0x%lx";
1559 else
1560 fmt_str = prepend_comma_space ? ", 0x%lx:0x%lx" : "0x%lx:0x%lx";
1561
1562 /*
1563 * When stream is NULL, just determine the size of what would
1564 * have been printed, else print the range.
1565 */
1566 if (!stream)
1567 sz = snprintf(NULL, 0, fmt_str, low, high);
1568 else
1569 sz = fprintf(stream, fmt_str, low, high);
1570
1571 return sz;
1572}
1573
1574
1575/* Dumps to the FILE stream given by stream, the bit settings
1576 * of s. Each line of output is prefixed with the number of
1577 * spaces given by indent. The length of each line is implementation
1578 * dependent and does not depend on the indent amount. The following
1579 * is an example output of a sparsebit array that has bits:
1580 *
1581 * 0x5, 0x8, 0xa:0xe, 0x12
1582 *
1583 * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18
1584 * are set. Note that a ':', instead of a '-' is used to specify a range of
1585 * contiguous bits. This is done because '-' is used to specify command-line
1586 * options, and sometimes ranges are specified as command-line arguments.
1587 */
1588void sparsebit_dump(FILE *stream, struct sparsebit *s,
1589 unsigned int indent)
1590{
1591 size_t current_line_len = 0;
1592 size_t sz;
1593 struct node *nodep;
1594
1595 if (!sparsebit_any_set(s))
1596 return;
1597
1598 /* Display initial indent */
1599 fprintf(stream, "%*s", indent, "");
1600
1601 /* For each node */
1602 for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) {
1603 unsigned int n1;
1604 sparsebit_idx_t low, high;
1605
1606 /* For each group of bits in the mask */
1607 for (n1 = 0; n1 < MASK_BITS; n1++) {
1608 if (nodep->mask & (1 << n1)) {
1609 low = high = nodep->idx + n1;
1610
1611 for (; n1 < MASK_BITS; n1++) {
1612 if (nodep->mask & (1 << n1))
1613 high = nodep->idx + n1;
1614 else
1615 break;
1616 }
1617
1618 if ((n1 == MASK_BITS) && nodep->num_after)
1619 high += nodep->num_after;
1620
1621 /*
1622 * How much room will it take to display
1623 * this range.
1624 */
1625 sz = display_range(NULL, low, high,
1626 current_line_len != 0);
1627
1628 /*
1629 * If there is not enough room, display
1630 * a newline plus the indent of the next
1631 * line.
1632 */
1633 if (current_line_len + sz > DUMP_LINE_MAX) {
1634 fputs("\n", stream);
1635 fprintf(stream, "%*s", indent, "");
1636 current_line_len = 0;
1637 }
1638
1639 /* Display the range */
1640 sz = display_range(stream, low, high,
1641 current_line_len != 0);
1642 current_line_len += sz;
1643 }
1644 }
1645
1646 /*
 1647	 * If num_after is non-zero and the most significant bit of the
 1648	 * mask is not set, then we still need to display a range for the bits
1649 * described by num_after.
1650 */
1651 if (!(nodep->mask & (1 << (MASK_BITS - 1))) && nodep->num_after) {
1652 low = nodep->idx + MASK_BITS;
1653 high = nodep->idx + MASK_BITS + nodep->num_after - 1;
1654
1655 /*
1656 * How much room will it take to display
1657 * this range.
1658 */
1659 sz = display_range(NULL, low, high,
1660 current_line_len != 0);
1661
1662 /*
1663 * If there is not enough room, display
1664 * a newline plus the indent of the next
1665 * line.
1666 */
1667 if (current_line_len + sz > DUMP_LINE_MAX) {
1668 fputs("\n", stream);
1669 fprintf(stream, "%*s", indent, "");
1670 current_line_len = 0;
1671 }
1672
1673 /* Display the range */
1674 sz = display_range(stream, low, high,
1675 current_line_len != 0);
1676 current_line_len += sz;
1677 }
1678 }
1679 fputs("\n", stream);
1680}
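
A sketch that reproduces the example output quoted in the comment above (sparsebit_alloc() is the array constructor used elsewhere in this file; stdout and the indent of 4 are arbitrary choices):

    static void dump_example(void)
    {
    	struct sparsebit *s = sparsebit_alloc();

    	sparsebit_set(s, 0x5);
    	sparsebit_set(s, 0x8);
    	sparsebit_set_num(s, 0xa, 5);	/* sets 0xa through 0xe */
    	sparsebit_set(s, 0x12);
    	sparsebit_dump(stdout, s, 4);	/* "    0x5, 0x8, 0xa:0xe, 0x12" */
    }
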
1681
1682/* Validates the internal state of the sparsebit array given by
1683 * s. On error, diagnostic information is printed to stderr and
1684 * abort is called.
1685 */
1686void sparsebit_validate_internal(struct sparsebit *s)
1687{
1688 bool error_detected = false;
1689 struct node *nodep, *prev = NULL;
1690 sparsebit_num_t total_bits_set = 0;
1691 unsigned int n1;
1692
1693 /* For each node */
1694 for (nodep = node_first(s); nodep;
1695 prev = nodep, nodep = node_next(s, nodep)) {
1696
1697 /*
1698 * Increase total bits set by the number of bits set
1699 * in this node.
1700 */
1701 for (n1 = 0; n1 < MASK_BITS; n1++)
1702 if (nodep->mask & (1 << n1))
1703 total_bits_set++;
1704
1705 total_bits_set += nodep->num_after;
1706
1707 /*
1708 * Arbitrary choice as to whether a mask of 0 is allowed
1709 * or not. For diagnostic purposes it is beneficial to
1710 * have only one valid means to represent a set of bits.
1711 * To support this an arbitrary choice has been made
1712 * to not allow a mask of zero.
1713 */
1714 if (nodep->mask == 0) {
1715 fprintf(stderr, "Node mask of zero, "
1716 "nodep: %p nodep->mask: 0x%x",
1717 nodep, nodep->mask);
1718 error_detected = true;
1719 break;
1720 }
1721
1722 /*
1723 * Validate num_after is not greater than the max index
1724 * - the number of mask bits. The num_after member
1725 * uses 0-based indexing and thus has no value that
1726 * represents all bits set. This limitation is handled
1727 * by requiring a non-zero mask. With a non-zero mask,
1728 * MASK_BITS worth of bits are described by the mask,
1729 * which makes the largest needed num_after equal to:
1730 *
1731 * (~(sparsebit_num_t) 0) - MASK_BITS + 1
1732 */
1733 if (nodep->num_after
1734 > (~(sparsebit_num_t) 0) - MASK_BITS + 1) {
1735 fprintf(stderr, "num_after too large, "
1736 "nodep: %p nodep->num_after: 0x%lx",
1737 nodep, nodep->num_after);
1738 error_detected = true;
1739 break;
1740 }
1741
1742 /* Validate node index is divisible by the mask size */
1743 if (nodep->idx % MASK_BITS) {
 1744			fprintf(stderr, "Node index not divisible by "
1745 "mask size,\n"
1746 " nodep: %p nodep->idx: 0x%lx "
1747 "MASK_BITS: %lu\n",
1748 nodep, nodep->idx, MASK_BITS);
1749 error_detected = true;
1750 break;
1751 }
1752
1753 /*
1754 * Validate bits described by node don't wrap beyond the
1755 * highest supported index.
1756 */
1757 if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) {
1758 fprintf(stderr, "Bits described by node wrap "
1759 "beyond highest supported index,\n"
1760 " nodep: %p nodep->idx: 0x%lx\n"
1761 " MASK_BITS: %lu nodep->num_after: 0x%lx",
1762 nodep, nodep->idx, MASK_BITS, nodep->num_after);
1763 error_detected = true;
1764 break;
1765 }
1766
1767 /* Check parent pointers. */
1768 if (nodep->left) {
1769 if (nodep->left->parent != nodep) {
1770 fprintf(stderr, "Left child parent pointer "
1771 "doesn't point to this node,\n"
1772 " nodep: %p nodep->left: %p "
1773 "nodep->left->parent: %p",
1774 nodep, nodep->left,
1775 nodep->left->parent);
1776 error_detected = true;
1777 break;
1778 }
1779 }
1780
1781 if (nodep->right) {
1782 if (nodep->right->parent != nodep) {
1783 fprintf(stderr, "Right child parent pointer "
1784 "doesn't point to this node,\n"
1785 " nodep: %p nodep->right: %p "
1786 "nodep->right->parent: %p",
1787 nodep, nodep->right,
1788 nodep->right->parent);
1789 error_detected = true;
1790 break;
1791 }
1792 }
1793
1794 if (!nodep->parent) {
1795 if (s->root != nodep) {
1796 fprintf(stderr, "Unexpected root node, "
1797 "s->root: %p nodep: %p",
1798 s->root, nodep);
1799 error_detected = true;
1800 break;
1801 }
1802 }
1803
1804 if (prev) {
1805 /*
1806 * Is index of previous node before index of
1807 * current node?
1808 */
1809 if (prev->idx >= nodep->idx) {
1810 fprintf(stderr, "Previous node index "
1811 ">= current node index,\n"
1812 " prev: %p prev->idx: 0x%lx\n"
1813 " nodep: %p nodep->idx: 0x%lx",
1814 prev, prev->idx, nodep, nodep->idx);
1815 error_detected = true;
1816 break;
1817 }
1818
1819 /*
 1820			 * Nodes occur in ascending order, based on each
 1821			 * node's starting index.
1822 */
1823 if ((prev->idx + MASK_BITS + prev->num_after - 1)
1824 >= nodep->idx) {
1825 fprintf(stderr, "Previous node bit range "
1826 "overlap with current node bit range,\n"
1827 " prev: %p prev->idx: 0x%lx "
1828 "prev->num_after: 0x%lx\n"
1829 " nodep: %p nodep->idx: 0x%lx "
1830 "nodep->num_after: 0x%lx\n"
1831 " MASK_BITS: %lu",
1832 prev, prev->idx, prev->num_after,
1833 nodep, nodep->idx, nodep->num_after,
1834 MASK_BITS);
1835 error_detected = true;
1836 break;
1837 }
1838
1839 /*
1840 * When the node has all mask bits set, it shouldn't
1841 * be adjacent to the last bit described by the
1842 * previous node.
1843 */
1844 if (nodep->mask == ~(mask_t) 0 &&
1845 prev->idx + MASK_BITS + prev->num_after == nodep->idx) {
1846 fprintf(stderr, "Current node has mask with "
1847 "all bits set and is adjacent to the "
1848 "previous node,\n"
1849 " prev: %p prev->idx: 0x%lx "
1850 "prev->num_after: 0x%lx\n"
1851 " nodep: %p nodep->idx: 0x%lx "
1852 "nodep->num_after: 0x%lx\n"
1853 " MASK_BITS: %lu",
1854 prev, prev->idx, prev->num_after,
1855 nodep, nodep->idx, nodep->num_after,
1856 MASK_BITS);
1857
1858 error_detected = true;
1859 break;
1860 }
1861 }
1862 }
1863
1864 if (!error_detected) {
1865 /*
1866 * Is sum of bits set in each node equal to the count
1867 * of total bits set.
1868 */
1869 if (s->num_set != total_bits_set) {
 1870			fprintf(stderr, "Number of bits set mismatch,\n"
1871 " s->num_set: 0x%lx total_bits_set: 0x%lx",
1872 s->num_set, total_bits_set);
1873
1874 error_detected = true;
1875 }
1876 }
1877
1878 if (error_detected) {
1879 fputs(" dump_internal:\n", stderr);
1880 sparsebit_dump_internal(stderr, s, 4);
1881 abort();
1882 }
1883}
1884
1885
1886#ifdef FUZZ
1887/* A simple but effective fuzzing driver. Look for bugs with the help
1888 * of some invariants and of a trivial representation of sparsebit.
1889 * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let
1890 * afl-fuzz do the magic. :)
1891 */
1892
1893#include <stdlib.h>
1894#include <assert.h>
1895
1896struct range {
1897 sparsebit_idx_t first, last;
1898 bool set;
1899};
1900
1901struct sparsebit *s;
1902struct range ranges[1000];
1903int num_ranges;
1904
1905static bool get_value(sparsebit_idx_t idx)
1906{
1907 int i;
1908
1909 for (i = num_ranges; --i >= 0; )
1910 if (ranges[i].first <= idx && idx <= ranges[i].last)
1911 return ranges[i].set;
1912
1913 return false;
1914}
1915
1916static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last)
1917{
1918 sparsebit_num_t num;
1919 sparsebit_idx_t next;
1920
1921 if (first < last) {
1922 num = last - first + 1;
1923 } else {
1924 num = first - last + 1;
1925 first = last;
1926 last = first + num - 1;
1927 }
1928
1929 switch (code) {
1930 case 0:
1931 sparsebit_set(s, first);
1932 assert(sparsebit_is_set(s, first));
1933 assert(!sparsebit_is_clear(s, first));
1934 assert(sparsebit_any_set(s));
1935 assert(!sparsebit_all_clear(s));
1936 if (get_value(first))
1937 return;
1938 if (num_ranges == 1000)
1939 exit(0);
1940 ranges[num_ranges++] = (struct range)
1941 { .first = first, .last = first, .set = true };
1942 break;
1943 case 1:
1944 sparsebit_clear(s, first);
1945 assert(!sparsebit_is_set(s, first));
1946 assert(sparsebit_is_clear(s, first));
1947 assert(sparsebit_any_clear(s));
1948 assert(!sparsebit_all_set(s));
1949 if (!get_value(first))
1950 return;
1951 if (num_ranges == 1000)
1952 exit(0);
1953 ranges[num_ranges++] = (struct range)
1954 { .first = first, .last = first, .set = false };
1955 break;
1956 case 2:
1957 assert(sparsebit_is_set(s, first) == get_value(first));
1958 assert(sparsebit_is_clear(s, first) == !get_value(first));
1959 break;
1960 case 3:
1961 if (sparsebit_any_set(s))
1962 assert(get_value(sparsebit_first_set(s)));
1963 if (sparsebit_any_clear(s))
1964 assert(!get_value(sparsebit_first_clear(s)));
1965 sparsebit_set_all(s);
1966 assert(!sparsebit_any_clear(s));
1967 assert(sparsebit_all_set(s));
1968 num_ranges = 0;
1969 ranges[num_ranges++] = (struct range)
1970 { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true };
1971 break;
1972 case 4:
1973 if (sparsebit_any_set(s))
1974 assert(get_value(sparsebit_first_set(s)));
1975 if (sparsebit_any_clear(s))
1976 assert(!get_value(sparsebit_first_clear(s)));
1977 sparsebit_clear_all(s);
1978 assert(!sparsebit_any_set(s));
1979 assert(sparsebit_all_clear(s));
1980 num_ranges = 0;
1981 break;
1982 case 5:
1983 next = sparsebit_next_set(s, first);
1984 assert(next == 0 || next > first);
1985 assert(next == 0 || get_value(next));
1986 break;
1987 case 6:
1988 next = sparsebit_next_clear(s, first);
1989 assert(next == 0 || next > first);
1990 assert(next == 0 || !get_value(next));
1991 break;
1992 case 7:
1993 next = sparsebit_next_clear(s, first);
1994 if (sparsebit_is_set_num(s, first, num)) {
1995 assert(next == 0 || next > last);
1996 if (first)
1997 next = sparsebit_next_set(s, first - 1);
1998 else if (sparsebit_any_set(s))
1999 next = sparsebit_first_set(s);
2000 else
2001 return;
2002 assert(next == first);
2003 } else {
2004 assert(sparsebit_is_clear(s, first) || next <= last);
2005 }
2006 break;
2007 case 8:
2008 next = sparsebit_next_set(s, first);
2009 if (sparsebit_is_clear_num(s, first, num)) {
2010 assert(next == 0 || next > last);
2011 if (first)
2012 next = sparsebit_next_clear(s, first - 1);
2013 else if (sparsebit_any_clear(s))
2014 next = sparsebit_first_clear(s);
2015 else
2016 return;
2017 assert(next == first);
2018 } else {
2019 assert(sparsebit_is_set(s, first) || next <= last);
2020 }
2021 break;
2022 case 9:
2023 sparsebit_set_num(s, first, num);
2024 assert(sparsebit_is_set_num(s, first, num));
2025 assert(!sparsebit_is_clear_num(s, first, num));
2026 assert(sparsebit_any_set(s));
2027 assert(!sparsebit_all_clear(s));
2028 if (num_ranges == 1000)
2029 exit(0);
2030 ranges[num_ranges++] = (struct range)
2031 { .first = first, .last = last, .set = true };
2032 break;
2033 case 10:
2034 sparsebit_clear_num(s, first, num);
2035 assert(!sparsebit_is_set_num(s, first, num));
2036 assert(sparsebit_is_clear_num(s, first, num));
2037 assert(sparsebit_any_clear(s));
2038 assert(!sparsebit_all_set(s));
2039 if (num_ranges == 1000)
2040 exit(0);
2041 ranges[num_ranges++] = (struct range)
2042 { .first = first, .last = last, .set = false };
2043 break;
2044 case 11:
2045 sparsebit_validate_internal(s);
2046 break;
2047 default:
2048 break;
2049 }
2050}
2051
2052unsigned char get8(void)
2053{
2054 int ch;
2055
2056 ch = getchar();
2057 if (ch == EOF)
2058 exit(0);
2059 return ch;
2060}
2061
2062uint64_t get64(void)
2063{
2064 uint64_t x;
2065
2066 x = get8();
2067 x = (x << 8) | get8();
2068 x = (x << 8) | get8();
2069 x = (x << 8) | get8();
2070 x = (x << 8) | get8();
2071 x = (x << 8) | get8();
2072 x = (x << 8) | get8();
2073 return (x << 8) | get8();
2074}
2075
2076int main(void)
2077{
2078 s = sparsebit_alloc();
2079 for (;;) {
2080 uint8_t op = get8() & 0xf;
2081 uint64_t first = get64();
2082 uint64_t last = get64();
2083
2084 operate(op, first, last);
2085 }
2086}
2087#endif
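
The fuzz driver above consumes its work from stdin: each iteration of main() reads one opcode byte (only the low four bits are used) followed by two big-endian 64-bit operands, i.e. 17 bytes per record. A sketch of a record generator for hand-crafted seed inputs (emit_record is a hypothetical helper, not part of the file):

    #include <stdint.h>
    #include <stdio.h>

    static void emit_record(FILE *out, uint8_t op, uint64_t first, uint64_t last)
    {
    	int shift;

    	fputc(op, out);
    	/* get64() rebuilds each operand most-significant byte first. */
    	for (shift = 56; shift >= 0; shift -= 8)
    		fputc((first >> shift) & 0xff, out);
    	for (shift = 56; shift >= 0; shift -= 8)
    		fputc((last >> shift) & 0xff, out);
    }
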
diff --git a/tools/testing/selftests/kvm/lib/x86.c b/tools/testing/selftests/kvm/lib/x86.c
new file mode 100644
index 000000000000..2f17675f4275
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86.c
@@ -0,0 +1,700 @@
1/*
2 * tools/testing/selftests/kvm/lib/x86.c
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 */
8
9#define _GNU_SOURCE /* for program_invocation_name */
10
11#include "test_util.h"
12#include "kvm_util.h"
13#include "kvm_util_internal.h"
14#include "x86.h"
15
16/* Minimum physical address used for virtual translation tables. */
17#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
18
19/* Virtual translation table structure declarations */
20struct pageMapL4Entry {
21 uint64_t present:1;
22 uint64_t writable:1;
23 uint64_t user:1;
24 uint64_t write_through:1;
25 uint64_t cache_disable:1;
26 uint64_t accessed:1;
27 uint64_t ignored_06:1;
28 uint64_t page_size:1;
29 uint64_t ignored_11_08:4;
30 uint64_t address:40;
31 uint64_t ignored_62_52:11;
32 uint64_t execute_disable:1;
33};
34
35struct pageDirectoryPointerEntry {
36 uint64_t present:1;
37 uint64_t writable:1;
38 uint64_t user:1;
39 uint64_t write_through:1;
40 uint64_t cache_disable:1;
41 uint64_t accessed:1;
42 uint64_t ignored_06:1;
43 uint64_t page_size:1;
44 uint64_t ignored_11_08:4;
45 uint64_t address:40;
46 uint64_t ignored_62_52:11;
47 uint64_t execute_disable:1;
48};
49
50struct pageDirectoryEntry {
51 uint64_t present:1;
52 uint64_t writable:1;
53 uint64_t user:1;
54 uint64_t write_through:1;
55 uint64_t cache_disable:1;
56 uint64_t accessed:1;
57 uint64_t ignored_06:1;
58 uint64_t page_size:1;
59 uint64_t ignored_11_08:4;
60 uint64_t address:40;
61 uint64_t ignored_62_52:11;
62 uint64_t execute_disable:1;
63};
64
65struct pageTableEntry {
66 uint64_t present:1;
67 uint64_t writable:1;
68 uint64_t user:1;
69 uint64_t write_through:1;
70 uint64_t cache_disable:1;
71 uint64_t accessed:1;
72 uint64_t dirty:1;
73 uint64_t reserved_07:1;
74 uint64_t global:1;
75 uint64_t ignored_11_09:3;
76 uint64_t address:40;
77 uint64_t ignored_62_52:11;
78 uint64_t execute_disable:1;
79};
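
Each of the four structures above is intended to overlay a single 64-bit table entry; a compile-time check of that packing assumption (illustrative, and it relies on the x86-64 ABI packing these bit-fields into one 8-byte word):

    _Static_assert(sizeof(struct pageMapL4Entry) == 8, "PML4 entry must be 64 bits");
    _Static_assert(sizeof(struct pageDirectoryPointerEntry) == 8, "PDPTE must be 64 bits");
    _Static_assert(sizeof(struct pageDirectoryEntry) == 8, "PDE must be 64 bits");
    _Static_assert(sizeof(struct pageTableEntry) == 8, "PTE must be 64 bits");
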
80
81/* Register Dump
82 *
83 * Input Args:
84 * indent - Left margin indent amount
 85 * regs - Register state to dump
86 *
87 * Output Args:
88 * stream - Output FILE stream
89 *
90 * Return: None
91 *
92 * Dumps the state of the registers given by regs, to the FILE stream
 93 * given by stream.
94 */
95void regs_dump(FILE *stream, struct kvm_regs *regs,
96 uint8_t indent)
97{
98 fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
99 "rcx: 0x%.16llx rdx: 0x%.16llx\n",
100 indent, "",
101 regs->rax, regs->rbx, regs->rcx, regs->rdx);
102 fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
103 "rsp: 0x%.16llx rbp: 0x%.16llx\n",
104 indent, "",
105 regs->rsi, regs->rdi, regs->rsp, regs->rbp);
106 fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
107 "r10: 0x%.16llx r11: 0x%.16llx\n",
108 indent, "",
109 regs->r8, regs->r9, regs->r10, regs->r11);
110 fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
111 "r14: 0x%.16llx r15: 0x%.16llx\n",
112 indent, "",
113 regs->r12, regs->r13, regs->r14, regs->r15);
114 fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
115 indent, "",
116 regs->rip, regs->rflags);
117}
118
119/* Segment Dump
120 *
121 * Input Args:
122 * indent - Left margin indent amount
123 * segment - KVM segment
124 *
125 * Output Args:
126 * stream - Output FILE stream
127 *
128 * Return: None
129 *
130 * Dumps the state of the KVM segment given by segment, to the FILE stream
 131 * given by stream.
132 */
133static void segment_dump(FILE *stream, struct kvm_segment *segment,
134 uint8_t indent)
135{
136 fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
137 "selector: 0x%.4x type: 0x%.2x\n",
138 indent, "", segment->base, segment->limit,
139 segment->selector, segment->type);
140 fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
141 "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
142 indent, "", segment->present, segment->dpl,
143 segment->db, segment->s, segment->l);
144 fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
145 "unusable: 0x%.2x padding: 0x%.2x\n",
146 indent, "", segment->g, segment->avl,
147 segment->unusable, segment->padding);
148}
149
150/* dtable Dump
151 *
152 * Input Args:
153 * indent - Left margin indent amount
154 * dtable - KVM dtable
155 *
156 * Output Args:
157 * stream - Output FILE stream
158 *
159 * Return: None
160 *
161 * Dumps the state of the KVM dtable given by dtable, to the FILE stream
 162 * given by stream.
163 */
164static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
165 uint8_t indent)
166{
167 fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
168 "padding: 0x%.4x 0x%.4x 0x%.4x\n",
169 indent, "", dtable->base, dtable->limit,
170 dtable->padding[0], dtable->padding[1], dtable->padding[2]);
171}
172
173/* System Register Dump
174 *
175 * Input Args:
176 * indent - Left margin indent amount
177 * sregs - System registers
178 *
179 * Output Args:
180 * stream - Output FILE stream
181 *
182 * Return: None
183 *
184 * Dumps the state of the system registers given by sregs, to the FILE stream
 185 * given by stream.
186 */
187void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
188 uint8_t indent)
189{
190 unsigned int i;
191
192 fprintf(stream, "%*scs:\n", indent, "");
193 segment_dump(stream, &sregs->cs, indent + 2);
194 fprintf(stream, "%*sds:\n", indent, "");
195 segment_dump(stream, &sregs->ds, indent + 2);
196 fprintf(stream, "%*ses:\n", indent, "");
197 segment_dump(stream, &sregs->es, indent + 2);
198 fprintf(stream, "%*sfs:\n", indent, "");
199 segment_dump(stream, &sregs->fs, indent + 2);
200 fprintf(stream, "%*sgs:\n", indent, "");
201 segment_dump(stream, &sregs->gs, indent + 2);
202 fprintf(stream, "%*sss:\n", indent, "");
203 segment_dump(stream, &sregs->ss, indent + 2);
204 fprintf(stream, "%*str:\n", indent, "");
205 segment_dump(stream, &sregs->tr, indent + 2);
206 fprintf(stream, "%*sldt:\n", indent, "");
207 segment_dump(stream, &sregs->ldt, indent + 2);
208
209 fprintf(stream, "%*sgdt:\n", indent, "");
210 dtable_dump(stream, &sregs->gdt, indent + 2);
211 fprintf(stream, "%*sidt:\n", indent, "");
212 dtable_dump(stream, &sregs->idt, indent + 2);
213
214 fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
215 "cr3: 0x%.16llx cr4: 0x%.16llx\n",
216 indent, "",
217 sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
218 fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
219 "apic_base: 0x%.16llx\n",
220 indent, "",
221 sregs->cr8, sregs->efer, sregs->apic_base);
222
223 fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
224 for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
225 fprintf(stream, "%*s%.16llx\n", indent + 2, "",
226 sregs->interrupt_bitmap[i]);
227 }
228}
229
230void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
231{
232 int rc;
233
234 TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
235 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
236
237 /* If needed, create page map l4 table. */
238 if (!vm->pgd_created) {
239 vm_paddr_t paddr = vm_phy_page_alloc(vm,
240 KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
241 vm->pgd = paddr;
242
243 /* Set pointer to pgd tables in all the VCPUs that
244 * have already been created. Future VCPUs will have
245 * the value set as each one is created.
246 */
247 for (struct vcpu *vcpu = vm->vcpu_head; vcpu;
248 vcpu = vcpu->next) {
249 struct kvm_sregs sregs;
250
251 /* Obtain the current system register settings */
252 vcpu_sregs_get(vm, vcpu->id, &sregs);
253
254 /* Set and store the pointer to the start of the
255 * pgd tables.
256 */
257 sregs.cr3 = vm->pgd;
258 vcpu_sregs_set(vm, vcpu->id, &sregs);
259 }
260
261 vm->pgd_created = true;
262 }
263}
264
265/* VM Virtual Page Map
266 *
267 * Input Args:
268 * vm - Virtual Machine
269 * vaddr - VM Virtual Address
270 * paddr - VM Physical Address
271 * pgd_memslot - Memory region slot for new virtual translation tables
272 *
273 * Output Args: None
274 *
275 * Return: None
276 *
277 * Within the VM given by vm, creates a virtual translation for the page
278 * starting at vaddr to the page starting at paddr.
279 */
280void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
281 uint32_t pgd_memslot)
282{
283 uint16_t index[4];
284 struct pageMapL4Entry *pml4e;
285
286 TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
287 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
288
289 TEST_ASSERT((vaddr % vm->page_size) == 0,
290 "Virtual address not on page boundary,\n"
291 " vaddr: 0x%lx vm->page_size: 0x%x",
292 vaddr, vm->page_size);
293 TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
294 (vaddr >> vm->page_shift)),
295 "Invalid virtual address, vaddr: 0x%lx",
296 vaddr);
297 TEST_ASSERT((paddr % vm->page_size) == 0,
298 "Physical address not on page boundary,\n"
299 " paddr: 0x%lx vm->page_size: 0x%x",
300 paddr, vm->page_size);
301 TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
 302		"Physical address beyond maximum supported,\n"
303 " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
304 paddr, vm->max_gfn, vm->page_size);
305
306 index[0] = (vaddr >> 12) & 0x1ffu;
307 index[1] = (vaddr >> 21) & 0x1ffu;
308 index[2] = (vaddr >> 30) & 0x1ffu;
309 index[3] = (vaddr >> 39) & 0x1ffu;
310
311 /* Allocate page directory pointer table if not present. */
312 pml4e = addr_gpa2hva(vm, vm->pgd);
313 if (!pml4e[index[3]].present) {
314 pml4e[index[3]].address = vm_phy_page_alloc(vm,
315 KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
316 >> vm->page_shift;
317 pml4e[index[3]].writable = true;
318 pml4e[index[3]].present = true;
319 }
320
321 /* Allocate page directory table if not present. */
322 struct pageDirectoryPointerEntry *pdpe;
323 pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
324 if (!pdpe[index[2]].present) {
325 pdpe[index[2]].address = vm_phy_page_alloc(vm,
326 KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
327 >> vm->page_shift;
328 pdpe[index[2]].writable = true;
329 pdpe[index[2]].present = true;
330 }
331
332 /* Allocate page table if not present. */
333 struct pageDirectoryEntry *pde;
334 pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
335 if (!pde[index[1]].present) {
336 pde[index[1]].address = vm_phy_page_alloc(vm,
337 KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
338 >> vm->page_shift;
339 pde[index[1]].writable = true;
340 pde[index[1]].present = true;
341 }
342
343 /* Fill in page table entry. */
344 struct pageTableEntry *pte;
345 pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
346 pte[index[0]].address = paddr >> vm->page_shift;
347 pte[index[0]].writable = true;
348 pte[index[0]].present = 1;
349}
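
A minimal usage sketch for the mapping routine above (memslot 0 and the guest addresses are arbitrary; it assumes 0x400000 is a valid, page-aligned guest virtual address for VM_MODE_FLAT48PG):

    static void map_one_page(struct kvm_vm *vm)
    {
    	/* Back one guest page with freshly allocated guest physical memory. */
    	vm_paddr_t paddr = vm_phy_page_alloc(vm,
    		KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0);

    	virt_pg_map(vm, 0x400000, paddr, 0);
    }
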
350
351/* Virtual Translation Tables Dump
352 *
353 * Input Args:
354 * vm - Virtual Machine
355 * indent - Left margin indent amount
356 *
357 * Output Args:
358 * stream - Output FILE stream
359 *
360 * Return: None
361 *
362 * Dumps to the FILE stream given by stream, the contents of all the
363 * virtual translation tables for the VM given by vm.
364 */
365void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
366{
367 struct pageMapL4Entry *pml4e, *pml4e_start;
368 struct pageDirectoryPointerEntry *pdpe, *pdpe_start;
369 struct pageDirectoryEntry *pde, *pde_start;
370 struct pageTableEntry *pte, *pte_start;
371
372 if (!vm->pgd_created)
373 return;
374
375 fprintf(stream, "%*s "
376 " no\n", indent, "");
377 fprintf(stream, "%*s index hvaddr gpaddr "
378 "addr w exec dirty\n",
379 indent, "");
380 pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm,
381 vm->pgd);
382 for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
383 pml4e = &pml4e_start[n1];
384 if (!pml4e->present)
385 continue;
386 fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
387 " %u\n",
388 indent, "",
389 pml4e - pml4e_start, pml4e,
390 addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address,
391 pml4e->writable, pml4e->execute_disable);
392
393 pdpe_start = addr_gpa2hva(vm, pml4e->address
394 * vm->page_size);
395 for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
396 pdpe = &pdpe_start[n2];
397 if (!pdpe->present)
398 continue;
399 fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
400 "%u %u\n",
401 indent, "",
402 pdpe - pdpe_start, pdpe,
403 addr_hva2gpa(vm, pdpe),
404 (uint64_t) pdpe->address, pdpe->writable,
405 pdpe->execute_disable);
406
407 pde_start = addr_gpa2hva(vm,
408 pdpe->address * vm->page_size);
409 for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
410 pde = &pde_start[n3];
411 if (!pde->present)
412 continue;
413 fprintf(stream, "%*spde 0x%-3zx %p "
414 "0x%-12lx 0x%-10lx %u %u\n",
415 indent, "", pde - pde_start, pde,
416 addr_hva2gpa(vm, pde),
417 (uint64_t) pde->address, pde->writable,
418 pde->execute_disable);
419
420 pte_start = addr_gpa2hva(vm,
421 pde->address * vm->page_size);
422 for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
423 pte = &pte_start[n4];
424 if (!pte->present)
425 continue;
426 fprintf(stream, "%*spte 0x%-3zx %p "
427 "0x%-12lx 0x%-10lx %u %u "
428 " %u 0x%-10lx\n",
429 indent, "",
430 pte - pte_start, pte,
431 addr_hva2gpa(vm, pte),
432 (uint64_t) pte->address,
433 pte->writable,
434 pte->execute_disable,
435 pte->dirty,
436 ((uint64_t) n1 << 27)
437 | ((uint64_t) n2 << 18)
438 | ((uint64_t) n3 << 9)
439 | ((uint64_t) n4));
440 }
441 }
442 }
443 }
444}
445
446/* Set Unusable Segment
447 *
448 * Input Args: None
449 *
450 * Output Args:
451 * segp - Pointer to segment register
452 *
453 * Return: None
454 *
455 * Sets the segment register pointed to by segp to an unusable state.
456 */
457static void kvm_seg_set_unusable(struct kvm_segment *segp)
458{
459 memset(segp, 0, sizeof(*segp));
460 segp->unusable = true;
461}
462
463/* Set Long Mode Flat Kernel Code Segment
464 *
465 * Input Args:
466 * selector - selector value
467 *
468 * Output Args:
469 * segp - Pointer to KVM segment
470 *
471 * Return: None
472 *
473 * Sets up the KVM segment pointed to by segp, to be a code segment
474 * with the selector value given by selector.
475 */
476static void kvm_seg_set_kernel_code_64bit(uint16_t selector,
477 struct kvm_segment *segp)
478{
479 memset(segp, 0, sizeof(*segp));
480 segp->selector = selector;
481 segp->limit = 0xFFFFFFFFu;
482 segp->s = 0x1; /* kTypeCodeData */
483 segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
484 * | kFlagCodeReadable
485 */
486 segp->g = true;
487 segp->l = true;
488 segp->present = 1;
489}
490
491/* Set Long Mode Flat Kernel Data Segment
492 *
493 * Input Args:
494 * selector - selector value
495 *
496 * Output Args:
497 * segp - Pointer to KVM segment
498 *
499 * Return: None
500 *
501 * Sets up the KVM segment pointed to by segp, to be a data segment
502 * with the selector value given by selector.
503 */
504static void kvm_seg_set_kernel_data_64bit(uint16_t selector,
505 struct kvm_segment *segp)
506{
507 memset(segp, 0, sizeof(*segp));
508 segp->selector = selector;
509 segp->limit = 0xFFFFFFFFu;
510 segp->s = 0x1; /* kTypeCodeData */
511 segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
512 * | kFlagDataWritable
513 */
514 segp->g = true;
515 segp->present = true;
516}
517
518/* Address Guest Virtual to Guest Physical
519 *
520 * Input Args:
521 * vm - Virtual Machine
 522 * gva - VM virtual address
523 *
524 * Output Args: None
525 *
526 * Return:
527 * Equivalent VM physical address
528 *
529 * Translates the VM virtual address given by gva to a VM physical
 530 * address by walking the virtual translation tables of the VM
 531 * given by vm.  The equivalent VM physical address is returned.
 532 * A TEST_ASSERT failure occurs if the VM virtual address given
 533 * by gva is not currently mapped by the VM's virtual translation
 534 * tables.
535 */
536vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
537{
538 uint16_t index[4];
539 struct pageMapL4Entry *pml4e;
540 struct pageDirectoryPointerEntry *pdpe;
541 struct pageDirectoryEntry *pde;
542 struct pageTableEntry *pte;
543 void *hva;
544
545 TEST_ASSERT(vm->mode == VM_MODE_FLAT48PG, "Attempt to use "
546 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
547
548 index[0] = (gva >> 12) & 0x1ffu;
549 index[1] = (gva >> 21) & 0x1ffu;
550 index[2] = (gva >> 30) & 0x1ffu;
551 index[3] = (gva >> 39) & 0x1ffu;
552
553 if (!vm->pgd_created)
554 goto unmapped_gva;
555 pml4e = addr_gpa2hva(vm, vm->pgd);
556 if (!pml4e[index[3]].present)
557 goto unmapped_gva;
558
559 pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
560 if (!pdpe[index[2]].present)
561 goto unmapped_gva;
562
563 pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
564 if (!pde[index[1]].present)
565 goto unmapped_gva;
566
567 pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
568 if (!pte[index[0]].present)
569 goto unmapped_gva;
570
571 return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
572
573unmapped_gva:
574 TEST_ASSERT(false, "No mapping for vm virtual address, "
575 "gva: 0x%lx", gva);
576}
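
A sketch combining the translation above with addr_gpa2hva(), which is used throughout this file, to reach the host memory backing a guest virtual address (gva_to_hva is a hypothetical helper, not part of the file):

    static void *gva_to_hva(struct kvm_vm *vm, vm_vaddr_t gva)
    {
    	/* Guest virtual -> guest physical -> host virtual. */
    	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
    }
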
577
578void vcpu_setup(struct kvm_vm *vm, int vcpuid)
579{
580 struct kvm_sregs sregs;
581
582 /* Set mode specific system register values. */
583 vcpu_sregs_get(vm, vcpuid, &sregs);
584
585 switch (vm->mode) {
586 case VM_MODE_FLAT48PG:
587 sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
588 sregs.cr4 |= X86_CR4_PAE;
589 sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
590
591 kvm_seg_set_unusable(&sregs.ldt);
592 kvm_seg_set_kernel_code_64bit(0x8, &sregs.cs);
593 kvm_seg_set_kernel_data_64bit(0x10, &sregs.ds);
594 kvm_seg_set_kernel_data_64bit(0x10, &sregs.es);
595 break;
596
597 default:
598 TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
599 }
600 vcpu_sregs_set(vm, vcpuid, &sregs);
601
 602	/* If the virtual translation tables have been set up, set the system
 603	 * register to point to the tables.  It's okay if they haven't been
 604	 * set up yet; the code that sets up the virtual translation tables will
605 * go back through any VCPUs that have already been created and set
606 * their values.
607 */
608 if (vm->pgd_created) {
609 struct kvm_sregs sregs;
610
611 vcpu_sregs_get(vm, vcpuid, &sregs);
612
613 sregs.cr3 = vm->pgd;
614 vcpu_sregs_set(vm, vcpuid, &sregs);
615 }
616}
617/* Adds a vCPU with reasonable defaults (i.e., a stack)
618 *
619 * Input Args:
620 * vcpuid - The id of the VCPU to add to the VM.
621 * guest_code - The vCPU's entry point
622 */
623void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
624{
625 struct kvm_mp_state mp_state;
626 struct kvm_regs regs;
627 vm_vaddr_t stack_vaddr;
628 stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
629 DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
630
631 /* Create VCPU */
632 vm_vcpu_add(vm, vcpuid);
633
634 /* Setup guest general purpose registers */
635 vcpu_regs_get(vm, vcpuid, &regs);
636 regs.rflags = regs.rflags | 0x2;
637 regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize());
638 regs.rip = (unsigned long) guest_code;
639 vcpu_regs_set(vm, vcpuid, &regs);
640
641 /* Setup the MP state */
642 mp_state.mp_state = 0;
643 vcpu_set_mp_state(vm, vcpuid, &mp_state);
644}
645
646/* VM VCPU CPUID Set
647 *
648 * Input Args:
649 * vm - Virtual Machine
650 * vcpuid - VCPU id
651 * cpuid - The CPUID values to set.
652 *
653 * Output Args: None
654 *
655 * Return: void
656 *
657 * Set the VCPU's CPUID.
658 */
659void vcpu_set_cpuid(struct kvm_vm *vm,
660 uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
661{
662 struct vcpu *vcpu = vcpu_find(vm, vcpuid);
663 int rc;
664
665 TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
666
667 rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
668 TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
669 rc, errno);
670
671}
672/* Create a VM with reasonable defaults
673 *
674 * Input Args:
675 * vcpuid - The id of the single VCPU to add to the VM.
676 * guest_code - The vCPU's entry point
677 *
678 * Output Args: None
679 *
680 * Return:
681 * Pointer to opaque structure that describes the created VM.
682 */
683struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code)
684{
685 struct kvm_vm *vm;
686
687 /* Create VM */
688 vm = vm_create(VM_MODE_FLAT48PG, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
689
690 /* Setup guest code */
691 kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
692
693 /* Setup IRQ Chip */
694 vm_create_irqchip(vm);
695
696 /* Add the first vCPU. */
697 vm_vcpu_add_default(vm, vcpuid, guest_code);
698
699 return vm;
700}
diff --git a/tools/testing/selftests/kvm/set_sregs_test.c b/tools/testing/selftests/kvm/set_sregs_test.c
new file mode 100644
index 000000000000..090fd3f19352
--- /dev/null
+++ b/tools/testing/selftests/kvm/set_sregs_test.c
@@ -0,0 +1,54 @@
1/*
2 * KVM_SET_SREGS tests
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 * This is a regression test for the bug fixed by the following commit:
9 * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values")
10 *
11 * That bug allowed a user-mode program that called the KVM_SET_SREGS
12 * ioctl to put a VCPU's local APIC into an invalid state.
13 *
14 */
15#define _GNU_SOURCE /* for program_invocation_short_name */
16#include <fcntl.h>
17#include <stdio.h>
18#include <stdlib.h>
19#include <string.h>
20#include <sys/ioctl.h>
21
22#include "test_util.h"
23
24#include "kvm_util.h"
25#include "x86.h"
26
27#define VCPU_ID 5
28
29int main(int argc, char *argv[])
30{
31 struct kvm_sregs sregs;
32 struct kvm_vm *vm;
33 int rc;
34
35 /* Tell stdout not to buffer its content */
36 setbuf(stdout, NULL);
37
38 /* Create VM */
39 vm = vm_create_default(VCPU_ID, NULL);
40
41 vcpu_sregs_get(vm, VCPU_ID, &sregs);
42 sregs.apic_base = 1 << 10;
43 rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
44 TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)",
45 sregs.apic_base);
46 sregs.apic_base = 1 << 11;
47 rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
48 TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)",
49 sregs.apic_base);
50
51 kvm_vm_free(vm);
52
53 return 0;
54}
diff --git a/tools/testing/selftests/kvm/sync_regs_test.c b/tools/testing/selftests/kvm/sync_regs_test.c
new file mode 100644
index 000000000000..428e9473f5e2
--- /dev/null
+++ b/tools/testing/selftests/kvm/sync_regs_test.c
@@ -0,0 +1,232 @@
1/*
2 * Test for x86 KVM_CAP_SYNC_REGS
3 *
4 * Copyright (C) 2018, Google LLC.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality,
9 * including requesting an invalid register set, updates to/from values
10 * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled.
11 */
12
13#define _GNU_SOURCE /* for program_invocation_short_name */
14#include <fcntl.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <sys/ioctl.h>
19
20#include "test_util.h"
21#include "kvm_util.h"
22#include "x86.h"
23
24#define VCPU_ID 5
25#define PORT_HOST_SYNC 0x1000
26
27static void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1)
28{
29 __asm__ __volatile__("in %[port], %%al"
30 :
31 : [port]"d"(port), "D"(arg0), "S"(arg1)
32 : "rax");
33}
34
35#define exit_to_l0(_port, _arg0, _arg1) \
36 __exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1))
37
38#define GUEST_ASSERT(_condition) do { \
39 if (!(_condition)) \
40 exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, 0);\
41} while (0)
42
43void guest_code(void)
44{
45 for (;;) {
46 exit_to_l0(PORT_HOST_SYNC, "hello", 0);
47 asm volatile ("inc %r11");
48 }
49}
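
The "in" instruction issued by exit_to_l0() above performs port I/O that KVM forwards to userspace as a KVM_EXIT_IO exit, which is the exit the assertions in main() below wait for. A host-side sketch of inspecting such an exit (the test itself only checks exit_reason, not the port or payload):

    if (run->exit_reason == KVM_EXIT_IO &&
        run->io.port == PORT_HOST_SYNC &&
        run->io.direction == KVM_EXIT_IO_IN) {
    	/* The guest's two arguments were passed in rdi and rsi. */
    	printf("guest reached the sync point\n");
    }
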
50
51static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
52{
53#define REG_COMPARE(reg) \
54 TEST_ASSERT(left->reg == right->reg, \
55 "Register " #reg \
56 " values did not match: 0x%llx, 0x%llx\n", \
57 left->reg, right->reg)
58 REG_COMPARE(rax);
59 REG_COMPARE(rbx);
60 REG_COMPARE(rcx);
61 REG_COMPARE(rdx);
62 REG_COMPARE(rsi);
63 REG_COMPARE(rdi);
64 REG_COMPARE(rsp);
65 REG_COMPARE(rbp);
66 REG_COMPARE(r8);
67 REG_COMPARE(r9);
68 REG_COMPARE(r10);
69 REG_COMPARE(r11);
70 REG_COMPARE(r12);
71 REG_COMPARE(r13);
72 REG_COMPARE(r14);
73 REG_COMPARE(r15);
74 REG_COMPARE(rip);
75 REG_COMPARE(rflags);
76#undef REG_COMPARE
77}
78
79static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right)
80{
81}
82
83static void compare_vcpu_events(struct kvm_vcpu_events *left,
84 struct kvm_vcpu_events *right)
85{
86}
87
88int main(int argc, char *argv[])
89{
90 struct kvm_vm *vm;
91 struct kvm_run *run;
92 struct kvm_regs regs;
93 struct kvm_sregs sregs;
94 struct kvm_vcpu_events events;
95 int rv, cap;
96
97 /* Tell stdout not to buffer its content */
98 setbuf(stdout, NULL);
99
100 cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
101 TEST_ASSERT((unsigned long)cap == KVM_SYNC_X86_VALID_FIELDS,
102 "KVM_CAP_SYNC_REGS (0x%x) != KVM_SYNC_X86_VALID_FIELDS (0x%lx)\n",
103 cap, KVM_SYNC_X86_VALID_FIELDS);
104
105 /* Create VM */
106 vm = vm_create_default(VCPU_ID, guest_code);
107
108 run = vcpu_state(vm, VCPU_ID);
109
110 /* Request reading invalid register set from VCPU. */
111 run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
112 rv = _vcpu_run(vm, VCPU_ID);
113 TEST_ASSERT(rv < 0 && errno == EINVAL,
114 "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
115 rv);
116 vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
117
118 /* Request setting invalid register set into VCPU. */
119 run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS << 1;
120 rv = _vcpu_run(vm, VCPU_ID);
121 TEST_ASSERT(rv < 0 && errno == EINVAL,
122 "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
123 rv);
124 vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
125
126 /* Request and verify all valid register sets. */
127 /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
128 run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
129 rv = _vcpu_run(vm, VCPU_ID);
130 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
131 "Unexpected exit reason: %u (%s),\n",
132 run->exit_reason,
133 exit_reason_str(run->exit_reason));
134
135 vcpu_regs_get(vm, VCPU_ID, &regs);
136 compare_regs(&regs, &run->s.regs.regs);
137
138 vcpu_sregs_get(vm, VCPU_ID, &sregs);
139 compare_sregs(&sregs, &run->s.regs.sregs);
140
141 vcpu_events_get(vm, VCPU_ID, &events);
142 compare_vcpu_events(&events, &run->s.regs.events);
143
144 /* Set and verify various register values. */
145 run->s.regs.regs.r11 = 0xBAD1DEA;
146 run->s.regs.sregs.apic_base = 1 << 11;
147 /* TODO run->s.regs.events.XYZ = ABC; */
148
149 run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
150 run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
151 rv = _vcpu_run(vm, VCPU_ID);
152 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
153 "Unexpected exit reason: %u (%s),\n",
154 run->exit_reason,
155 exit_reason_str(run->exit_reason));
156 TEST_ASSERT(run->s.regs.regs.r11 == 0xBAD1DEA + 1,
157 "r11 sync regs value incorrect 0x%llx.",
158 run->s.regs.regs.r11);
159 TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11,
160 "apic_base sync regs value incorrect 0x%llx.",
161 run->s.regs.sregs.apic_base);
162
163 vcpu_regs_get(vm, VCPU_ID, &regs);
164 compare_regs(&regs, &run->s.regs.regs);
165
166 vcpu_sregs_get(vm, VCPU_ID, &sregs);
167 compare_sregs(&sregs, &run->s.regs.sregs);
168
169 vcpu_events_get(vm, VCPU_ID, &events);
170 compare_vcpu_events(&events, &run->s.regs.events);
171
172 /* Clear kvm_dirty_regs bits, verify new s.regs values are
173 * overwritten with existing guest values.
174 */
175 run->kvm_valid_regs = KVM_SYNC_X86_VALID_FIELDS;
176 run->kvm_dirty_regs = 0;
177 run->s.regs.regs.r11 = 0xDEADBEEF;
178 rv = _vcpu_run(vm, VCPU_ID);
179 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
180 "Unexpected exit reason: %u (%s),\n",
181 run->exit_reason,
182 exit_reason_str(run->exit_reason));
183 TEST_ASSERT(run->s.regs.regs.r11 != 0xDEADBEEF,
184 "r11 sync regs value incorrect 0x%llx.",
185 run->s.regs.regs.r11);
186
 187	/* Clear kvm_valid_regs bits and kvm_dirty_regs bits.
188 * Verify s.regs values are not overwritten with existing guest values
189 * and that guest values are not overwritten with kvm_sync_regs values.
190 */
191 run->kvm_valid_regs = 0;
192 run->kvm_dirty_regs = 0;
193 run->s.regs.regs.r11 = 0xAAAA;
194 regs.r11 = 0xBAC0;
195 vcpu_regs_set(vm, VCPU_ID, &regs);
196 rv = _vcpu_run(vm, VCPU_ID);
197 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
198 "Unexpected exit reason: %u (%s),\n",
199 run->exit_reason,
200 exit_reason_str(run->exit_reason));
201 TEST_ASSERT(run->s.regs.regs.r11 == 0xAAAA,
202 "r11 sync regs value incorrect 0x%llx.",
203 run->s.regs.regs.r11);
204 vcpu_regs_get(vm, VCPU_ID, &regs);
205 TEST_ASSERT(regs.r11 == 0xBAC0 + 1,
206 "r11 guest value incorrect 0x%llx.",
207 regs.r11);
208
209 /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
210 * with existing guest values but that guest values are overwritten
211 * with kvm_sync_regs values.
212 */
213 run->kvm_valid_regs = 0;
214 run->kvm_dirty_regs = KVM_SYNC_X86_VALID_FIELDS;
215 run->s.regs.regs.r11 = 0xBBBB;
216 rv = _vcpu_run(vm, VCPU_ID);
217 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
218 "Unexpected exit reason: %u (%s),\n",
219 run->exit_reason,
220 exit_reason_str(run->exit_reason));
221 TEST_ASSERT(run->s.regs.regs.r11 == 0xBBBB,
222 "r11 sync regs value incorrect 0x%llx.",
223 run->s.regs.regs.r11);
224 vcpu_regs_get(vm, VCPU_ID, &regs);
225 TEST_ASSERT(regs.r11 == 0xBBBB + 1,
226 "r11 guest value incorrect 0x%llx.",
227 regs.r11);
228
229 kvm_vm_free(vm);
230
231 return 0;
232}
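
The selftest above doubles as a compact reference for the KVM_CAP_SYNC_REGS protocol itself. The sketch below is illustrative only and not part of the commit: it drives the same interface with plain ioctl()s, assuming `vcpu_fd` is an already-initialised vCPU file descriptor and `run` is its mmap()ed struct kvm_run.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void sync_regs_example(int vcpu_fd, struct kvm_run *run)
    {
            /* Ask KVM to export the GPRs on every exit from now on. */
            run->kvm_valid_regs = KVM_SYNC_X86_REGS;

            /* Stage a change in shared memory and mark that set dirty... */
            run->s.regs.regs.r11 = 0xBAD1DEA;
            run->kvm_dirty_regs = KVM_SYNC_X86_REGS;

            /* ...so KVM_RUN loads it into the vCPU before entering the guest. */
            if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                    return;         /* EINVAL would indicate an unsupported sync field */

            /* On exit, run->s.regs.regs already reflects the guest state;
             * no KVM_GET_REGS round trip is needed. */
    }

The TEST_ASSERTs above check exactly these two directions: kvm_dirty_regs feeds userspace values in, kvm_valid_regs copies guest state back out.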
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index 8bc479fa37e6..efc84cbe8277 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -178,7 +178,7 @@ static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
178 *vcpu_cpsr(vcpu) = cpsr; 178 *vcpu_cpsr(vcpu) = cpsr;
179 179
180 /* Note: These now point to the banked copies */ 180 /* Note: These now point to the banked copies */
181 *vcpu_spsr(vcpu) = new_spsr_value; 181 vcpu_write_spsr(vcpu, new_spsr_value);
182 *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; 182 *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
183 183
184 /* Branch to exception vector */ 184 /* Branch to exception vector */
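
The one-line change above is part of a wider move in this series from pointer-returning accessors (*vcpu_spsr(vcpu) = ...) to read/write helpers, so the value can be routed to wherever SPSR currently lives. A heavily simplified, hypothetical sketch of the idea for arm64 follows; the body only approximates the real helper and omits the 32-bit guest case.

    static void vcpu_write_spsr_sketch(struct kvm_vcpu *vcpu, u64 v)
    {
            if (vcpu->arch.sysregs_loaded_on_cpu)   /* vCPU sysregs resident on the CPU (VHE) */
                    write_sysreg_el1(v, spsr);      /* update the live register */
            else
                    vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v;     /* update the shadow copy */
    }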
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 282389eb204f..bd3d57f40f1b 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -545,9 +545,11 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
545 * The kernel may decide to run userspace after calling vcpu_put, so 545 * The kernel may decide to run userspace after calling vcpu_put, so
546 * we reset cntvoff to 0 to ensure a consistent read between user 546 * we reset cntvoff to 0 to ensure a consistent read between user
547 * accesses to the virtual counter and kernel access to the physical 547 * accesses to the virtual counter and kernel access to the physical
548 * counter. 548 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
549 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
549 */ 550 */
550 set_cntvoff(0); 551 if (!has_vhe())
552 set_cntvoff(0);
551} 553}
552 554
553/* 555/*
@@ -856,11 +858,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
856 return ret; 858 return ret;
857 859
858no_vgic: 860no_vgic:
859 preempt_disable();
860 timer->enabled = 1; 861 timer->enabled = 1;
861 kvm_timer_vcpu_load(vcpu);
862 preempt_enable();
863
864 return 0; 862 return 0;
865} 863}
866 864
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 53572304843b..dba629c5f8ac 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -362,10 +362,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
362 kvm_arm_set_running_vcpu(vcpu); 362 kvm_arm_set_running_vcpu(vcpu);
363 kvm_vgic_load(vcpu); 363 kvm_vgic_load(vcpu);
364 kvm_timer_vcpu_load(vcpu); 364 kvm_timer_vcpu_load(vcpu);
365 kvm_vcpu_load_sysregs(vcpu);
365} 366}
366 367
367void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 368void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
368{ 369{
370 kvm_vcpu_put_sysregs(vcpu);
369 kvm_timer_vcpu_put(vcpu); 371 kvm_timer_vcpu_put(vcpu);
370 kvm_vgic_put(vcpu); 372 kvm_vgic_put(vcpu);
371 373
@@ -420,7 +422,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
420 */ 422 */
421int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 423int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
422{ 424{
423 return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) 425 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
426 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
424 && !v->arch.power_off && !v->arch.pause); 427 && !v->arch.power_off && !v->arch.pause);
425} 428}
426 429
@@ -632,27 +635,22 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
632 if (unlikely(!kvm_vcpu_initialized(vcpu))) 635 if (unlikely(!kvm_vcpu_initialized(vcpu)))
633 return -ENOEXEC; 636 return -ENOEXEC;
634 637
635 vcpu_load(vcpu);
636
637 ret = kvm_vcpu_first_run_init(vcpu); 638 ret = kvm_vcpu_first_run_init(vcpu);
638 if (ret) 639 if (ret)
639 goto out; 640 return ret;
640 641
641 if (run->exit_reason == KVM_EXIT_MMIO) { 642 if (run->exit_reason == KVM_EXIT_MMIO) {
642 ret = kvm_handle_mmio_return(vcpu, vcpu->run); 643 ret = kvm_handle_mmio_return(vcpu, vcpu->run);
643 if (ret) 644 if (ret)
644 goto out; 645 return ret;
645 if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) { 646 if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
646 ret = 0; 647 return 0;
647 goto out;
648 }
649
650 } 648 }
651 649
652 if (run->immediate_exit) { 650 if (run->immediate_exit)
653 ret = -EINTR; 651 return -EINTR;
654 goto out; 652
655 } 653 vcpu_load(vcpu);
656 654
657 kvm_sigset_activate(vcpu); 655 kvm_sigset_activate(vcpu);
658 656
@@ -719,6 +717,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
719 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || 717 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
720 kvm_request_pending(vcpu)) { 718 kvm_request_pending(vcpu)) {
721 vcpu->mode = OUTSIDE_GUEST_MODE; 719 vcpu->mode = OUTSIDE_GUEST_MODE;
720 isb(); /* Ensure work in x_flush_hwstate is committed */
722 kvm_pmu_sync_hwstate(vcpu); 721 kvm_pmu_sync_hwstate(vcpu);
723 if (static_branch_unlikely(&userspace_irqchip_in_use)) 722 if (static_branch_unlikely(&userspace_irqchip_in_use))
724 kvm_timer_sync_hwstate(vcpu); 723 kvm_timer_sync_hwstate(vcpu);
@@ -735,13 +734,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
735 */ 734 */
736 trace_kvm_entry(*vcpu_pc(vcpu)); 735 trace_kvm_entry(*vcpu_pc(vcpu));
737 guest_enter_irqoff(); 736 guest_enter_irqoff();
738 if (has_vhe())
739 kvm_arm_vhe_guest_enter();
740
741 ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
742 737
743 if (has_vhe()) 738 if (has_vhe()) {
739 kvm_arm_vhe_guest_enter();
740 ret = kvm_vcpu_run_vhe(vcpu);
744 kvm_arm_vhe_guest_exit(); 741 kvm_arm_vhe_guest_exit();
742 } else {
743 ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu);
744 }
745
745 vcpu->mode = OUTSIDE_GUEST_MODE; 746 vcpu->mode = OUTSIDE_GUEST_MODE;
746 vcpu->stat.exits++; 747 vcpu->stat.exits++;
747 /* 748 /*
@@ -811,7 +812,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
811 812
812 kvm_sigset_deactivate(vcpu); 813 kvm_sigset_deactivate(vcpu);
813 814
814out:
815 vcpu_put(vcpu); 815 vcpu_put(vcpu);
816 return ret; 816 return ret;
817} 817}
@@ -820,18 +820,18 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
820{ 820{
821 int bit_index; 821 int bit_index;
822 bool set; 822 bool set;
823 unsigned long *ptr; 823 unsigned long *hcr;
824 824
825 if (number == KVM_ARM_IRQ_CPU_IRQ) 825 if (number == KVM_ARM_IRQ_CPU_IRQ)
826 bit_index = __ffs(HCR_VI); 826 bit_index = __ffs(HCR_VI);
827 else /* KVM_ARM_IRQ_CPU_FIQ */ 827 else /* KVM_ARM_IRQ_CPU_FIQ */
828 bit_index = __ffs(HCR_VF); 828 bit_index = __ffs(HCR_VF);
829 829
830 ptr = (unsigned long *)&vcpu->arch.irq_lines; 830 hcr = vcpu_hcr(vcpu);
831 if (level) 831 if (level)
832 set = test_and_set_bit(bit_index, ptr); 832 set = test_and_set_bit(bit_index, hcr);
833 else 833 else
834 set = test_and_clear_bit(bit_index, ptr); 834 set = test_and_clear_bit(bit_index, hcr);
835 835
836 /* 836 /*
837 * If we didn't change anything, no need to wake up or kick other CPUs 837 * If we didn't change anything, no need to wake up or kick other CPUs
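
For readability, here is the reworked world-switch dispatch from the kvm_arch_vcpu_ioctl_run() hunk above, condensed into one excerpt (no new behaviour is implied; `ret` is the function-local return code). VHE hosts already run at EL2 and can call the run loop as an ordinary function, while non-VHE hosts still issue a hypercall into the EL2 code.

    trace_kvm_entry(*vcpu_pc(vcpu));
    guest_enter_irqoff();

    if (has_vhe()) {
            kvm_arm_vhe_guest_enter();              /* VHE-specific entry fixups */
            ret = kvm_vcpu_run_vhe(vcpu);           /* plain function call at EL2 */
            kvm_arm_vhe_guest_exit();
    } else {
            ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu);  /* HVC into the hyp code */
    }

    vcpu->mode = OUTSIDE_GUEST_MODE;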
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
index f24404b3c8df..77754a62eb0c 100644
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ b/virt/kvm/arm/hyp/timer-sr.c
@@ -27,34 +27,34 @@ void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
27 write_sysreg(cntvoff, cntvoff_el2); 27 write_sysreg(cntvoff, cntvoff_el2);
28} 28}
29 29
30/*
31 * Should only be called on non-VHE systems.
32 * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
33 */
30void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) 34void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
31{ 35{
32 /* 36 u64 val;
33 * We don't need to do this for VHE since the host kernel runs in EL2
34 * with HCR_EL2.TGE ==1, which makes those bits have no impact.
35 */
36 if (!has_vhe()) {
37 u64 val;
38 37
39 /* Allow physical timer/counter access for the host */ 38 /* Allow physical timer/counter access for the host */
40 val = read_sysreg(cnthctl_el2); 39 val = read_sysreg(cnthctl_el2);
41 val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; 40 val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
42 write_sysreg(val, cnthctl_el2); 41 write_sysreg(val, cnthctl_el2);
43 }
44} 42}
45 43
44/*
45 * Should only be called on non-VHE systems.
46 * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
47 */
46void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) 48void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
47{ 49{
48 if (!has_vhe()) { 50 u64 val;
49 u64 val;
50 51
51 /* 52 /*
52 * Disallow physical timer access for the guest 53 * Disallow physical timer access for the guest
53 * Physical counter access is allowed 54 * Physical counter access is allowed
54 */ 55 */
55 val = read_sysreg(cnthctl_el2); 56 val = read_sysreg(cnthctl_el2);
56 val &= ~CNTHCTL_EL1PCEN; 57 val &= ~CNTHCTL_EL1PCEN;
57 val |= CNTHCTL_EL1PCTEN; 58 val |= CNTHCTL_EL1PCTEN;
58 write_sysreg(val, cnthctl_el2); 59 write_sysreg(val, cnthctl_el2);
59 }
60} 60}
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
deleted file mode 100644
index 4fe6e797e8b3..000000000000
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ /dev/null
@@ -1,159 +0,0 @@
1/*
2 * Copyright (C) 2012-2015 - ARM Ltd
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/compiler.h>
19#include <linux/irqchip/arm-gic.h>
20#include <linux/kvm_host.h>
21
22#include <asm/kvm_emulate.h>
23#include <asm/kvm_hyp.h>
24#include <asm/kvm_mmu.h>
25
26static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
27{
28 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
29 int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
30 u32 elrsr0, elrsr1;
31
32 elrsr0 = readl_relaxed(base + GICH_ELRSR0);
33 if (unlikely(nr_lr > 32))
34 elrsr1 = readl_relaxed(base + GICH_ELRSR1);
35 else
36 elrsr1 = 0;
37
38 cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0;
39}
40
41static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
42{
43 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
44 int i;
45 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
46
47 for (i = 0; i < used_lrs; i++) {
48 if (cpu_if->vgic_elrsr & (1UL << i))
49 cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
50 else
51 cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
52
53 writel_relaxed(0, base + GICH_LR0 + (i * 4));
54 }
55}
56
57/* vcpu is already in the HYP VA space */
58void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu)
59{
60 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
61 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
62 struct vgic_dist *vgic = &kvm->arch.vgic;
63 void __iomem *base = kern_hyp_va(vgic->vctrl_base);
64 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
65
66 if (!base)
67 return;
68
69 if (used_lrs) {
70 cpu_if->vgic_apr = readl_relaxed(base + GICH_APR);
71
72 save_elrsr(vcpu, base);
73 save_lrs(vcpu, base);
74
75 writel_relaxed(0, base + GICH_HCR);
76 } else {
77 cpu_if->vgic_elrsr = ~0UL;
78 cpu_if->vgic_apr = 0;
79 }
80}
81
82/* vcpu is already in the HYP VA space */
83void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
84{
85 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
86 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
87 struct vgic_dist *vgic = &kvm->arch.vgic;
88 void __iomem *base = kern_hyp_va(vgic->vctrl_base);
89 int i;
90 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
91
92 if (!base)
93 return;
94
95 if (used_lrs) {
96 writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
97 writel_relaxed(cpu_if->vgic_apr, base + GICH_APR);
98 for (i = 0; i < used_lrs; i++) {
99 writel_relaxed(cpu_if->vgic_lr[i],
100 base + GICH_LR0 + (i * 4));
101 }
102 }
103}
104
105#ifdef CONFIG_ARM64
106/*
107 * __vgic_v2_perform_cpuif_access -- perform a GICV access on behalf of the
108 * guest.
109 *
110 * @vcpu: the offending vcpu
111 *
112 * Returns:
113 * 1: GICV access successfully performed
114 * 0: Not a GICV access
115 * -1: Illegal GICV access
116 */
117int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
118{
119 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
120 struct vgic_dist *vgic = &kvm->arch.vgic;
121 phys_addr_t fault_ipa;
122 void __iomem *addr;
123 int rd;
124
125 /* Build the full address */
126 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
127 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
128
129 /* If not for GICV, move on */
130 if (fault_ipa < vgic->vgic_cpu_base ||
131 fault_ipa >= (vgic->vgic_cpu_base + KVM_VGIC_V2_CPU_SIZE))
132 return 0;
133
134 /* Reject anything but a 32bit access */
135 if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32))
136 return -1;
137
138 /* Not aligned? Don't bother */
139 if (fault_ipa & 3)
140 return -1;
141
142 rd = kvm_vcpu_dabt_get_rd(vcpu);
143 addr = kern_hyp_va((kern_hyp_va(&kvm_vgic_global_state))->vcpu_base_va);
144 addr += fault_ipa - vgic->vgic_cpu_base;
145
146 if (kvm_vcpu_dabt_iswrite(vcpu)) {
147 u32 data = vcpu_data_guest_to_host(vcpu,
148 vcpu_get_reg(vcpu, rd),
149 sizeof(u32));
150 writel_relaxed(data, addr);
151 } else {
152 u32 data = readl_relaxed(addr);
153 vcpu_set_reg(vcpu, rd, vcpu_data_host_to_guest(vcpu, data,
154 sizeof(u32)));
155 }
156
157 return 1;
158}
159#endif
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index b89ce5432214..616e5a433ab0 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -21,6 +21,7 @@
21 21
22#include <asm/kvm_emulate.h> 22#include <asm/kvm_emulate.h>
23#include <asm/kvm_hyp.h> 23#include <asm/kvm_hyp.h>
24#include <asm/kvm_mmu.h>
24 25
25#define vtr_to_max_lr_idx(v) ((v) & 0xf) 26#define vtr_to_max_lr_idx(v) ((v) & 0xf)
26#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1) 27#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
@@ -208,89 +209,68 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
208{ 209{
209 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; 210 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
210 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; 211 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
211 u64 val;
212 212
213 /* 213 /*
214 * Make sure stores to the GIC via the memory mapped interface 214 * Make sure stores to the GIC via the memory mapped interface
215 * are now visible to the system register interface. 215 * are now visible to the system register interface when reading the
216 * LRs, and when reading back the VMCR on non-VHE systems.
216 */ 217 */
217 if (!cpu_if->vgic_sre) { 218 if (used_lrs || !has_vhe()) {
218 dsb(sy); 219 if (!cpu_if->vgic_sre) {
219 isb(); 220 dsb(sy);
220 cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); 221 isb();
222 }
221 } 223 }
222 224
223 if (used_lrs) { 225 if (used_lrs) {
224 int i; 226 int i;
225 u32 nr_pre_bits; 227 u32 elrsr;
226 228
227 cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); 229 elrsr = read_gicreg(ICH_ELSR_EL2);
228 230
229 write_gicreg(0, ICH_HCR_EL2); 231 write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
230 val = read_gicreg(ICH_VTR_EL2);
231 nr_pre_bits = vtr_to_nr_pre_bits(val);
232 232
233 for (i = 0; i < used_lrs; i++) { 233 for (i = 0; i < used_lrs; i++) {
234 if (cpu_if->vgic_elrsr & (1 << i)) 234 if (elrsr & (1 << i))
235 cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; 235 cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
236 else 236 else
237 cpu_if->vgic_lr[i] = __gic_v3_get_lr(i); 237 cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
238 238
239 __gic_v3_set_lr(0, i); 239 __gic_v3_set_lr(0, i);
240 } 240 }
241 }
242}
241 243
242 switch (nr_pre_bits) { 244void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
243 case 7: 245{
244 cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3); 246 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
245 cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2); 247 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
246 case 6: 248 int i;
247 cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
248 default:
249 cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
250 }
251 249
252 switch (nr_pre_bits) { 250 if (used_lrs) {
253 case 7: 251 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
254 cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
255 cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
256 case 6:
257 cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
258 default:
259 cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
260 }
261 } else {
262 if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
263 cpu_if->its_vpe.its_vm)
264 write_gicreg(0, ICH_HCR_EL2);
265
266 cpu_if->vgic_elrsr = 0xffff;
267 cpu_if->vgic_ap0r[0] = 0;
268 cpu_if->vgic_ap0r[1] = 0;
269 cpu_if->vgic_ap0r[2] = 0;
270 cpu_if->vgic_ap0r[3] = 0;
271 cpu_if->vgic_ap1r[0] = 0;
272 cpu_if->vgic_ap1r[1] = 0;
273 cpu_if->vgic_ap1r[2] = 0;
274 cpu_if->vgic_ap1r[3] = 0;
275 }
276 252
277 val = read_gicreg(ICC_SRE_EL2); 253 for (i = 0; i < used_lrs; i++)
278 write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2); 254 __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
255 }
279 256
280 if (!cpu_if->vgic_sre) { 257 /*
281 /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */ 258 * Ensure that writes to the LRs, and on non-VHE systems ensure that
282 isb(); 259 * the write to the VMCR in __vgic_v3_activate_traps(), will have
283 write_gicreg(1, ICC_SRE_EL1); 260 reached the (re)distributors. This ensures the guest will read the
261 * correct values from the memory-mapped interface.
262 */
263 if (used_lrs || !has_vhe()) {
264 if (!cpu_if->vgic_sre) {
265 isb();
266 dsb(sy);
267 }
284 } 268 }
285} 269}
286 270
287void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) 271void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
288{ 272{
289 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; 273 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
290 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
291 u64 val;
292 u32 nr_pre_bits;
293 int i;
294 274
295 /* 275 /*
296 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a 276 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
@@ -299,70 +279,135 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
299 * consequences. So we must make sure that ICC_SRE_EL1 has 279 * consequences. So we must make sure that ICC_SRE_EL1 has
300 * been actually programmed with the value we want before 280 * been actually programmed with the value we want before
301 * starting to mess with the rest of the GIC, and VMCR_EL2 in 281 * starting to mess with the rest of the GIC, and VMCR_EL2 in
302 * particular. 282 * particular. This logic must be called before
283 * __vgic_v3_restore_state().
303 */ 284 */
304 if (!cpu_if->vgic_sre) { 285 if (!cpu_if->vgic_sre) {
305 write_gicreg(0, ICC_SRE_EL1); 286 write_gicreg(0, ICC_SRE_EL1);
306 isb(); 287 isb();
307 write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); 288 write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
289
290
291 if (has_vhe()) {
292 /*
293 * Ensure that the write to the VMCR will have reached
294 * the (re)distributors. This ensures the guest will
295 * read the correct values from the memory-mapped
296 * interface.
297 */
298 isb();
299 dsb(sy);
300 }
308 } 301 }
309 302
310 val = read_gicreg(ICH_VTR_EL2); 303 /*
311 nr_pre_bits = vtr_to_nr_pre_bits(val); 304 * Prevent the guest from touching the GIC system registers if
305 * SRE isn't enabled for GICv3 emulation.
306 */
307 write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
308 ICC_SRE_EL2);
312 309
313 if (used_lrs) { 310 /*
311 * If we need to trap system registers, we must write
312 * ICH_HCR_EL2 anyway, even if no interrupts are being
313 * injected,
314 */
315 if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
316 cpu_if->its_vpe.its_vm)
314 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); 317 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
318}
315 319
316 switch (nr_pre_bits) { 320void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
317 case 7: 321{
318 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3); 322 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
319 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2); 323 u64 val;
320 case 6:
321 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
322 default:
323 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
324 }
325
326 switch (nr_pre_bits) {
327 case 7:
328 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
329 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
330 case 6:
331 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
332 default:
333 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
334 }
335 324
336 for (i = 0; i < used_lrs; i++) 325 if (!cpu_if->vgic_sre) {
337 __gic_v3_set_lr(cpu_if->vgic_lr[i], i); 326 cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
338 } else {
339 /*
340 * If we need to trap system registers, we must write
341 * ICH_HCR_EL2 anyway, even if no interrupts are being
342 * injected. Same thing if GICv4 is used, as VLPI
343 * delivery is gated by ICH_HCR_EL2.En.
344 */
345 if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
346 cpu_if->its_vpe.its_vm)
347 write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
348 } 327 }
349 328
350 /* 329 val = read_gicreg(ICC_SRE_EL2);
351 * Ensures that the above will have reached the 330 write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
352 * (re)distributors. This ensure the guest will read the 331
353 * correct values from the memory-mapped interface.
354 */
355 if (!cpu_if->vgic_sre) { 332 if (!cpu_if->vgic_sre) {
333 /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
356 isb(); 334 isb();
357 dsb(sy); 335 write_gicreg(1, ICC_SRE_EL1);
358 } 336 }
359 337
360 /* 338 /*
361 * Prevent the guest from touching the GIC system registers if 339 * If we were trapping system registers, we enabled the VGIC even if
362 * SRE isn't enabled for GICv3 emulation. 340 * no interrupts were being injected, and we disable it again here.
363 */ 341 */
364 write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE, 342 if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
365 ICC_SRE_EL2); 343 cpu_if->its_vpe.its_vm)
344 write_gicreg(0, ICH_HCR_EL2);
345}
346
347void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
348{
349 struct vgic_v3_cpu_if *cpu_if;
350 u64 val;
351 u32 nr_pre_bits;
352
353 vcpu = kern_hyp_va(vcpu);
354 cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
355
356 val = read_gicreg(ICH_VTR_EL2);
357 nr_pre_bits = vtr_to_nr_pre_bits(val);
358
359 switch (nr_pre_bits) {
360 case 7:
361 cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
362 cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
363 case 6:
364 cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
365 default:
366 cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
367 }
368
369 switch (nr_pre_bits) {
370 case 7:
371 cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
372 cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
373 case 6:
374 cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
375 default:
376 cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
377 }
378}
379
380void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
381{
382 struct vgic_v3_cpu_if *cpu_if;
383 u64 val;
384 u32 nr_pre_bits;
385
386 vcpu = kern_hyp_va(vcpu);
387 cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
388
389 val = read_gicreg(ICH_VTR_EL2);
390 nr_pre_bits = vtr_to_nr_pre_bits(val);
391
392 switch (nr_pre_bits) {
393 case 7:
394 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
395 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
396 case 6:
397 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
398 default:
399 __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
400 }
401
402 switch (nr_pre_bits) {
403 case 7:
404 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
405 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
406 case 6:
407 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
408 default:
409 __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
410 }
366} 411}
367 412
368void __hyp_text __vgic_v3_init_lrs(void) 413void __hyp_text __vgic_v3_init_lrs(void)
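
The new __vgic_v3_save_aprs()/__vgic_v3_restore_aprs() helpers keep the intentional switch fall-through from the old code: ICH_VTR_EL2.PREbits decides how many active-priority registers the hardware implements. A small sketch of that mapping, for illustration only (not part of the patch):

    static int nr_apr_regs(u64 vtr)
    {
            u32 pre_bits = vtr_to_nr_pre_bits(vtr); /* typically 5, 6 or 7 */

            switch (pre_bits) {
            case 7:
                    return 4;       /* AP0R0..3 and AP1R0..3 */
            case 6:
                    return 2;       /* AP0R0..1 and AP1R0..1 */
            default:
                    return 1;       /* AP0R0 and AP1R0 only */
            }
    }

Touching unimplemented AP registers is not architecturally safe, hence the cases fall through from the widest configuration down to the mandatory AP*R0.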
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b960acdd0c05..7f6a944db23d 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -43,6 +43,8 @@ static unsigned long hyp_idmap_start;
43static unsigned long hyp_idmap_end; 43static unsigned long hyp_idmap_end;
44static phys_addr_t hyp_idmap_vector; 44static phys_addr_t hyp_idmap_vector;
45 45
46static unsigned long io_map_base;
47
46#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) 48#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
47#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) 49#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
48 50
@@ -479,7 +481,13 @@ static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
479 clear_hyp_pgd_entry(pgd); 481 clear_hyp_pgd_entry(pgd);
480} 482}
481 483
482static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) 484static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
485{
486 return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
487}
488
489static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
490 phys_addr_t start, u64 size)
483{ 491{
484 pgd_t *pgd; 492 pgd_t *pgd;
485 phys_addr_t addr = start, end = start + size; 493 phys_addr_t addr = start, end = start + size;
@@ -489,7 +497,7 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
489 * We don't unmap anything from HYP, except at the hyp tear down. 497 * We don't unmap anything from HYP, except at the hyp tear down.
490 * Hence, we don't have to invalidate the TLBs here. 498 * Hence, we don't have to invalidate the TLBs here.
491 */ 499 */
492 pgd = pgdp + pgd_index(addr); 500 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
493 do { 501 do {
494 next = pgd_addr_end(addr, end); 502 next = pgd_addr_end(addr, end);
495 if (!pgd_none(*pgd)) 503 if (!pgd_none(*pgd))
@@ -497,32 +505,50 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
497 } while (pgd++, addr = next, addr != end); 505 } while (pgd++, addr = next, addr != end);
498} 506}
499 507
508static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
509{
510 __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
511}
512
513static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
514{
515 __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
516}
517
500/** 518/**
501 * free_hyp_pgds - free Hyp-mode page tables 519 * free_hyp_pgds - free Hyp-mode page tables
502 * 520 *
503 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and 521 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
504 * therefore contains either mappings in the kernel memory area (above 522 * therefore contains either mappings in the kernel memory area (above
505 * PAGE_OFFSET), or device mappings in the vmalloc range (from 523 * PAGE_OFFSET), or device mappings in the idmap range.
506 * VMALLOC_START to VMALLOC_END).
507 * 524 *
508 * boot_hyp_pgd should only map two pages for the init code. 525 * boot_hyp_pgd should only map the idmap range, and is only used in
526 * the extended idmap case.
509 */ 527 */
510void free_hyp_pgds(void) 528void free_hyp_pgds(void)
511{ 529{
530 pgd_t *id_pgd;
531
512 mutex_lock(&kvm_hyp_pgd_mutex); 532 mutex_lock(&kvm_hyp_pgd_mutex);
513 533
534 id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
535
536 if (id_pgd) {
537 /* In case we never called hyp_mmu_init() */
538 if (!io_map_base)
539 io_map_base = hyp_idmap_start;
540 unmap_hyp_idmap_range(id_pgd, io_map_base,
541 hyp_idmap_start + PAGE_SIZE - io_map_base);
542 }
543
514 if (boot_hyp_pgd) { 544 if (boot_hyp_pgd) {
515 unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
516 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); 545 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
517 boot_hyp_pgd = NULL; 546 boot_hyp_pgd = NULL;
518 } 547 }
519 548
520 if (hyp_pgd) { 549 if (hyp_pgd) {
521 unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
522 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), 550 unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
523 (uintptr_t)high_memory - PAGE_OFFSET); 551 (uintptr_t)high_memory - PAGE_OFFSET);
524 unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START),
525 VMALLOC_END - VMALLOC_START);
526 552
527 free_pages((unsigned long)hyp_pgd, hyp_pgd_order); 553 free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
528 hyp_pgd = NULL; 554 hyp_pgd = NULL;
@@ -634,7 +660,7 @@ static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
634 addr = start & PAGE_MASK; 660 addr = start & PAGE_MASK;
635 end = PAGE_ALIGN(end); 661 end = PAGE_ALIGN(end);
636 do { 662 do {
637 pgd = pgdp + ((addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1)); 663 pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
638 664
639 if (pgd_none(*pgd)) { 665 if (pgd_none(*pgd)) {
640 pud = pud_alloc_one(NULL, addr); 666 pud = pud_alloc_one(NULL, addr);
@@ -708,29 +734,115 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
708 return 0; 734 return 0;
709} 735}
710 736
737static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
738 unsigned long *haddr, pgprot_t prot)
739{
740 pgd_t *pgd = hyp_pgd;
741 unsigned long base;
742 int ret = 0;
743
744 mutex_lock(&kvm_hyp_pgd_mutex);
745
746 /*
747 * This assumes that we we have enough space below the idmap
748 * page to allocate our VAs. If not, the check below will
749 * kick. A potential alternative would be to detect that
750 * overflow and switch to an allocation above the idmap.
751 *
752 * The allocated size is always a multiple of PAGE_SIZE.
753 */
754 size = PAGE_ALIGN(size + offset_in_page(phys_addr));
755 base = io_map_base - size;
756
757 /*
758 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
759 * allocating the new area, as it would indicate we've
760 * overflowed the idmap/IO address range.
761 */
762 if ((base ^ io_map_base) & BIT(VA_BITS - 1))
763 ret = -ENOMEM;
764 else
765 io_map_base = base;
766
767 mutex_unlock(&kvm_hyp_pgd_mutex);
768
769 if (ret)
770 goto out;
771
772 if (__kvm_cpu_uses_extended_idmap())
773 pgd = boot_hyp_pgd;
774
775 ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
776 base, base + size,
777 __phys_to_pfn(phys_addr), prot);
778 if (ret)
779 goto out;
780
781 *haddr = base + offset_in_page(phys_addr);
782
783out:
784 return ret;
785}
786
711/** 787/**
712 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode 788 * create_hyp_io_mappings - Map IO into both kernel and HYP
713 * @from: The kernel start VA of the range
714 * @to: The kernel end VA of the range (exclusive)
715 * @phys_addr: The physical start address which gets mapped 789 * @phys_addr: The physical start address which gets mapped
716 * 790 * @size: Size of the region being mapped
717 * The resulting HYP VA is the same as the kernel VA, modulo 791 * @kaddr: Kernel VA for this mapping
718 * HYP_PAGE_OFFSET. 792 * @haddr: HYP VA for this mapping
719 */ 793 */
720int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) 794int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
795 void __iomem **kaddr,
796 void __iomem **haddr)
721{ 797{
722 unsigned long start = kern_hyp_va((unsigned long)from); 798 unsigned long addr;
723 unsigned long end = kern_hyp_va((unsigned long)to); 799 int ret;
724 800
725 if (is_kernel_in_hyp_mode()) 801 *kaddr = ioremap(phys_addr, size);
802 if (!*kaddr)
803 return -ENOMEM;
804
805 if (is_kernel_in_hyp_mode()) {
806 *haddr = *kaddr;
726 return 0; 807 return 0;
808 }
727 809
728 /* Check for a valid kernel IO mapping */ 810 ret = __create_hyp_private_mapping(phys_addr, size,
729 if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)) 811 &addr, PAGE_HYP_DEVICE);
730 return -EINVAL; 812 if (ret) {
813 iounmap(*kaddr);
814 *kaddr = NULL;
815 *haddr = NULL;
816 return ret;
817 }
818
819 *haddr = (void __iomem *)addr;
820 return 0;
821}
731 822
732 return __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end, 823/**
733 __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); 824 * create_hyp_exec_mappings - Map an executable range into HYP
825 * @phys_addr: The physical start address which gets mapped
826 * @size: Size of the region being mapped
827 * @haddr: HYP VA for this mapping
828 */
829int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
830 void **haddr)
831{
832 unsigned long addr;
833 int ret;
834
835 BUG_ON(is_kernel_in_hyp_mode());
836
837 ret = __create_hyp_private_mapping(phys_addr, size,
838 &addr, PAGE_HYP_EXEC);
839 if (ret) {
840 *haddr = NULL;
841 return ret;
842 }
843
844 *haddr = (void *)addr;
845 return 0;
734} 846}
735 847
736/** 848/**
@@ -1801,7 +1913,9 @@ int kvm_mmu_init(void)
1801 int err; 1913 int err;
1802 1914
1803 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start); 1915 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1916 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1804 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end); 1917 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1918 hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1805 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init); 1919 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
1806 1920
1807 /* 1921 /*
@@ -1812,10 +1926,11 @@ int kvm_mmu_init(void)
1812 1926
1813 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 1927 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
1814 kvm_debug("HYP VA range: %lx:%lx\n", 1928 kvm_debug("HYP VA range: %lx:%lx\n",
1815 kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL)); 1929 kern_hyp_va(PAGE_OFFSET),
1930 kern_hyp_va((unsigned long)high_memory - 1));
1816 1931
1817 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && 1932 if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
1818 hyp_idmap_start < kern_hyp_va(~0UL) && 1933 hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
1819 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) { 1934 hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
1820 /* 1935 /*
1821 * The idmap page is intersecting with the VA space, 1936 * The idmap page is intersecting with the VA space,
@@ -1859,6 +1974,7 @@ int kvm_mmu_init(void)
1859 goto out; 1974 goto out;
1860 } 1975 }
1861 1976
1977 io_map_base = hyp_idmap_start;
1862 return 0; 1978 return 0;
1863out: 1979out:
1864 free_hyp_pgds(); 1980 free_hyp_pgds();
@@ -2035,7 +2151,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
2035 */ 2151 */
2036void kvm_set_way_flush(struct kvm_vcpu *vcpu) 2152void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2037{ 2153{
2038 unsigned long hcr = vcpu_get_hcr(vcpu); 2154 unsigned long hcr = *vcpu_hcr(vcpu);
2039 2155
2040 /* 2156 /*
2041 * If this is the first time we do a S/W operation 2157 * If this is the first time we do a S/W operation
@@ -2050,7 +2166,7 @@ void kvm_set_way_flush(struct kvm_vcpu *vcpu)
2050 trace_kvm_set_way_flush(*vcpu_pc(vcpu), 2166 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
2051 vcpu_has_cache_enabled(vcpu)); 2167 vcpu_has_cache_enabled(vcpu));
2052 stage2_flush_vm(vcpu->kvm); 2168 stage2_flush_vm(vcpu->kvm);
2053 vcpu_set_hcr(vcpu, hcr | HCR_TVM); 2169 *vcpu_hcr(vcpu) = hcr | HCR_TVM;
2054 } 2170 }
2055} 2171}
2056 2172
@@ -2068,7 +2184,7 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
2068 2184
2069 /* Caches are now on, stop trapping VM ops (until a S/W op) */ 2185 /* Caches are now on, stop trapping VM ops (until a S/W op) */
2070 if (now_enabled) 2186 if (now_enabled)
2071 vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM); 2187 *vcpu_hcr(vcpu) &= ~HCR_TVM;
2072 2188
2073 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); 2189 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2074} 2190}
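
create_hyp_io_mappings() now takes the physical range and hands back both the kernel and the HYP virtual addresses, carving the HYP VA out of a private allocator (io_map_base) that grows downwards from the idmap page. A hypothetical caller, for illustration only (dev_phys and dev_size are made-up parameters; the real conversions are in the vgic-v2.c hunks further down):

    static int example_map_device(phys_addr_t dev_phys, size_t dev_size)
    {
            void __iomem *kaddr, *haddr;
            int ret;

            ret = create_hyp_io_mappings(dev_phys, dev_size, &kaddr, &haddr);
            if (ret)
                    return ret;

            /* kaddr is usable from kernel context, haddr only from EL2/HYP
             * code; on VHE the two are the same pointer. */
            return 0;
    }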
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 8a9c42366db7..1c5b76c46e26 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -37,7 +37,7 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
37 37
38 reg = (select_idx == ARMV8_PMU_CYCLE_IDX) 38 reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
39 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; 39 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
40 counter = vcpu_sys_reg(vcpu, reg); 40 counter = __vcpu_sys_reg(vcpu, reg);
41 41
42 /* The real counter value is equal to the value of counter register plus 42 /* The real counter value is equal to the value of counter register plus
43 * the value perf event counts. 43 * the value perf event counts.
@@ -61,7 +61,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
61 61
62 reg = (select_idx == ARMV8_PMU_CYCLE_IDX) 62 reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
63 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx; 63 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
64 vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx); 64 __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
65} 65}
66 66
67/** 67/**
@@ -78,7 +78,7 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
78 counter = kvm_pmu_get_counter_value(vcpu, pmc->idx); 78 counter = kvm_pmu_get_counter_value(vcpu, pmc->idx);
79 reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX) 79 reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
80 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx; 80 ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
81 vcpu_sys_reg(vcpu, reg) = counter; 81 __vcpu_sys_reg(vcpu, reg) = counter;
82 perf_event_disable(pmc->perf_event); 82 perf_event_disable(pmc->perf_event);
83 perf_event_release_kernel(pmc->perf_event); 83 perf_event_release_kernel(pmc->perf_event);
84 pmc->perf_event = NULL; 84 pmc->perf_event = NULL;
@@ -125,7 +125,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
125 125
126u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) 126u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
127{ 127{
128 u64 val = vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; 128 u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
129 129
130 val &= ARMV8_PMU_PMCR_N_MASK; 130 val &= ARMV8_PMU_PMCR_N_MASK;
131 if (val == 0) 131 if (val == 0)
@@ -147,7 +147,7 @@ void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val)
147 struct kvm_pmu *pmu = &vcpu->arch.pmu; 147 struct kvm_pmu *pmu = &vcpu->arch.pmu;
148 struct kvm_pmc *pmc; 148 struct kvm_pmc *pmc;
149 149
150 if (!(vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) 150 if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
151 return; 151 return;
152 152
153 for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { 153 for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
@@ -193,10 +193,10 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
193{ 193{
194 u64 reg = 0; 194 u64 reg = 0;
195 195
196 if ((vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { 196 if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
197 reg = vcpu_sys_reg(vcpu, PMOVSSET_EL0); 197 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
198 reg &= vcpu_sys_reg(vcpu, PMCNTENSET_EL0); 198 reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
199 reg &= vcpu_sys_reg(vcpu, PMINTENSET_EL1); 199 reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
200 reg &= kvm_pmu_valid_counter_mask(vcpu); 200 reg &= kvm_pmu_valid_counter_mask(vcpu);
201 } 201 }
202 202
@@ -295,7 +295,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
295 struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); 295 struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
296 int idx = pmc->idx; 296 int idx = pmc->idx;
297 297
298 vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); 298 __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
299 299
300 if (kvm_pmu_overflow_status(vcpu)) { 300 if (kvm_pmu_overflow_status(vcpu)) {
301 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); 301 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
@@ -316,19 +316,19 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
316 if (val == 0) 316 if (val == 0)
317 return; 317 return;
318 318
319 enable = vcpu_sys_reg(vcpu, PMCNTENSET_EL0); 319 enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
320 for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) { 320 for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
321 if (!(val & BIT(i))) 321 if (!(val & BIT(i)))
322 continue; 322 continue;
323 type = vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i) 323 type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
324 & ARMV8_PMU_EVTYPE_EVENT; 324 & ARMV8_PMU_EVTYPE_EVENT;
325 if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR) 325 if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR)
326 && (enable & BIT(i))) { 326 && (enable & BIT(i))) {
327 reg = vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1; 327 reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
328 reg = lower_32_bits(reg); 328 reg = lower_32_bits(reg);
329 vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg; 329 __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
330 if (!reg) 330 if (!reg)
331 vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); 331 __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
332 } 332 }
333 } 333 }
334} 334}
@@ -348,7 +348,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
348 mask = kvm_pmu_valid_counter_mask(vcpu); 348 mask = kvm_pmu_valid_counter_mask(vcpu);
349 if (val & ARMV8_PMU_PMCR_E) { 349 if (val & ARMV8_PMU_PMCR_E) {
350 kvm_pmu_enable_counter(vcpu, 350 kvm_pmu_enable_counter(vcpu,
351 vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); 351 __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
352 } else { 352 } else {
353 kvm_pmu_disable_counter(vcpu, mask); 353 kvm_pmu_disable_counter(vcpu, mask);
354 } 354 }
@@ -369,8 +369,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
369 369
370static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx) 370static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
371{ 371{
372 return (vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && 372 return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
373 (vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx)); 373 (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
374} 374}
375 375
376/** 376/**
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 743ca5cb05ef..68378fe17a0e 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -166,12 +166,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
166 kvm->arch.vgic.in_kernel = true; 166 kvm->arch.vgic.in_kernel = true;
167 kvm->arch.vgic.vgic_model = type; 167 kvm->arch.vgic.vgic_model = type;
168 168
169 /*
170 * kvm_vgic_global_state.vctrl_base is set on vgic probe (kvm_arch_init)
171 * it is stored in distributor struct for asm save/restore purpose
172 */
173 kvm->arch.vgic.vctrl_base = kvm_vgic_global_state.vctrl_base;
174
175 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; 169 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
176 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; 170 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
177 kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; 171 kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
@@ -302,17 +296,6 @@ int vgic_init(struct kvm *kvm)
302 296
303 dist->initialized = true; 297 dist->initialized = true;
304 298
305 /*
306 * If we're initializing GICv2 on-demand when first running the VCPU
307 * then we need to load the VGIC state onto the CPU. We can detect
308 * this easily by checking if we are in between vcpu_load and vcpu_put
309 * when we just initialized the VGIC.
310 */
311 preempt_disable();
312 vcpu = kvm_arm_get_running_vcpu();
313 if (vcpu)
314 kvm_vgic_load(vcpu);
315 preempt_enable();
316out: 299out:
317 return ret; 300 return ret;
318} 301}
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 465095355666..a8f07243aa9f 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -316,21 +316,24 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
316 struct vgic_dist *dist = &vcpu->kvm->arch.vgic; 316 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
317 struct vgic_irq *irq; 317 struct vgic_irq *irq;
318 u32 *intids; 318 u32 *intids;
319 int irq_count = dist->lpi_list_count, i = 0; 319 int irq_count, i = 0;
320 320
321 /* 321 /*
322 * We use the current value of the list length, which may change 322 * There is an obvious race between allocating the array and LPIs
323 * after the kmalloc. We don't care, because the guest shouldn't 323 * being mapped/unmapped. If we ended up here as a result of a
324 * change anything while the command handling is still running, 324 * command, we're safe (locks are held, preventing another
325 * and in the worst case we would miss a new IRQ, which one wouldn't 325 * command). If coming from another path (such as enabling LPIs),
326 * expect to be covered by this command anyway. 326 * we must be careful not to overrun the array.
327 */ 327 */
328 irq_count = READ_ONCE(dist->lpi_list_count);
328 intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); 329 intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
329 if (!intids) 330 if (!intids)
330 return -ENOMEM; 331 return -ENOMEM;
331 332
332 spin_lock(&dist->lpi_list_lock); 333 spin_lock(&dist->lpi_list_lock);
333 list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { 334 list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
335 if (i == irq_count)
336 break;
334 /* We don't need to "get" the IRQ, as we hold the list lock. */ 337 /* We don't need to "get" the IRQ, as we hold the list lock. */
335 if (irq->target_vcpu != vcpu) 338 if (irq->target_vcpu != vcpu)
336 continue; 339 continue;
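
The vgic_copy_lpi_list() fix above is a general pattern: snapshot the element count once with READ_ONCE(), size the array from that snapshot, and bound the locked walk by the same snapshot so a list that grew in the meantime cannot overrun the allocation. A generic sketch, illustration only (struct item, item_count, item_lock and item_list are made-up names):

    static int snapshot_item_ids(u32 **out)
    {
            int n = READ_ONCE(item_count);          /* one consistent snapshot */
            u32 *ids = kmalloc_array(n, sizeof(*ids), GFP_KERNEL);
            struct item *it;
            int i = 0;

            if (!ids)
                    return -ENOMEM;

            spin_lock(&item_lock);
            list_for_each_entry(it, &item_list, node) {
                    if (i == n)                     /* never write past the snapshot */
                            break;
                    ids[i++] = it->id;
            }
            spin_unlock(&item_lock);

            *out = ids;
            return i;                               /* entries actually copied */
    }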
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 29556f71b691..45aa433f018f 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -105,12 +105,9 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
105 105
106 /* 106 /*
107 * Clear soft pending state when level irqs have been acked. 107 * Clear soft pending state when level irqs have been acked.
108 * Always regenerate the pending state.
109 */ 108 */
110 if (irq->config == VGIC_CONFIG_LEVEL) { 109 if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
111 if (!(val & GICH_LR_PENDING_BIT)) 110 irq->pending_latch = false;
112 irq->pending_latch = false;
113 }
114 111
115 /* 112 /*
116 * Level-triggered mapped IRQs are special because we only 113 * Level-triggered mapped IRQs are special because we only
@@ -153,8 +150,35 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
153void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) 150void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
154{ 151{
155 u32 val = irq->intid; 152 u32 val = irq->intid;
153 bool allow_pending = true;
154
155 if (irq->active)
156 val |= GICH_LR_ACTIVE_BIT;
157
158 if (irq->hw) {
159 val |= GICH_LR_HW;
160 val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
161 /*
162 * Never set pending+active on a HW interrupt, as the
163 * pending state is kept at the physical distributor
164 * level.
165 */
166 if (irq->active)
167 allow_pending = false;
168 } else {
169 if (irq->config == VGIC_CONFIG_LEVEL) {
170 val |= GICH_LR_EOI;
156 171
157 if (irq_is_pending(irq)) { 172 /*
173 * Software resampling doesn't work very well
174 * if we allow P+A, so let's not do that.
175 */
176 if (irq->active)
177 allow_pending = false;
178 }
179 }
180
181 if (allow_pending && irq_is_pending(irq)) {
158 val |= GICH_LR_PENDING_BIT; 182 val |= GICH_LR_PENDING_BIT;
159 183
160 if (irq->config == VGIC_CONFIG_EDGE) 184 if (irq->config == VGIC_CONFIG_EDGE)
@@ -171,24 +195,6 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
171 } 195 }
172 } 196 }
173 197
174 if (irq->active)
175 val |= GICH_LR_ACTIVE_BIT;
176
177 if (irq->hw) {
178 val |= GICH_LR_HW;
179 val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
180 /*
181 * Never set pending+active on a HW interrupt, as the
182 * pending state is kept at the physical distributor
183 * level.
184 */
185 if (irq->active && irq_is_pending(irq))
186 val &= ~GICH_LR_PENDING_BIT;
187 } else {
188 if (irq->config == VGIC_CONFIG_LEVEL)
189 val |= GICH_LR_EOI;
190 }
191
192 /* 198 /*
193 * Level-triggered mapped IRQs are special because we only observe 199 * Level-triggered mapped IRQs are special because we only observe
194 * rising edges as input to the VGIC. We therefore lower the line 200 * rising edges as input to the VGIC. We therefore lower the line
@@ -272,7 +278,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu)
272 * anyway. 278 * anyway.
273 */ 279 */
274 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; 280 vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
275 vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
276 281
277 /* Get the show on the road... */ 282 /* Get the show on the road... */
278 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; 283 vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -368,16 +373,11 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
368 if (!PAGE_ALIGNED(info->vcpu.start) || 373 if (!PAGE_ALIGNED(info->vcpu.start) ||
369 !PAGE_ALIGNED(resource_size(&info->vcpu))) { 374 !PAGE_ALIGNED(resource_size(&info->vcpu))) {
370 kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n"); 375 kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
371 kvm_vgic_global_state.vcpu_base_va = ioremap(info->vcpu.start,
372 resource_size(&info->vcpu));
373 if (!kvm_vgic_global_state.vcpu_base_va) {
374 kvm_err("Cannot ioremap GICV\n");
375 return -ENOMEM;
376 }
377 376
378 ret = create_hyp_io_mappings(kvm_vgic_global_state.vcpu_base_va, 377 ret = create_hyp_io_mappings(info->vcpu.start,
379 kvm_vgic_global_state.vcpu_base_va + resource_size(&info->vcpu), 378 resource_size(&info->vcpu),
380 info->vcpu.start); 379 &kvm_vgic_global_state.vcpu_base_va,
380 &kvm_vgic_global_state.vcpu_hyp_va);
381 if (ret) { 381 if (ret) {
382 kvm_err("Cannot map GICV into hyp\n"); 382 kvm_err("Cannot map GICV into hyp\n");
383 goto out; 383 goto out;
@@ -386,26 +386,18 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
386 static_branch_enable(&vgic_v2_cpuif_trap); 386 static_branch_enable(&vgic_v2_cpuif_trap);
387 } 387 }
388 388
389 kvm_vgic_global_state.vctrl_base = ioremap(info->vctrl.start, 389 ret = create_hyp_io_mappings(info->vctrl.start,
390 resource_size(&info->vctrl)); 390 resource_size(&info->vctrl),
391 if (!kvm_vgic_global_state.vctrl_base) { 391 &kvm_vgic_global_state.vctrl_base,
392 kvm_err("Cannot ioremap GICH\n"); 392 &kvm_vgic_global_state.vctrl_hyp);
393 ret = -ENOMEM; 393 if (ret) {
394 kvm_err("Cannot map VCTRL into hyp\n");
394 goto out; 395 goto out;
395 } 396 }
396 397
397 vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); 398 vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
398 kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; 399 kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
399 400
400 ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
401 kvm_vgic_global_state.vctrl_base +
402 resource_size(&info->vctrl),
403 info->vctrl.start);
404 if (ret) {
405 kvm_err("Cannot map VCTRL into hyp\n");
406 goto out;
407 }
408
409 ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); 401 ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
410 if (ret) { 402 if (ret) {
411 kvm_err("Cannot register GICv2 KVM device\n"); 403 kvm_err("Cannot register GICv2 KVM device\n");
@@ -429,18 +421,74 @@ out:
429 return ret; 421 return ret;
430} 422}
431 423
424static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
425{
426 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
427 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
428 u64 elrsr;
429 int i;
430
431 elrsr = readl_relaxed(base + GICH_ELRSR0);
432 if (unlikely(used_lrs > 32))
433 elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
434
435 for (i = 0; i < used_lrs; i++) {
436 if (elrsr & (1UL << i))
437 cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
438 else
439 cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
440
441 writel_relaxed(0, base + GICH_LR0 + (i * 4));
442 }
443}
444
445void vgic_v2_save_state(struct kvm_vcpu *vcpu)
446{
447 void __iomem *base = kvm_vgic_global_state.vctrl_base;
448 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
449
450 if (!base)
451 return;
452
453 if (used_lrs) {
454 save_lrs(vcpu, base);
455 writel_relaxed(0, base + GICH_HCR);
456 }
457}
458
459void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
460{
461 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
462 void __iomem *base = kvm_vgic_global_state.vctrl_base;
463 u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
464 int i;
465
466 if (!base)
467 return;
468
469 if (used_lrs) {
470 writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
471 for (i = 0; i < used_lrs; i++) {
472 writel_relaxed(cpu_if->vgic_lr[i],
473 base + GICH_LR0 + (i * 4));
474 }
475 }
476}
477
432void vgic_v2_load(struct kvm_vcpu *vcpu) 478void vgic_v2_load(struct kvm_vcpu *vcpu)
433{ 479{
434 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; 480 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
435 struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
436 481
437 writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR); 482 writel_relaxed(cpu_if->vgic_vmcr,
483 kvm_vgic_global_state.vctrl_base + GICH_VMCR);
484 writel_relaxed(cpu_if->vgic_apr,
485 kvm_vgic_global_state.vctrl_base + GICH_APR);
438} 486}
439 487
440void vgic_v2_put(struct kvm_vcpu *vcpu) 488void vgic_v2_put(struct kvm_vcpu *vcpu)
441{ 489{
442 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; 490 struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
443 struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
444 491
445 cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR); 492 cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
493 cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
446} 494}
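
The reordered vgic_v2_populate_lr() above now makes a single allow_pending decision before filling in the pending bits. Condensed as a predicate, for illustration only (this helper is not in the patch), the policy is:

    static bool lr_allow_pending(const struct vgic_irq *irq)
    {
            if (!irq->active)
                    return true;    /* no pending+active conflict possible */
            if (irq->hw)
                    return false;   /* pending state lives at the physical distributor */
            if (irq->config == VGIC_CONFIG_LEVEL)
                    return false;   /* software resampling does not cope with P+A */
            return true;            /* edge-triggered SW interrupts may be pending+active */
    }

The GICv3 variant in the vgic-v3.c hunks below applies the identical rule.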
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 0ff2006f3781..8195f52ae6f0 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -16,6 +16,7 @@
16#include <linux/kvm.h> 16#include <linux/kvm.h>
17#include <linux/kvm_host.h> 17#include <linux/kvm_host.h>
18#include <kvm/arm_vgic.h> 18#include <kvm/arm_vgic.h>
19#include <asm/kvm_hyp.h>
19#include <asm/kvm_mmu.h> 20#include <asm/kvm_mmu.h>
20#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
21 22
@@ -96,12 +97,9 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
96 97
97 /* 98 /*
98 * Clear soft pending state when level irqs have been acked. 99 * Clear soft pending state when level irqs have been acked.
99 * Always regenerate the pending state.
100 */ 100 */
101 if (irq->config == VGIC_CONFIG_LEVEL) { 101 if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
102 if (!(val & ICH_LR_PENDING_BIT)) 102 irq->pending_latch = false;
103 irq->pending_latch = false;
104 }
105 103
106 /* 104 /*
107 * Level-triggered mapped IRQs are special because we only 105 * Level-triggered mapped IRQs are special because we only
@@ -135,8 +133,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
135{ 133{
136 u32 model = vcpu->kvm->arch.vgic.vgic_model; 134 u32 model = vcpu->kvm->arch.vgic.vgic_model;
137 u64 val = irq->intid; 135 u64 val = irq->intid;
136 bool allow_pending = true;
137
138 if (irq->active)
139 val |= ICH_LR_ACTIVE_BIT;
140
141 if (irq->hw) {
142 val |= ICH_LR_HW;
143 val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
144 /*
145 * Never set pending+active on a HW interrupt, as the
146 * pending state is kept at the physical distributor
147 * level.
148 */
149 if (irq->active)
150 allow_pending = false;
151 } else {
152 if (irq->config == VGIC_CONFIG_LEVEL) {
153 val |= ICH_LR_EOI;
154
155 /*
156 * Software resampling doesn't work very well
157 * if we allow P+A, so let's not do that.
158 */
159 if (irq->active)
160 allow_pending = false;
161 }
162 }
138 163
139 if (irq_is_pending(irq)) { 164 if (allow_pending && irq_is_pending(irq)) {
140 val |= ICH_LR_PENDING_BIT; 165 val |= ICH_LR_PENDING_BIT;
141 166
142 if (irq->config == VGIC_CONFIG_EDGE) 167 if (irq->config == VGIC_CONFIG_EDGE)
@@ -154,24 +179,6 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 		}
 	}
 
-	if (irq->active)
-		val |= ICH_LR_ACTIVE_BIT;
-
-	if (irq->hw) {
-		val |= ICH_LR_HW;
-		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
-		/*
-		 * Never set pending+active on a HW interrupt, as the
-		 * pending state is kept at the physical distributor
-		 * level.
-		 */
-		if (irq->active && irq_is_pending(irq))
-			val &= ~ICH_LR_PENDING_BIT;
-	} else {
-		if (irq->config == VGIC_CONFIG_LEVEL)
-			val |= ICH_LR_EOI;
-	}
-
 	/*
 	 * Level-triggered mapped IRQs are special because we only observe
 	 * rising edges as input to the VGIC. We therefore lower the line
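
Taken together, the two hunks above hoist the active/HW/EOI handling ahead of the pending check and replace the late "clear the pending bit again" fix-up with a single allow_pending flag: a pending+active pair is never presented for hardware-mapped interrupts (their pending state lives in the physical distributor) nor for level-triggered interrupts that rely on software resampling via EOI. A condensed standalone model of that decision (struct virq and lr_compute() are illustrative, not the kernel's types; bit values are local defines):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LR_PENDING	(1ULL << 62)
#define LR_ACTIVE	(1ULL << 63)
#define LR_HW		(1ULL << 61)
#define LR_EOI		(1ULL << 41)

/* Illustrative per-IRQ state, loosely mirroring struct vgic_irq. */
struct virq {
	bool pending;
	bool active;
	bool hw;		/* mapped to a physical interrupt */
	bool level;		/* level-triggered (vs. edge) */
};

static uint64_t lr_compute(const struct virq *irq)
{
	uint64_t val = 0;
	bool allow_pending = true;

	if (irq->active)
		val |= LR_ACTIVE;

	if (irq->hw) {
		val |= LR_HW;
		/* pending state is tracked by the physical distributor */
		if (irq->active)
			allow_pending = false;
	} else if (irq->level) {
		val |= LR_EOI;
		/* software resampling does not mix well with P+A */
		if (irq->active)
			allow_pending = false;
	}

	if (allow_pending && irq->pending)
		val |= LR_PENDING;

	return val;
}

int main(void)
{
	struct virq hw_pa = { .pending = true, .active = true, .hw = true };

	/* prints an LR with ACTIVE and HW set but PENDING suppressed */
	printf("lr=%#llx\n", (unsigned long long)lr_compute(&hw_pa));
	return 0;
}
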
@@ -274,7 +281,6 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
 	 * anyway.
 	 */
 	vgic_v3->vgic_vmcr = 0;
-	vgic_v3->vgic_elrsr = ~0;
 
 	/*
 	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -595,6 +601,11 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
 	 */
 	if (likely(cpu_if->vgic_sre))
 		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+
+	kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
+
+	if (has_vhe())
+		__vgic_v3_activate_traps(vcpu);
 }
 
 void vgic_v3_put(struct kvm_vcpu *vcpu)
@@ -603,4 +614,9 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
 
 	if (likely(cpu_if->vgic_sre))
 		cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr);
+
+	kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
+
+	if (has_vhe())
+		__vgic_v3_deactivate_traps(vcpu);
 }
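
The two hunks above keep vgic_v3_load() and vgic_v3_put() strictly symmetric: the active-priority registers are restored/saved through hyp calls, and on VHE systems the GICv3 traps are switched on at load time and off again at put time. A small illustrative model of that pairing invariant (all names below are placeholders, not kernel symbols):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool has_vhe_system = true;	/* pretend the kernel runs at EL2 */
static bool traps_active;
static int aprs_in_hw;			/* how many vCPUs' APRs are loaded */

static void v3_load(void)
{
	aprs_in_hw++;			/* analogous to restoring the APRs */
	if (has_vhe_system)
		traps_active = true;	/* analogous to activating the traps */
}

static void v3_put(void)
{
	aprs_in_hw--;			/* analogous to saving the APRs */
	if (has_vhe_system)
		traps_active = false;	/* analogous to deactivating the traps */
}

int main(void)
{
	v3_load();
	v3_put();
	/* everything done in load has been undone by the matching put */
	assert(aprs_in_hw == 0 && !traps_active);
	printf("load/put stayed balanced\n");
	return 0;
}
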
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 8201899126f6..e74baec76361 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -19,6 +19,7 @@
 #include <linux/list_sort.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <asm/kvm_hyp.h>
 
 #include "vgic.h"
 
@@ -808,6 +809,24 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
 	vgic_clear_lr(vcpu, count);
 }
 
+static inline bool can_access_vgic_from_kernel(void)
+{
+	/*
+	 * GICv2 can always be accessed from the kernel because it is
+	 * memory-mapped, and VHE systems can access GICv3 EL2 system
+	 * registers.
+	 */
+	return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
+}
+
+static inline void vgic_save_state(struct kvm_vcpu *vcpu)
+{
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		vgic_v2_save_state(vcpu);
+	else
+		__vgic_v3_save_state(vcpu);
+}
+
 /* Sync back the hardware VGIC state into our emulation after a guest's run. */
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
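
The helpers added above pick between the memory-mapped GICv2 path and the sysreg-based GICv3 path, and the predicate allows direct access exactly when the CPU interface is GICv2 (always MMIO) or when the GICv3 system registers are reachable because the kernel itself runs at EL2 (VHE). A minimal sketch of the same dispatch with plain booleans standing in for the static key and has_vhe() (names are illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool gicv3_cpuif;	/* host uses a GICv3 CPU interface */
static bool vhe;		/* kernel runs at EL2 */

static bool can_access_vgic_from_kernel(void)
{
	/* GICv2 is MMIO (always reachable); GICv3 sysregs need VHE. */
	return !gicv3_cpuif || vhe;
}

static void save_vgic_state(void)
{
	if (!gicv3_cpuif)
		printf("save via GICv2 MMIO path\n");
	else
		printf("save via GICv3 sysreg path\n");
}

int main(void)
{
	gicv3_cpuif = true;
	vhe = true;

	if (can_access_vgic_from_kernel())
		save_vgic_state();	/* otherwise left to the hyp switch code */
	return 0;
}
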
@@ -819,11 +838,22 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
 		return;
 
+	if (can_access_vgic_from_kernel())
+		vgic_save_state(vcpu);
+
 	if (vgic_cpu->used_lrs)
 		vgic_fold_lr_state(vcpu);
 	vgic_prune_ap_list(vcpu);
 }
 
+static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+		vgic_v2_restore_state(vcpu);
+	else
+		__vgic_v3_restore_state(vcpu);
+}
+
 /* Flush our emulation state into the GIC hardware before entering the guest. */
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -846,6 +876,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
 	spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
 	vgic_flush_lr_state(vcpu);
 	spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+
+	if (can_access_vgic_from_kernel())
+		vgic_restore_state(vcpu);
 }
 
 void kvm_vgic_load(struct kvm_vcpu *vcpu)
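
With the hunks above, the hardware state is saved from the kernel right after a guest exit, before the list registers are folded back into the emulation, and restored right before entry, after the ap_list has been flushed into the list registers; when the kernel cannot touch the CPU interface directly, the hyp switch code still handles it. A trivial standalone sketch of that run-loop ordering (the function names are placeholders that stand for the calls in the diff):

#include <stdio.h>

static void guest_exit_path(void)
{
	printf("save LRs from hardware\n");	/* stands for vgic_save_state() */
	printf("fold LR state\n");		/* stands for vgic_fold_lr_state() */
	printf("prune ap_list\n");		/* stands for vgic_prune_ap_list() */
}

static void guest_entry_path(void)
{
	printf("flush ap_list into LRs\n");	/* stands for vgic_flush_lr_state() */
	printf("restore LRs to hardware\n");	/* stands for vgic_restore_state() */
}

int main(void)
{
	guest_exit_path();
	guest_entry_path();
	return 0;
}
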
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index f5b8519e5546..830e815748a0 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -178,6 +178,9 @@ void vgic_v2_init_lrs(void);
 void vgic_v2_load(struct kvm_vcpu *vcpu);
 void vgic_v2_put(struct kvm_vcpu *vcpu);
 
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
 static inline void vgic_get_irq_kref(struct vgic_irq *irq)
 {
 	if (irq->intid < VGIC_MIN_LPI)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65dea3ffef68..c7b2e927f699 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3398,21 +3398,6 @@ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 	return kvm_io_bus_cmp(p1, p2);
 }
 
-static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
-				 gpa_t addr, int len)
-{
-	bus->range[bus->dev_count++] = (struct kvm_io_range) {
-		.addr = addr,
-		.len = len,
-		.dev = dev,
-	};
-
-	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
-	     kvm_io_bus_sort_cmp, NULL);
-
-	return 0;
-}
-
 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
 				    gpa_t addr, int len)
 {
@@ -3553,7 +3538,9 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev)
 {
+	int i;
 	struct kvm_io_bus *new_bus, *bus;
+	struct kvm_io_range range;
 
 	bus = kvm_get_bus(kvm, bus_idx);
 	if (!bus)
@@ -3567,9 +3554,22 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
-	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
-	       sizeof(struct kvm_io_range)));
-	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
+
+	range = (struct kvm_io_range) {
+		.addr = addr,
+		.len = len,
+		.dev = dev,
+	};
+
+	for (i = 0; i < bus->dev_count; i++)
+		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
+			break;
+
+	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+	new_bus->dev_count++;
+	new_bus->range[i] = range;
+	memcpy(new_bus->range + i + 1, bus->range + i,
+	       (bus->dev_count - i) * sizeof(struct kvm_io_range));
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
 	kfree(bus);
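
With the hunks above, kvm_io_bus_register_dev() no longer appends the new range to the copied bus and re-sorts it; it finds the insertion point, copies the sorted prefix, drops the new range in place, and copies the suffix, so the array that gets published via RCU is sorted at every step. A standalone sketch of the same copy-with-insertion-point idea for a plain int array (insert_sorted_copy() and the variable names are illustrative, not KVM code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Allocate a copy one element larger, memcpy the prefix, place the new
 * element at its sorted position, memcpy the suffix.  The original array
 * is never modified, mirroring the publish-then-free scheme in the diff.
 */
static int *insert_sorted_copy(const int *old, size_t n, int val)
{
	size_t i;
	int *copy = malloc((n + 1) * sizeof(*copy));

	if (!copy)
		return NULL;

	/* find the first element that should come after the new value */
	for (i = 0; i < n; i++)
		if (old[i] > val)
			break;

	memcpy(copy, old, i * sizeof(*copy));
	copy[i] = val;
	memcpy(copy + i + 1, old + i, (n - i) * sizeof(*copy));
	return copy;
}

int main(void)
{
	int bus[] = { 10, 20, 40 };
	int *new_bus = insert_sorted_copy(bus, 3, 30);

	if (new_bus) {
		for (size_t i = 0; i < 4; i++)
			printf("%d ", new_bus[i]);
		printf("\n");		/* prints: 10 20 30 40 */
		free(new_bus);
	}
	return 0;
}
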