author    Linus Torvalds <torvalds@linux-foundation.org>    2017-02-22 21:22:53 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2017-02-22 21:22:53 -0500
commit    fd7e9a88348472521d999434ee02f25735c7dadf
tree      90e6249e58d90ba9d590cfed4481c29ca36a05dc
parent    5066e4a34081dd82fb625f2f382bfa29ca421a3f
parent    dd0fd8bca1850ddadf5d33a9ed28f3707cd98ac7
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "4.11 is going to be a relatively large release for KVM, with a little
  over 200 commits and noteworthy changes for most architectures.

  ARM:
   - GICv3 save/restore
   - cache flushing fixes
   - working MSI injection for GICv3 ITS
   - physical timer emulation

  MIPS:
   - various improvements under the hood
   - support for SMP guests
   - a large rewrite of MMU emulation. KVM MIPS can now use MMU
     notifiers to support copy-on-write, KSM, idle page tracking,
     swapping, ballooning and everything else. KVM_CAP_READONLY_MEM is
     also supported, so that writes to some memory regions can be
     treated as MMIO. The new MMU also paves the way for hardware
     virtualization support.

  PPC:
   - support for POWER9 using the radix-tree MMU for host and guest
   - resizable hashed page table
   - bugfixes.

  s390:
   - expose more features to the guest
   - more SIMD extensions
   - instruction execution protection
   - ESOP2

  x86:
   - improved hashing in the MMU
   - faster PageLRU tracking for Intel CPUs without EPT A/D bits
   - some refactoring of nested VMX entry/exit code, preparing for live
     migration support of nested hypervisors
   - expose yet another AVX512 CPUID bit
   - host-to-guest PTP support
   - refactoring of interrupt injection, with some optimizations thrown
     in and some duct tape removed.
   - remove lazy FPU handling
   - optimizations of user-mode exits
   - optimizations of vcpu_is_preempted() for KVM guests

  generic:
   - alternative signaling mechanism that doesn't pound on
     tsk->sighand->siglock"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (195 commits)
  x86/kvm: Provide optimized version of vcpu_is_preempted() for x86-64
  x86/paravirt: Change vcp_is_preempted() arg type to long
  KVM: VMX: use correct vmcs_read/write for guest segment selector/base
  x86/kvm/vmx: Defer TR reload after VM exit
  x86/asm/64: Drop __cacheline_aligned from struct x86_hw_tss
  x86/kvm/vmx: Simplify segment_base()
  x86/kvm/vmx: Get rid of segment_base() on 64-bit kernels
  x86/kvm/vmx: Don't fetch the TSS base from the GDT
  x86/asm: Define the kernel TSS limit in a macro
  kvm: fix page struct leak in handle_vmon
  KVM: PPC: Book3S HV: Disable HPT resizing on POWER9 for now
  KVM: Return an error code only as a constant in kvm_get_dirty_log()
  KVM: Return an error code only as a constant in kvm_get_dirty_log_protect()
  KVM: Return directly after a failed copy_from_user() in kvm_vm_compat_ioctl()
  KVM: x86: remove code for lazy FPU handling
  KVM: race-free exit from KVM_RUN without POSIX signals
  KVM: PPC: Book3S HV: Turn "KVM guest htab" message into a debug message
  KVM: PPC: Book3S PR: Ratelimit copy data failure error messages
  KVM: Support vCPU-based gfn->hva cache
  KVM: use separate generations for each address space
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt | 138
-rw-r--r--  Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 11
-rw-r--r--  Documentation/virtual/kvm/hypercalls.txt | 35
-rw-r--r--  Documentation/virtual/kvm/locking.txt | 31
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 3
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 12
-rw-r--r--  arch/arm/include/uapi/asm/kvm.h | 13
-rw-r--r--  arch/arm/kvm/Makefile | 5
-rw-r--r--  arch/arm/kvm/arm.c | 8
-rw-r--r--  arch/arm/kvm/mmu.c | 20
-rw-r--r--  arch/arm/kvm/reset.c | 9
-rw-r--r--  arch/arm/kvm/vgic-v3-coproc.c | 35
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 6
-rw-r--r--  arch/arm64/include/uapi/asm/kvm.h | 13
-rw-r--r--  arch/arm64/kvm/Makefile | 4
-rw-r--r--  arch/arm64/kvm/reset.c | 9
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 92
-rw-r--r--  arch/arm64/kvm/sys_regs.h | 4
-rw-r--r--  arch/arm64/kvm/vgic-sys-reg-v3.c | 346
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 183
-rw-r--r--  arch/mips/include/asm/mmu_context.h | 9
-rw-r--r--  arch/mips/include/uapi/asm/kvm.h | 2
-rw-r--r--  arch/mips/kvm/Kconfig | 2
-rw-r--r--  arch/mips/kvm/dyntrans.c | 52
-rw-r--r--  arch/mips/kvm/emulate.c | 432
-rw-r--r--  arch/mips/kvm/entry.c | 155
-rw-r--r--  arch/mips/kvm/interrupt.c | 5
-rw-r--r--  arch/mips/kvm/mips.c | 503
-rw-r--r--  arch/mips/kvm/mmu.c | 1329
-rw-r--r--  arch/mips/kvm/tlb.c | 291
-rw-r--r--  arch/mips/kvm/trap_emul.c | 734
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 16
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 21
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 15
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 2
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 3
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 635
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 65
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c | 138
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 130
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 192
-rw-r--r--  arch/powerpc/kvm/book3s_xics.h | 7
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 10
-rw-r--r--  arch/s390/kvm/gaccess.c | 26
-rw-r--r--  arch/s390/kvm/gaccess.h | 19
-rw-r--r--  arch/s390/kvm/guestdbg.c | 120
-rw-r--r--  arch/s390/kvm/intercept.c | 25
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 46
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 12
-rw-r--r--  arch/s390/kvm/priv.c | 30
-rw-r--r--  arch/s390/kvm/vsie.c | 3
-rw-r--r--  arch/s390/mm/pgtable.c | 2
-rw-r--r--  arch/s390/tools/gen_facilities.c | 2
-rw-r--r--  arch/x86/include/asm/desc.h | 58
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 30
-rw-r--r--  arch/x86/include/asm/kvmclock.h | 6
-rw-r--r--  arch/x86/include/asm/paravirt.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 12
-rw-r--r--  arch/x86/include/asm/qspinlock.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 28
-rw-r--r--  arch/x86/include/uapi/asm/kvm_para.h | 9
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 9
-rw-r--r--  arch/x86/kernel/ioport.c | 5
-rw-r--r--  arch/x86/kernel/kvm.c | 26
-rw-r--r--  arch/x86/kernel/kvmclock.c | 5
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 10
-rw-r--r--  arch/x86/kvm/cpuid.c | 10
-rw-r--r--  arch/x86/kvm/emulate.c | 20
-rw-r--r--  arch/x86/kvm/hyperv.c | 4
-rw-r--r--  arch/x86/kvm/i8259.c | 16
-rw-r--r--  arch/x86/kvm/irq.h | 19
-rw-r--r--  arch/x86/kvm/irq_comm.c | 29
-rw-r--r--  arch/x86/kvm/lapic.c | 197
-rw-r--r--  arch/x86/kvm/lapic.h | 16
-rw-r--r--  arch/x86/kvm/mmu.c | 509
-rw-r--r--  arch/x86/kvm/svm.c | 57
-rw-r--r--  arch/x86/kvm/vmx.c | 909
-rw-r--r--  arch/x86/kvm/x86.c | 274
-rw-r--r--  drivers/ptp/Kconfig | 12
-rw-r--r--  drivers/ptp/Makefile | 1
-rw-r--r--  drivers/ptp/ptp_kvm.c | 207
-rw-r--r--  include/kvm/arm_arch_timer.h | 39
-rw-r--r--  include/kvm/arm_vgic.h | 18
-rw-r--r--  include/linux/irqchip/arm-gic-v3.h | 45
-rw-r--r--  include/linux/kvm_host.h | 18
-rw-r--r--  include/uapi/linux/kvm.h | 15
-rw-r--r--  include/uapi/linux/kvm_para.h | 2
-rw-r--r--  virt/kvm/arm/arch_timer.c | 201
-rw-r--r--  virt/kvm/arm/hyp/timer-sr.c | 13
-rw-r--r--  virt/kvm/arm/vgic/vgic-debug.c | 283
-rw-r--r--  virt/kvm/arm/vgic/vgic-init.c | 4
-rw-r--r--  virt/kvm/arm/vgic/vgic-irqfd.c | 3
-rw-r--r--  virt/kvm/arm/vgic/vgic-its.c | 6
-rw-r--r--  virt/kvm/arm/vgic/vgic-kvm-device.c | 231
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v2.c | 87
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio-v3.c | 203
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.c | 167
-rw-r--r--  virt/kvm/arm/vgic/vgic-mmio.h | 24
-rw-r--r--  virt/kvm/arm/vgic/vgic-v2.c | 12
-rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c | 40
-rw-r--r--  virt/kvm/arm/vgic/vgic.c | 66
-rw-r--r--  virt/kvm/arm/vgic/vgic.h | 83
-rw-r--r--  virt/kvm/kvm_main.c | 113
110 files changed, 7277 insertions, 2968 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4470671b0c26..069450938b79 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2061,6 +2061,8 @@ registers, find a list below:
2061 MIPS | KVM_REG_MIPS_LO | 64 2061 MIPS | KVM_REG_MIPS_LO | 64
2062 MIPS | KVM_REG_MIPS_PC | 64 2062 MIPS | KVM_REG_MIPS_PC | 64
2063 MIPS | KVM_REG_MIPS_CP0_INDEX | 32 2063 MIPS | KVM_REG_MIPS_CP0_INDEX | 32
2064 MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64
2065 MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64
2064 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 2066 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64
2065 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 2067 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64
2066 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 2068 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32
@@ -2071,9 +2073,11 @@ registers, find a list below:
2071 MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 2073 MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64
2072 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 2074 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32
2073 MIPS | KVM_REG_MIPS_CP0_STATUS | 32 2075 MIPS | KVM_REG_MIPS_CP0_STATUS | 32
2076 MIPS | KVM_REG_MIPS_CP0_INTCTL | 32
2074 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 2077 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32
2075 MIPS | KVM_REG_MIPS_CP0_EPC | 64 2078 MIPS | KVM_REG_MIPS_CP0_EPC | 64
2076 MIPS | KVM_REG_MIPS_CP0_PRID | 32 2079 MIPS | KVM_REG_MIPS_CP0_PRID | 32
2080 MIPS | KVM_REG_MIPS_CP0_EBASE | 64
2077 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 2081 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32
2078 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 2082 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32
2079 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 2083 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32
@@ -2148,6 +2152,12 @@ patterns depending on whether they're 32-bit or 64-bit registers:
2148 0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit) 2152 0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit)
2149 0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit) 2153 0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit)
2150 2154
2155Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64
2156versions of the EntryLo registers regardless of the word size of the host
2157hardware, host kernel, guest, and whether XPA is present in the guest, i.e.
2158with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and
2159the PFNX field starting at bit 30.
2160
2151MIPS KVM control registers (see above) have the following id bit patterns: 2161MIPS KVM control registers (see above) have the following id bit patterns:
2152 0x7030 0000 0002 <reg:16> 2162 0x7030 0000 0002 <reg:16>
2153 2163
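
For illustration only, here is a minimal userspace sketch of reading one of these registers with KVM_GET_ONE_REG and decoding the MIPS64 EntryLo layout described in the note above (RI in bit 63, XI in bit 62, PFNX starting at bit 30). The KVM_REG_MIPS_CP0_ENTRYLO0 name is assumed to be available from the uapi headers; if it is not, the id can be built from the 0x7030 0000 0001 pattern shown earlier.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: fetch EntryLo0 for a vcpu and decode the MIPS64-format fields. */
    static int dump_entrylo0(int vcpu_fd)
    {
            uint64_t val = 0;
            struct kvm_one_reg reg = {
                    .id   = KVM_REG_MIPS_CP0_ENTRYLO0,   /* assumed uapi name */
                    .addr = (uintptr_t)&val,
            };

            if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
                    return -1;

            printf("RI=%u XI=%u PFNX/PFN/flags=0x%llx\n",
                   (unsigned)((val >> 63) & 1),          /* RI, if implemented */
                   (unsigned)((val >> 62) & 1),          /* XI, if implemented */
                   (unsigned long long)(val & ~(3ULL << 62)));
            return 0;
    }
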
@@ -2443,18 +2453,20 @@ are, it will do nothing and return an EBUSY error.
2443The parameter is a pointer to a 32-bit unsigned integer variable 2453The parameter is a pointer to a 32-bit unsigned integer variable
2444containing the order (log base 2) of the desired size of the hash 2454containing the order (log base 2) of the desired size of the hash
2445table, which must be between 18 and 46. On successful return from the 2455table, which must be between 18 and 46. On successful return from the
2446ioctl, it will have been updated with the order of the hash table that 2456ioctl, the value will not be changed by the kernel.
2447was allocated.
2448 2457
2449If no hash table has been allocated when any vcpu is asked to run 2458If no hash table has been allocated when any vcpu is asked to run
2450(with the KVM_RUN ioctl), the host kernel will allocate a 2459(with the KVM_RUN ioctl), the host kernel will allocate a
2451default-sized hash table (16 MB). 2460default-sized hash table (16 MB).
2452 2461
2453If this ioctl is called when a hash table has already been allocated, 2462If this ioctl is called when a hash table has already been allocated,
2454the kernel will clear out the existing hash table (zero all HPTEs) and 2463with a different order from the existing hash table, the existing hash
2455return the hash table order in the parameter. (If the guest is using 2464table will be freed and a new one allocated. If this ioctl is
2456the virtualized real-mode area (VRMA) facility, the kernel will 2465called when a hash table has already been allocated of the same order
2457re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) 2466as specified, the kernel will clear out the existing hash table (zero
2467all HPTEs). In either case, if the guest is using the virtualized
2468real-mode area (VRMA) facility, the kernel will re-create the VRMA
2469HPTEs on the next KVM_RUN of any vcpu.
2458 2470
24594.77 KVM_S390_INTERRUPT 24714.77 KVM_S390_INTERRUPT
2460 2472
@@ -3177,7 +3189,7 @@ of IOMMU pages.
3177 3189
3178The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. 3190The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
3179 3191
31804.98 KVM_REINJECT_CONTROL 31924.99 KVM_REINJECT_CONTROL
3181 3193
3182Capability: KVM_CAP_REINJECT_CONTROL 3194Capability: KVM_CAP_REINJECT_CONTROL
3183Architectures: x86 3195Architectures: x86
@@ -3201,7 +3213,7 @@ struct kvm_reinject_control {
3201pit_reinject = 0 (!reinject mode) is recommended, unless running an old 3213pit_reinject = 0 (!reinject mode) is recommended, unless running an old
3202operating system that uses the PIT for timing (e.g. Linux 2.4.x). 3214operating system that uses the PIT for timing (e.g. Linux 2.4.x).
3203 3215
32044.99 KVM_PPC_CONFIGURE_V3_MMU 32164.100 KVM_PPC_CONFIGURE_V3_MMU
3205 3217
3206Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 3218Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3
3207Architectures: ppc 3219Architectures: ppc
@@ -3232,7 +3244,7 @@ process table, which is in the guest's space. This field is formatted
3232as the second doubleword of the partition table entry, as defined in 3244as the second doubleword of the partition table entry, as defined in
3233the Power ISA V3.00, Book III section 5.7.6.1. 3245the Power ISA V3.00, Book III section 5.7.6.1.
3234 3246
32354.100 KVM_PPC_GET_RMMU_INFO 32474.101 KVM_PPC_GET_RMMU_INFO
3236 3248
3237Capability: KVM_CAP_PPC_RADIX_MMU 3249Capability: KVM_CAP_PPC_RADIX_MMU
3238Architectures: ppc 3250Architectures: ppc
@@ -3266,6 +3278,101 @@ The ap_encodings gives the supported page sizes and their AP field
3266encodings, encoded with the AP value in the top 3 bits and the log 3278encodings, encoded with the AP value in the top 3 bits and the log
3267base 2 of the page size in the bottom 6 bits. 3279base 2 of the page size in the bottom 6 bits.
3268 3280
32814.102 KVM_PPC_RESIZE_HPT_PREPARE
3282
3283Capability: KVM_CAP_SPAPR_RESIZE_HPT
3284Architectures: powerpc
3285Type: vm ioctl
3286Parameters: struct kvm_ppc_resize_hpt (in)
3287Returns: 0 on successful completion,
3288 >0 if a new HPT is being prepared, the value is an estimated
3289 number of milliseconds until preparation is complete
3290 -EFAULT if struct kvm_ppc_resize_hpt cannot be read,
3291 -EINVAL if the supplied shift or flags are invalid
3292 -ENOMEM if unable to allocate the new HPT
3293 -ENOSPC if there was a hash collision when moving existing
3294 HPT entries to the new HPT
3295 -EIO on other error conditions
3296
3297Used to implement the PAPR extension for runtime resizing of a guest's
3298Hashed Page Table (HPT). Specifically this starts, stops or monitors
3299the preparation of a new potential HPT for the guest, essentially
3300implementing the H_RESIZE_HPT_PREPARE hypercall.
3301
3302If called with shift > 0 when there is no pending HPT for the guest,
3303this begins preparation of a new pending HPT of size 2^(shift) bytes.
3304It then returns a positive integer with the estimated number of
3305milliseconds until preparation is complete.
3306
3307If called when there is a pending HPT whose size does not match that
3308requested in the parameters, discards the existing pending HPT and
3309creates a new one as above.
3310
3311If called when there is a pending HPT of the size requested, will:
3312 * If preparation of the pending HPT is already complete, return 0
3313 * If preparation of the pending HPT has failed, return an error
3314 code, then discard the pending HPT.
3315 * If preparation of the pending HPT is still in progress, return an
3316 estimated number of milliseconds until preparation is complete.
3317
3318If called with shift == 0, discards any currently pending HPT and
3319returns 0 (i.e. cancels any in-progress preparation).
3320
3321flags is reserved for future expansion, currently setting any bits in
3322flags will result in an -EINVAL.
3323
3324Normally this will be called repeatedly with the same parameters until
3325it returns <= 0. The first call will initiate preparation, subsequent
3326ones will monitor preparation until it completes or fails.
3327
3328struct kvm_ppc_resize_hpt {
3329 __u64 flags;
3330 __u32 shift;
3331 __u32 pad;
3332};
3333
33344.103 KVM_PPC_RESIZE_HPT_COMMIT
3335
3336Capability: KVM_CAP_SPAPR_RESIZE_HPT
3337Architectures: powerpc
3338Type: vm ioctl
3339Parameters: struct kvm_ppc_resize_hpt (in)
3340Returns: 0 on successful completion,
3341 -EFAULT if struct kvm_ppc_resize_hpt cannot be read,
3342 -EINVAL if the supplied shift or flags are invalid
3343 -ENXIO if there is no pending HPT, or the pending HPT doesn't
3344 have the requested size
3345 -EBUSY if the pending HPT is not fully prepared
3346 -ENOSPC if there was a hash collision when moving existing
3347 HPT entries to the new HPT
3348 -EIO on other error conditions
3349
3350Used to implement the PAPR extension for runtime resizing of a guest's
3351Hashed Page Table (HPT). Specifically this requests that the guest be
3352transferred to working with the new HPT, essentially implementing the
3353H_RESIZE_HPT_COMMIT hypercall.
3354
3355This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has
3356returned 0 with the same parameters. In other cases
3357KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or
3358-EBUSY, though others may be possible if the preparation was started,
3359but failed).
3360
3361This will have undefined effects on the guest if it has not already
3362placed itself in a quiescent state where no vcpu will make MMU enabled
3363memory accesses.
3364
3365On successful completion, the pending HPT will become the guest's active
3366HPT and the previous HPT will be discarded.
3367
3368On failure, the guest will still be operating on its previous HPT.
3369
3370struct kvm_ppc_resize_hpt {
3371 __u64 flags;
3372 __u32 shift;
3373 __u32 pad;
3374};
3375
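
Taken together, 4.102 and 4.103 describe a prepare/poll/commit protocol. A minimal userspace sketch of that flow is shown below; vm_fd is an already-created VM file descriptor, error handling is simplified, and a positive return from PREPARE is treated purely as a sleep hint.

    #include <errno.h>
    #include <poll.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int resize_hpt(int vm_fd, uint32_t shift)
    {
            struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = shift };
            int ret;

            /* Repeatedly request preparation; >0 means "retry in ~ret ms". */
            do {
                    ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
                    if (ret > 0)
                            poll(NULL, 0, ret);     /* sleep for the estimate */
            } while (ret > 0);

            if (ret < 0)
                    return -errno;                  /* preparation failed */

            /* Pending HPT is fully prepared; switch the guest over to it. */
            ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
            return ret < 0 ? -errno : 0;
    }

As the text above notes, the guest must already be quiescent (no vcpu making MMU-enabled accesses) before the COMMIT step.
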
32695. The kvm_run structure 33765. The kvm_run structure
3270------------------------ 3377------------------------
3271 3378
@@ -3282,7 +3389,18 @@ struct kvm_run {
3282Request that KVM_RUN return when it becomes possible to inject external 3389Request that KVM_RUN return when it becomes possible to inject external
3283interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. 3390interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
3284 3391
3285 __u8 padding1[7]; 3392 __u8 immediate_exit;
3393
3394This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
3395exits immediately, returning -EINTR. In the common scenario where a
3396signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
3397to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
3398Rather than blocking the signal outside KVM_RUN, userspace can set up
3399a signal handler that sets run->immediate_exit to a non-zero value.
3400
3401This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
3402
3403 __u8 padding1[6];
3286 3404
3287 /* out */ 3405 /* out */
3288 __u32 exit_reason; 3406 __u32 exit_reason;
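
A minimal sketch of the signal-handler pattern described above follows. The kvm_run mapping, the vcpu fd, and the bookkeeping that records why the vcpu was kicked are assumed to exist elsewhere; re-arming immediate_exit must be coordinated with that bookkeeping so a kick is not lost.

    #include <errno.h>
    #include <signal.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_run *run;              /* mmap()ed kvm_run of this vcpu */

    static void kick_handler(int sig)
    {
            run->immediate_exit = 1;         /* polled once when KVM_RUN starts */
    }

    static void vcpu_loop(int vcpu_fd)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = kick_handler;
            sigaction(SIGUSR1, &sa, NULL);   /* the "kick" signal stays unblocked */

            for (;;) {
                    int ret = ioctl(vcpu_fd, KVM_RUN, NULL);

                    if (ret < 0 && errno == EINTR) {
                            /* Kicked: service the request, then re-arm. */
                            run->immediate_exit = 0;
                            continue;
                    }
                    if (ret < 0)
                            break;           /* real error, not handled here */
                    /* ... dispatch on run->exit_reason ... */
            }
    }
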
diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
index 9348b3caccd7..c1a24612c198 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
@@ -118,7 +118,7 @@ Groups:
118 -EBUSY: One or more VCPUs are running 118 -EBUSY: One or more VCPUs are running
119 119
120 120
121 KVM_DEV_ARM_VGIC_CPU_SYSREGS 121 KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS
122 Attributes: 122 Attributes:
123 The attr field of kvm_device_attr encodes two values: 123 The attr field of kvm_device_attr encodes two values:
124 bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | 124 bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 |
@@ -139,13 +139,15 @@ Groups:
139 All system regs accessed through this API are (rw, 64-bit) and 139 All system regs accessed through this API are (rw, 64-bit) and
140 kvm_device_attr.addr points to a __u64 value. 140 kvm_device_attr.addr points to a __u64 value.
141 141
142 KVM_DEV_ARM_VGIC_CPU_SYSREGS accesses the CPU interface registers for the 142 KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the
143 CPU specified by the mpidr field. 143 CPU specified by the mpidr field.
144 144
145 CPU interface registers access is not implemented for AArch32 mode.
146 Error -ENXIO is returned when accessed in AArch32 mode.
145 Errors: 147 Errors:
146 -ENXIO: Getting or setting this register is not yet supported 148 -ENXIO: Getting or setting this register is not yet supported
147 -EBUSY: VCPU is running 149 -EBUSY: VCPU is running
148 -EINVAL: Invalid mpidr supplied 150 -EINVAL: Invalid mpidr or register value supplied
149 151
150 152
151 KVM_DEV_ARM_VGIC_GRP_NR_IRQS 153 KVM_DEV_ARM_VGIC_GRP_NR_IRQS
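
As an illustrative sketch, reading one CPU interface register through this group could look like the following; the 16-bit system-register instruction encoding selected by KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK is taken as an opaque parameter here (how it is built from Op0/Op1/CRn/CRm/Op2 is not shown), and bits 31..16 of the attr are assumed reserved.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int read_gic_cpu_sysreg(int vgic_dev_fd, uint32_t mpidr_aff,
                                   uint16_t sysreg_encoding, uint64_t *val)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
                    .attr  = ((uint64_t)mpidr_aff << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) |
                             (sysreg_encoding & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK),
                    .addr  = (uint64_t)(uintptr_t)val,  /* doc: points to a __u64 */
            };

            /* vgic_dev_fd is the fd returned by KVM_CREATE_DEVICE for the vGICv3 */
            return ioctl(vgic_dev_fd, KVM_GET_DEVICE_ATTR, &attr);
    }
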
@@ -204,3 +206,6 @@ Groups:
204 architecture defined MPIDR, and the field is encoded as follows: 206 architecture defined MPIDR, and the field is encoded as follows:
205 | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | 207 | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 |
206 | Aff3 | Aff2 | Aff1 | Aff0 | 208 | Aff3 | Aff2 | Aff1 | Aff0 |
209 Errors:
210 -EINVAL: vINTID is not multiple of 32 or
211 info field is not VGIC_LEVEL_INFO_LINE_LEVEL
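
For the GRP_LEVEL_INFO group, a hedged sketch of composing the attr from the masks added to the uapi headers in this series (MPIDR in bits 63..32, info field in bits 31..10, vINTID in bits 9..0) is shown below. That the payload addressed by kvm_device_attr.addr is a 32-bit line-level bitmap covering vINTID..vINTID+31 is an assumption here.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int get_line_levels(int vgic_dev_fd, uint32_t mpidr_aff,
                               uint32_t intid,   /* must be a multiple of 32 */
                               uint32_t *bitmap)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO,
                    .attr  = ((uint64_t)mpidr_aff << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) |
                             (VGIC_LEVEL_INFO_LINE_LEVEL <<
                              KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) |
                             (intid & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK),
                    .addr  = (uint64_t)(uintptr_t)bitmap,
            };

            return ioctl(vgic_dev_fd, KVM_GET_DEVICE_ATTR, &attr);
    }
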
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index c8d040e27046..feaaa634f154 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the
81same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, 81same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall,
82specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) 82specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0)
83is used in the hypercall for future use. 83is used in the hypercall for future use.
84
85
866. KVM_HC_CLOCK_PAIRING
87------------------------
88Architecture: x86
89Status: active
90Purpose: Hypercall used to synchronize host and guest clocks.
91Usage:
92
93a0: guest physical address where host copies
94"struct kvm_clock_offset" structure.
95
96a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0)
97is supported (corresponding to the host's CLOCK_REALTIME clock).
98
99 struct kvm_clock_pairing {
100 __s64 sec;
101 __s64 nsec;
102 __u64 tsc;
103 __u32 flags;
104 __u32 pad[9];
105 };
106
107 Where:
108 * sec: seconds from clock_type clock.
109 * nsec: nanoseconds from clock_type clock.
110 * tsc: guest TSC value used to calculate sec/nsec pair
111 * flags: flags, unused (0) at the moment.
112
113The hypercall lets a guest compute a precise timestamp across
114host and guest. The guest can use the returned TSC value to
115compute the CLOCK_REALTIME for its clock, at the same instant.
116
117Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
118or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
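
A guest-side sketch of this hypercall, loosely in the spirit of the ptp_kvm driver added by this series, is shown below; obtaining the guest physical address of the structure (e.g. via slow_virt_to_phys()) and converting the later TSC delta into nanoseconds are assumed to be handled by the caller.

    #include <linux/errno.h>
    #include <linux/kvm_para.h>       /* KVM_HC_CLOCK_PAIRING */
    #include <asm/kvm_para.h>         /* kvm_hypercall2(), struct kvm_clock_pairing */

    /* Sketch: ask the host for a (CLOCK_REALTIME, TSC) pair. */
    static int kvm_get_wallclock_pairing(struct kvm_clock_pairing *pairing,
                                         unsigned long gpa)
    {
            long ret;

            ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, gpa,
                                 KVM_CLOCK_PAIRING_WALLCLOCK);
            if (ret != 0)
                    return -EOPNOTSUPP;   /* no TSC clocksource or bad clock type */

            /*
             * pairing->sec/nsec is CLOCK_REALTIME at the instant the host
             * sampled pairing->tsc; "now" is that value plus the scaled TSC
             * delta accumulated since pairing->tsc.
             */
            return 0;
    }
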
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index fd013bf4115b..1bb8bcaf8497 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -26,9 +26,16 @@ sections.
26Fast page fault: 26Fast page fault:
27 27
28Fast page fault is the fast path which fixes the guest page fault out of 28Fast page fault is the fast path which fixes the guest page fault out of
29the mmu-lock on x86. Currently, the page fault can be fast only if the 29the mmu-lock on x86. Currently, the page fault can be fast in one of the
30shadow page table is present and it is caused by write-protect, that means 30following two cases:
31we just need change the W bit of the spte. 31
321. Access Tracking: The SPTE is not present, but it is marked for access
33tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to
34restore the saved R/X bits. This is described in more detail later below.
35
362. Write-Protection: The SPTE is present and the fault is
37caused by write-protect. That means we just need to change the W bit of the
38spte.
32 39
33What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and 40What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and
34SPTE_MMU_WRITEABLE bit on the spte: 41SPTE_MMU_WRITEABLE bit on the spte:
@@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte:
38 page write-protection. 45 page write-protection.
39 46
40On fast page fault path, we will use cmpxchg to atomically set the spte W 47On fast page fault path, we will use cmpxchg to atomically set the spte W
41bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this 48bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or
49restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This
42is safe because whenever changing these bits can be detected by cmpxchg. 50is safe because whenever changing these bits can be detected by cmpxchg.
43 51
44But we need carefully check these cases: 52But we need carefully check these cases:
@@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always
142atomically update the spte, the race caused by fast page fault can be avoided, 150atomically update the spte, the race caused by fast page fault can be avoided,
143See the comments in spte_has_volatile_bits() and mmu_spte_update(). 151See the comments in spte_has_volatile_bits() and mmu_spte_update().
144 152
153Lockless Access Tracking:
154
155This is used for Intel CPUs that are using EPT but do not support the EPT A/D
156bits. In this case, when the KVM MMU notifier is called to track accesses to a
157page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present
158by clearing the RWX bits in the PTE and storing the original R & X bits in
159some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the
160PTE (using the ignored bit 62). When the VM tries to access the page later on,
161a fault is generated and the fast page fault mechanism described above is used
162to atomically restore the PTE to a Present state. The W bit is not saved when
163the PTE is marked for access tracking and during restoration to the Present
164state, the W bit is set depending on whether or not it was a write access. If
165it wasn't, then the W bit will remain clear until a write access happens, at
166which time it will be set using the Dirty tracking mechanism described above.
167
1453. Reference 1683. Reference
146------------ 169------------
147 170
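
As a rough, self-contained illustration of the transform described above (the shifts and masks below are placeholders, not the kernel's real SPTE layout): marking saves R and X in ignored bits and clears the present/RWX bits, and the fast-fault path puts them back with a single cmpxchg, leaving W to be set later by dirty tracking if the fault was a write.

    #include <stdatomic.h>
    #include <stdint.h>

    #define SPTE_SPECIAL_MASK   (1ULL << 62)  /* ignored bit 62, per the text */
    #define SPTE_RWX_MASK       0x7ULL        /* placeholder R/W/X position   */
    #define SPTE_RX_MASK        0x5ULL        /* R and X only; W is not saved */
    #define SAVED_RX_SHIFT      52            /* placeholder ignored-bit slot */

    /* clear_flush_young path: make the SPTE non-present, remember R/X. */
    static uint64_t mark_for_access_track(uint64_t spte)
    {
            uint64_t saved = (spte & SPTE_RX_MASK) << SAVED_RX_SHIFT;

            return (spte & ~SPTE_RWX_MASK) | saved | SPTE_SPECIAL_MASK;
    }

    /* Fast page fault path: atomically restore the saved R/X bits. */
    static int restore_access_tracked(_Atomic uint64_t *sptep, uint64_t old)
    {
            uint64_t new = old & ~(SPTE_RX_MASK << SAVED_RX_SHIFT);

            new &= ~SPTE_SPECIAL_MASK;
            new |= (old >> SAVED_RX_SHIFT) & SPTE_RX_MASK;   /* R/X back */
            /* W is set only on a write fault, via the dirty-tracking path. */
            return atomic_compare_exchange_strong(sptep, &old, new);
    }
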
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d5423ab15ed5..cc495d799c67 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -60,9 +60,6 @@ struct kvm_arch {
60 /* The last vcpu id that ran on each physical CPU */ 60 /* The last vcpu id that ran on each physical CPU */
61 int __percpu *last_vcpu_ran; 61 int __percpu *last_vcpu_ran;
62 62
63 /* Timer */
64 struct arch_timer_kvm timer;
65
66 /* 63 /*
67 * Anything that is not used directly from assembly code goes 64 * Anything that is not used directly from assembly code goes
68 * here. 65 * here.
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 74a44727f8e1..95f38dcd611d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -129,8 +129,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
129 129
130static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, 130static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
131 kvm_pfn_t pfn, 131 kvm_pfn_t pfn,
132 unsigned long size, 132 unsigned long size)
133 bool ipa_uncached)
134{ 133{
135 /* 134 /*
136 * If we are going to insert an instruction page and the icache is 135 * If we are going to insert an instruction page and the icache is
@@ -150,18 +149,12 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
150 * and iterate over the range. 149 * and iterate over the range.
151 */ 150 */
152 151
153 bool need_flush = !vcpu_has_cache_enabled(vcpu) || ipa_uncached;
154
155 VM_BUG_ON(size & ~PAGE_MASK); 152 VM_BUG_ON(size & ~PAGE_MASK);
156 153
157 if (!need_flush && !icache_is_pipt())
158 goto vipt_cache;
159
160 while (size) { 154 while (size) {
161 void *va = kmap_atomic_pfn(pfn); 155 void *va = kmap_atomic_pfn(pfn);
162 156
163 if (need_flush) 157 kvm_flush_dcache_to_poc(va, PAGE_SIZE);
164 kvm_flush_dcache_to_poc(va, PAGE_SIZE);
165 158
166 if (icache_is_pipt()) 159 if (icache_is_pipt())
167 __cpuc_coherent_user_range((unsigned long)va, 160 __cpuc_coherent_user_range((unsigned long)va,
@@ -173,7 +166,6 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
173 kunmap_atomic(va); 166 kunmap_atomic(va);
174 } 167 }
175 168
176vipt_cache:
177 if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) { 169 if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
178 /* any kind of VIPT cache */ 170 /* any kind of VIPT cache */
179 __flush_icache_all(); 171 __flush_icache_all();
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index af05f8e0903e..6ebd3e6a1fd1 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -181,10 +181,23 @@ struct kvm_arch_memory_slot {
181#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 181#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
182#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 182#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
183#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) 183#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
184#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
185#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
186 (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
184#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 187#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
185#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) 188#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
189#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
186#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 190#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
187#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 191#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
192#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
193#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
194#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
195#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
196#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
197 (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
198#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
199#define VGIC_LEVEL_INFO_LINE_LEVEL 0
200
188#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 201#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
189 202
190/* KVM_IRQ_LINE irq field index values */ 203/* KVM_IRQ_LINE irq field index values */
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index d571243ab4d1..7b3670c2ae7b 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt)
7 plus_virt_def := -DREQUIRES_VIRT=1 7 plus_virt_def := -DREQUIRES_VIRT=1
8endif 8endif
9 9
10ccflags-y += -Iarch/arm/kvm 10ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic
11CFLAGS_arm.o := -I. $(plus_virt_def) 11CFLAGS_arm.o := -I. $(plus_virt_def)
12CFLAGS_mmu.o := -I. 12CFLAGS_mmu.o := -I.
13 13
@@ -20,7 +20,7 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vf
20obj-$(CONFIG_KVM_ARM_HOST) += hyp/ 20obj-$(CONFIG_KVM_ARM_HOST) += hyp/
21obj-y += kvm-arm.o init.o interrupts.o 21obj-y += kvm-arm.o init.o interrupts.o
22obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o 22obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
23obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o 23obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o vgic-v3-coproc.o
24obj-y += $(KVM)/arm/aarch32.o 24obj-y += $(KVM)/arm/aarch32.o
25 25
26obj-y += $(KVM)/arm/vgic/vgic.o 26obj-y += $(KVM)/arm/vgic/vgic.o
@@ -33,5 +33,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
33obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o 33obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
34obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o 34obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
35obj-y += $(KVM)/arm/vgic/vgic-its.o 35obj-y += $(KVM)/arm/vgic/vgic-its.o
36obj-y += $(KVM)/arm/vgic/vgic-debug.o
36obj-y += $(KVM)/irqchip.o 37obj-y += $(KVM)/irqchip.o
37obj-y += $(KVM)/arm/arch_timer.o 38obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9d7446456e0c..c9a2103faeb9 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -135,7 +135,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
135 goto out_free_stage2_pgd; 135 goto out_free_stage2_pgd;
136 136
137 kvm_vgic_early_init(kvm); 137 kvm_vgic_early_init(kvm);
138 kvm_timer_init(kvm);
139 138
140 /* Mark the initial VMID generation invalid */ 139 /* Mark the initial VMID generation invalid */
141 kvm->arch.vmid_gen = 0; 140 kvm->arch.vmid_gen = 0;
@@ -207,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
207 case KVM_CAP_ARM_PSCI_0_2: 206 case KVM_CAP_ARM_PSCI_0_2:
208 case KVM_CAP_READONLY_MEM: 207 case KVM_CAP_READONLY_MEM:
209 case KVM_CAP_MP_STATE: 208 case KVM_CAP_MP_STATE:
209 case KVM_CAP_IMMEDIATE_EXIT:
210 r = 1; 210 r = 1;
211 break; 211 break;
212 case KVM_CAP_COALESCED_MMIO: 212 case KVM_CAP_COALESCED_MMIO:
@@ -301,7 +301,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
301 301
302int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 302int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
303{ 303{
304 return kvm_timer_should_fire(vcpu); 304 return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
305 kvm_timer_should_fire(vcpu_ptimer(vcpu));
305} 306}
306 307
307void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 308void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
604 return ret; 605 return ret;
605 } 606 }
606 607
608 if (run->immediate_exit)
609 return -EINTR;
610
607 if (vcpu->sigset_active) 611 if (vcpu->sigset_active)
608 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 612 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
609 613
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index a5265edbeeab..962616fd4ddd 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1232,9 +1232,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1232} 1232}
1233 1233
1234static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, 1234static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
1235 unsigned long size, bool uncached) 1235 unsigned long size)
1236{ 1236{
1237 __coherent_cache_guest_page(vcpu, pfn, size, uncached); 1237 __coherent_cache_guest_page(vcpu, pfn, size);
1238} 1238}
1239 1239
1240static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1240static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1250,7 +1250,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1250 struct vm_area_struct *vma; 1250 struct vm_area_struct *vma;
1251 kvm_pfn_t pfn; 1251 kvm_pfn_t pfn;
1252 pgprot_t mem_type = PAGE_S2; 1252 pgprot_t mem_type = PAGE_S2;
1253 bool fault_ipa_uncached;
1254 bool logging_active = memslot_is_logging(memslot); 1253 bool logging_active = memslot_is_logging(memslot);
1255 unsigned long flags = 0; 1254 unsigned long flags = 0;
1256 1255
@@ -1337,8 +1336,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1337 if (!hugetlb && !force_pte) 1336 if (!hugetlb && !force_pte)
1338 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); 1337 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
1339 1338
1340 fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
1341
1342 if (hugetlb) { 1339 if (hugetlb) {
1343 pmd_t new_pmd = pfn_pmd(pfn, mem_type); 1340 pmd_t new_pmd = pfn_pmd(pfn, mem_type);
1344 new_pmd = pmd_mkhuge(new_pmd); 1341 new_pmd = pmd_mkhuge(new_pmd);
@@ -1346,7 +1343,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1346 new_pmd = kvm_s2pmd_mkwrite(new_pmd); 1343 new_pmd = kvm_s2pmd_mkwrite(new_pmd);
1347 kvm_set_pfn_dirty(pfn); 1344 kvm_set_pfn_dirty(pfn);
1348 } 1345 }
1349 coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached); 1346 coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
1350 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); 1347 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1351 } else { 1348 } else {
1352 pte_t new_pte = pfn_pte(pfn, mem_type); 1349 pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1356,7 +1353,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1356 kvm_set_pfn_dirty(pfn); 1353 kvm_set_pfn_dirty(pfn);
1357 mark_page_dirty(kvm, gfn); 1354 mark_page_dirty(kvm, gfn);
1358 } 1355 }
1359 coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached); 1356 coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
1360 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); 1357 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
1361 } 1358 }
1362 1359
@@ -1879,15 +1876,6 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
1879int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 1876int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1880 unsigned long npages) 1877 unsigned long npages)
1881{ 1878{
1882 /*
1883 * Readonly memslots are not incoherent with the caches by definition,
1884 * but in practice, they are used mostly to emulate ROMs or NOR flashes
1885 * that the guest may consider devices and hence map as uncached.
1886 * To prevent incoherency issues in these cases, tag all readonly
1887 * regions as incoherent.
1888 */
1889 if (slot->flags & KVM_MEM_READONLY)
1890 slot->flags |= KVM_MEMSLOT_INCOHERENT;
1891 return 0; 1879 return 0;
1892} 1880}
1893 1881
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 4b5e802e57d1..1da8b2d14550 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -37,6 +37,11 @@ static struct kvm_regs cortexa_regs_reset = {
37 .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, 37 .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
38}; 38};
39 39
40static const struct kvm_irq_level cortexa_ptimer_irq = {
41 { .irq = 30 },
42 .level = 1,
43};
44
40static const struct kvm_irq_level cortexa_vtimer_irq = { 45static const struct kvm_irq_level cortexa_vtimer_irq = {
41 { .irq = 27 }, 46 { .irq = 27 },
42 .level = 1, 47 .level = 1,
@@ -58,6 +63,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
58{ 63{
59 struct kvm_regs *reset_regs; 64 struct kvm_regs *reset_regs;
60 const struct kvm_irq_level *cpu_vtimer_irq; 65 const struct kvm_irq_level *cpu_vtimer_irq;
66 const struct kvm_irq_level *cpu_ptimer_irq;
61 67
62 switch (vcpu->arch.target) { 68 switch (vcpu->arch.target) {
63 case KVM_ARM_TARGET_CORTEX_A7: 69 case KVM_ARM_TARGET_CORTEX_A7:
@@ -65,6 +71,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
65 reset_regs = &cortexa_regs_reset; 71 reset_regs = &cortexa_regs_reset;
66 vcpu->arch.midr = read_cpuid_id(); 72 vcpu->arch.midr = read_cpuid_id();
67 cpu_vtimer_irq = &cortexa_vtimer_irq; 73 cpu_vtimer_irq = &cortexa_vtimer_irq;
74 cpu_ptimer_irq = &cortexa_ptimer_irq;
68 break; 75 break;
69 default: 76 default:
70 return -ENODEV; 77 return -ENODEV;
@@ -77,5 +84,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
77 kvm_reset_coprocs(vcpu); 84 kvm_reset_coprocs(vcpu);
78 85
79 /* Reset arch_timer context */ 86 /* Reset arch_timer context */
80 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); 87 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
81} 88}
diff --git a/arch/arm/kvm/vgic-v3-coproc.c b/arch/arm/kvm/vgic-v3-coproc.c
new file mode 100644
index 000000000000..f41abf76366f
--- /dev/null
+++ b/arch/arm/kvm/vgic-v3-coproc.c
@@ -0,0 +1,35 @@
1/*
2 * VGIC system registers handling functions for AArch32 mode
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/kvm.h>
15#include <linux/kvm_host.h>
16#include <asm/kvm_emulate.h>
17#include "vgic.h"
18
19int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
20 u64 *reg)
21{
22 /*
23 * TODO: Implement for AArch32
24 */
25 return -ENXIO;
26}
27
28int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id,
29 u64 *reg)
30{
31 /*
32 * TODO: Implement for AArch32
33 */
34 return -ENXIO;
35}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 443b387021f2..f21fd3894370 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -70,9 +70,6 @@ struct kvm_arch {
70 70
71 /* Interrupt controller */ 71 /* Interrupt controller */
72 struct vgic_dist vgic; 72 struct vgic_dist vgic;
73
74 /* Timer */
75 struct arch_timer_kvm timer;
76}; 73};
77 74
78#define KVM_NR_MEM_OBJS 40 75#define KVM_NR_MEM_OBJS 40
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 55772c13a375..ed1246014901 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -236,13 +236,11 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
236 236
237static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, 237static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
238 kvm_pfn_t pfn, 238 kvm_pfn_t pfn,
239 unsigned long size, 239 unsigned long size)
240 bool ipa_uncached)
241{ 240{
242 void *va = page_address(pfn_to_page(pfn)); 241 void *va = page_address(pfn_to_page(pfn));
243 242
244 if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached) 243 kvm_flush_dcache_to_poc(va, size);
245 kvm_flush_dcache_to_poc(va, size);
246 244
247 if (!icache_is_aliasing()) { /* PIPT */ 245 if (!icache_is_aliasing()) { /* PIPT */
248 flush_icache_range((unsigned long)va, 246 flush_icache_range((unsigned long)va,
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 3051f86a9b5f..c2860358ae3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -201,10 +201,23 @@ struct kvm_arch_memory_slot {
201#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 201#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
202#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 202#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
203#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) 203#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
204#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
205#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
206 (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
204#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 207#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
205#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) 208#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
209#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
206#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 210#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
207#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 211#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
212#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
213#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
214#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
215#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
216#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
217 (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
218#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
219#define VGIC_LEVEL_INFO_LINE_LEVEL 0
220
208#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 221#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
209 222
210/* Device Control API on vcpu fd */ 223/* Device Control API on vcpu fd */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d50a82a16ff6..afd51bebb9c5 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -2,7 +2,7 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5ccflags-y += -Iarch/arm64/kvm 5ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic
6CFLAGS_arm.o := -I. 6CFLAGS_arm.o := -I.
7CFLAGS_mmu.o := -I. 7CFLAGS_mmu.o := -I.
8 8
@@ -19,6 +19,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
19kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o 19kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
20kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o 20kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
21kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o 21kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
22kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o
22kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o 23kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
23 24
24kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o 25kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
@@ -31,6 +32,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
31kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o 32kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
32kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o 33kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
33kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o 34kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
35kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o
34kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o 36kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o
35kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o 37kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
36kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o 38kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e95d4f68bf54..d9e9697de1b2 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -46,6 +46,11 @@ static const struct kvm_regs default_regs_reset32 = {
46 COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT), 46 COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT),
47}; 47};
48 48
49static const struct kvm_irq_level default_ptimer_irq = {
50 .irq = 30,
51 .level = 1,
52};
53
49static const struct kvm_irq_level default_vtimer_irq = { 54static const struct kvm_irq_level default_vtimer_irq = {
50 .irq = 27, 55 .irq = 27,
51 .level = 1, 56 .level = 1,
@@ -104,6 +109,7 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
104int kvm_reset_vcpu(struct kvm_vcpu *vcpu) 109int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
105{ 110{
106 const struct kvm_irq_level *cpu_vtimer_irq; 111 const struct kvm_irq_level *cpu_vtimer_irq;
112 const struct kvm_irq_level *cpu_ptimer_irq;
107 const struct kvm_regs *cpu_reset; 113 const struct kvm_regs *cpu_reset;
108 114
109 switch (vcpu->arch.target) { 115 switch (vcpu->arch.target) {
@@ -117,6 +123,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
117 } 123 }
118 124
119 cpu_vtimer_irq = &default_vtimer_irq; 125 cpu_vtimer_irq = &default_vtimer_irq;
126 cpu_ptimer_irq = &default_ptimer_irq;
120 break; 127 break;
121 } 128 }
122 129
@@ -130,5 +137,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
130 kvm_pmu_vcpu_reset(vcpu); 137 kvm_pmu_vcpu_reset(vcpu);
131 138
132 /* Reset timer */ 139 /* Reset timer */
133 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); 140 return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
134} 141}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 87e7e6608cd8..0e26f8c2b56f 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -820,6 +820,61 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
820 CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ 820 CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
821 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } 821 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
822 822
823static bool access_cntp_tval(struct kvm_vcpu *vcpu,
824 struct sys_reg_params *p,
825 const struct sys_reg_desc *r)
826{
827 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
828 u64 now = kvm_phys_timer_read();
829
830 if (p->is_write)
831 ptimer->cnt_cval = p->regval + now;
832 else
833 p->regval = ptimer->cnt_cval - now;
834
835 return true;
836}
837
838static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
839 struct sys_reg_params *p,
840 const struct sys_reg_desc *r)
841{
842 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
843
844 if (p->is_write) {
845 /* ISTATUS bit is read-only */
846 ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
847 } else {
848 u64 now = kvm_phys_timer_read();
849
850 p->regval = ptimer->cnt_ctl;
851 /*
852 * Set ISTATUS bit if it's expired.
853 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
854 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
855 * regardless of ENABLE bit for our implementation convenience.
856 */
857 if (ptimer->cnt_cval <= now)
858 p->regval |= ARCH_TIMER_CTRL_IT_STAT;
859 }
860
861 return true;
862}
863
864static bool access_cntp_cval(struct kvm_vcpu *vcpu,
865 struct sys_reg_params *p,
866 const struct sys_reg_desc *r)
867{
868 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
869
870 if (p->is_write)
871 ptimer->cnt_cval = p->regval;
872 else
873 p->regval = ptimer->cnt_cval;
874
875 return true;
876}
877
823/* 878/*
824 * Architected system registers. 879 * Architected system registers.
825 * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 880 * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -1029,6 +1084,16 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1029 { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), 1084 { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011),
1030 NULL, reset_unknown, TPIDRRO_EL0 }, 1085 NULL, reset_unknown, TPIDRRO_EL0 },
1031 1086
1087 /* CNTP_TVAL_EL0 */
1088 { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b000),
1089 access_cntp_tval },
1090 /* CNTP_CTL_EL0 */
1091 { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b001),
1092 access_cntp_ctl },
1093 /* CNTP_CVAL_EL0 */
1094 { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b010),
1095 access_cntp_cval },
1096
1032 /* PMEVCNTRn_EL0 */ 1097 /* PMEVCNTRn_EL0 */
1033 PMU_PMEVCNTR_EL0(0), 1098 PMU_PMEVCNTR_EL0(0),
1034 PMU_PMEVCNTR_EL0(1), 1099 PMU_PMEVCNTR_EL0(1),
@@ -1795,6 +1860,17 @@ static bool index_to_params(u64 id, struct sys_reg_params *params)
1795 } 1860 }
1796} 1861}
1797 1862
1863const struct sys_reg_desc *find_reg_by_id(u64 id,
1864 struct sys_reg_params *params,
1865 const struct sys_reg_desc table[],
1866 unsigned int num)
1867{
1868 if (!index_to_params(id, params))
1869 return NULL;
1870
1871 return find_reg(params, table, num);
1872}
1873
1798/* Decode an index value, and find the sys_reg_desc entry. */ 1874/* Decode an index value, and find the sys_reg_desc entry. */
1799static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, 1875static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
1800 u64 id) 1876 u64 id)
@@ -1807,11 +1883,8 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
1807 if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) 1883 if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG)
1808 return NULL; 1884 return NULL;
1809 1885
1810 if (!index_to_params(id, &params))
1811 return NULL;
1812
1813 table = get_target_table(vcpu->arch.target, true, &num); 1886 table = get_target_table(vcpu->arch.target, true, &num);
1814 r = find_reg(&params, table, num); 1887 r = find_reg_by_id(id, &params, table, num);
1815 if (!r) 1888 if (!r)
1816 r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); 1889 r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
1817 1890
@@ -1918,10 +1991,8 @@ static int get_invariant_sys_reg(u64 id, void __user *uaddr)
1918 struct sys_reg_params params; 1991 struct sys_reg_params params;
1919 const struct sys_reg_desc *r; 1992 const struct sys_reg_desc *r;
1920 1993
1921 if (!index_to_params(id, &params)) 1994 r = find_reg_by_id(id, &params, invariant_sys_regs,
1922 return -ENOENT; 1995 ARRAY_SIZE(invariant_sys_regs));
1923
1924 r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
1925 if (!r) 1996 if (!r)
1926 return -ENOENT; 1997 return -ENOENT;
1927 1998
@@ -1935,9 +2006,8 @@ static int set_invariant_sys_reg(u64 id, void __user *uaddr)
1935 int err; 2006 int err;
1936 u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ 2007 u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */
1937 2008
1938 if (!index_to_params(id, &params)) 2009 r = find_reg_by_id(id, &params, invariant_sys_regs,
1939 return -ENOENT; 2010 ARRAY_SIZE(invariant_sys_regs));
1940 r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
1941 if (!r) 2011 if (!r)
1942 return -ENOENT; 2012 return -ENOENT;
1943 2013
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index dbbb01cfbee9..9c6ffd0f0196 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -136,6 +136,10 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
136 return i1->Op2 - i2->Op2; 136 return i1->Op2 - i2->Op2;
137} 137}
138 138
139const struct sys_reg_desc *find_reg_by_id(u64 id,
140 struct sys_reg_params *params,
141 const struct sys_reg_desc table[],
142 unsigned int num);
139 143
140#define Op0(_x) .Op0 = _x 144#define Op0(_x) .Op0 = _x
141#define Op1(_x) .Op1 = _x 145#define Op1(_x) .Op1 = _x
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
new file mode 100644
index 000000000000..79f37e37d367
--- /dev/null
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -0,0 +1,346 @@
1/*
2 * VGIC system registers handling functions for AArch64 mode
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/irqchip/arm-gic-v3.h>
15#include <linux/kvm.h>
16#include <linux/kvm_host.h>
17#include <asm/kvm_emulate.h>
18#include "vgic.h"
19#include "sys_regs.h"
20
21static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
22 const struct sys_reg_desc *r)
23{
24 u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v;
25 struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
26 struct vgic_vmcr vmcr;
27 u64 val;
28
29 vgic_get_vmcr(vcpu, &vmcr);
30 if (p->is_write) {
31 val = p->regval;
32
33 /*
34 * Disallow restoring VM state if not supported by this
35 * hardware.
36 */
37 host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >>
38 ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1;
39 if (host_pri_bits > vgic_v3_cpu->num_pri_bits)
40 return false;
41
42 vgic_v3_cpu->num_pri_bits = host_pri_bits;
43
44 host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >>
45 ICC_CTLR_EL1_ID_BITS_SHIFT;
46 if (host_id_bits > vgic_v3_cpu->num_id_bits)
47 return false;
48
49 vgic_v3_cpu->num_id_bits = host_id_bits;
50
51 host_seis = ((kvm_vgic_global_state.ich_vtr_el2 &
52 ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT);
53 seis = (val & ICC_CTLR_EL1_SEIS_MASK) >>
54 ICC_CTLR_EL1_SEIS_SHIFT;
55 if (host_seis != seis)
56 return false;
57
58 host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 &
59 ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT);
60 a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT;
61 if (host_a3v != a3v)
62 return false;
63
64 /*
65 * Here set VMCR.CTLR in ICC_CTLR_EL1 layout.
66 * The vgic_set_vmcr() will convert to ICH_VMCR layout.
67 */
68 vmcr.ctlr = val & ICC_CTLR_EL1_CBPR_MASK;
69 vmcr.ctlr |= val & ICC_CTLR_EL1_EOImode_MASK;
70 vgic_set_vmcr(vcpu, &vmcr);
71 } else {
72 val = 0;
73 val |= (vgic_v3_cpu->num_pri_bits - 1) <<
74 ICC_CTLR_EL1_PRI_BITS_SHIFT;
75 val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT;
76 val |= ((kvm_vgic_global_state.ich_vtr_el2 &
77 ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) <<
78 ICC_CTLR_EL1_SEIS_SHIFT;
79 val |= ((kvm_vgic_global_state.ich_vtr_el2 &
80 ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) <<
81 ICC_CTLR_EL1_A3V_SHIFT;
82 /*
83 * The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
84 * Extract it directly using ICC_CTLR_EL1 reg definitions.
85 */
86 val |= vmcr.ctlr & ICC_CTLR_EL1_CBPR_MASK;
87 val |= vmcr.ctlr & ICC_CTLR_EL1_EOImode_MASK;
88
89 p->regval = val;
90 }
91
92 return true;
93}
94
95static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
96 const struct sys_reg_desc *r)
97{
98 struct vgic_vmcr vmcr;
99
100 vgic_get_vmcr(vcpu, &vmcr);
101 if (p->is_write) {
102 vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT;
103 vgic_set_vmcr(vcpu, &vmcr);
104 } else {
105 p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK;
106 }
107
108 return true;
109}
110
111static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
112 const struct sys_reg_desc *r)
113{
114 struct vgic_vmcr vmcr;
115
116 vgic_get_vmcr(vcpu, &vmcr);
117 if (p->is_write) {
118 vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >>
119 ICC_BPR0_EL1_SHIFT;
120 vgic_set_vmcr(vcpu, &vmcr);
121 } else {
122 p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) &
123 ICC_BPR0_EL1_MASK;
124 }
125
126 return true;
127}
128
129static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
130 const struct sys_reg_desc *r)
131{
132 struct vgic_vmcr vmcr;
133
134 if (!p->is_write)
135 p->regval = 0;
136
137 vgic_get_vmcr(vcpu, &vmcr);
138 if (!((vmcr.ctlr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT)) {
139 if (p->is_write) {
140 vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >>
141 ICC_BPR1_EL1_SHIFT;
142 vgic_set_vmcr(vcpu, &vmcr);
143 } else {
144 p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) &
145 ICC_BPR1_EL1_MASK;
146 }
147 } else {
148 if (!p->is_write)
149 p->regval = min((vmcr.bpr + 1), 7U);
150 }
151
152 return true;
153}
154
155static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
156 const struct sys_reg_desc *r)
157{
158 struct vgic_vmcr vmcr;
159
160 vgic_get_vmcr(vcpu, &vmcr);
161 if (p->is_write) {
162 vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >>
163 ICC_IGRPEN0_EL1_SHIFT;
164 vgic_set_vmcr(vcpu, &vmcr);
165 } else {
166 p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) &
167 ICC_IGRPEN0_EL1_MASK;
168 }
169
170 return true;
171}
172
173static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
174 const struct sys_reg_desc *r)
175{
176 struct vgic_vmcr vmcr;
177
178 vgic_get_vmcr(vcpu, &vmcr);
179 if (p->is_write) {
180 vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >>
181 ICC_IGRPEN1_EL1_SHIFT;
182 vgic_set_vmcr(vcpu, &vmcr);
183 } else {
184 p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) &
185 ICC_IGRPEN1_EL1_MASK;
186 }
187
188 return true;
189}
190
191static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu,
192 struct sys_reg_params *p, u8 apr, u8 idx)
193{
194 struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
195 uint32_t *ap_reg;
196
197 if (apr)
198 ap_reg = &vgicv3->vgic_ap1r[idx];
199 else
200 ap_reg = &vgicv3->vgic_ap0r[idx];
201
202 if (p->is_write)
203 *ap_reg = p->regval;
204 else
205 p->regval = *ap_reg;
206}
207
208static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
209 const struct sys_reg_desc *r, u8 apr)
210{
211 struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
212 u8 idx = r->Op2 & 3;
213
214 /*
215 * num_pri_bits are initialized with HW supported values.
216 * We can rely safely on num_pri_bits even if VM has not
217 * restored ICC_CTLR_EL1 before restoring APnR registers.
218 */
219 switch (vgic_v3_cpu->num_pri_bits) {
220 case 7:
221 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
222 break;
223 case 6:
224 if (idx > 1)
225 goto err;
226 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
227 break;
228 default:
229 if (idx > 0)
230 goto err;
231 vgic_v3_access_apr_reg(vcpu, p, apr, idx);
232 }
233
234 return true;
235err:
236 if (!p->is_write)
237 p->regval = 0;
238
239 return false;
240}
241
242static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
243 const struct sys_reg_desc *r)
244
245{
246 return access_gic_aprn(vcpu, p, r, 0);
247}
248
249static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
250 const struct sys_reg_desc *r)
251{
252 return access_gic_aprn(vcpu, p, r, 1);
253}
254
255static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
256 const struct sys_reg_desc *r)
257{
258 struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
259
260 /* Validate SRE bit */
261 if (p->is_write) {
262 if (!(p->regval & ICC_SRE_EL1_SRE))
263 return false;
264 } else {
265 p->regval = vgicv3->vgic_sre;
266 }
267
268 return true;
269}
270static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
271 /* ICC_PMR_EL1 */
272 { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr },
273 /* ICC_BPR0_EL1 */
274 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 },
275 /* ICC_AP0R0_EL1 */
276 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r },
277 /* ICC_AP0R1_EL1 */
278 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r },
279 /* ICC_AP0R2_EL1 */
280 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r },
281 /* ICC_AP0R3_EL1 */
282 { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r },
283 /* ICC_AP1R0_EL1 */
284 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r },
285 /* ICC_AP1R1_EL1 */
286 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r },
287 /* ICC_AP1R2_EL1 */
288 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r },
289 /* ICC_AP1R3_EL1 */
290 { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r },
291 /* ICC_BPR1_EL1 */
292 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 },
293 /* ICC_CTLR_EL1 */
294 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr },
295 /* ICC_SRE_EL1 */
296 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre },
297 /* ICC_IGRPEN0_EL1 */
298 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 },
 299 /* ICC_IGRPEN1_EL1 */
300 { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 },
301};
302
303int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
304 u64 *reg)
305{
306 struct sys_reg_params params;
307 u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64;
308
309 params.regval = *reg;
310 params.is_write = is_write;
311 params.is_aarch32 = false;
312 params.is_32bit = false;
313
314 if (find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
315 ARRAY_SIZE(gic_v3_icc_reg_descs)))
316 return 0;
317
318 return -ENXIO;
319}
320
321int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id,
322 u64 *reg)
323{
324 struct sys_reg_params params;
325 const struct sys_reg_desc *r;
326 u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64;
327
328 if (is_write)
329 params.regval = *reg;
330 params.is_write = is_write;
331 params.is_aarch32 = false;
332 params.is_32bit = false;
333
334 r = find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
335 ARRAY_SIZE(gic_v3_icc_reg_descs));
336 if (!r)
337 return -ENXIO;
338
339 if (!r->access(vcpu, &params, r))
340 return -EINVAL;
341
342 if (!is_write)
343 *reg = params.regval;
344
345 return 0;
346}
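For orientation (not part of the patch itself): vgic_v3_has_cpu_sysregs_attr() and vgic_v3_cpu_sysregs_uaccess() are reached from userspace through the KVM device-attribute ioctls on the VGIC v3 device. A hedged sketch, where vgic_fd (obtained from KVM_CREATE_DEVICE) and the MPIDR/Op0..Op2 encoding packed into sysreg_id are assumptions of the example:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: read one ICC_*_EL1 register of a VCPU through the
 * KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS attribute group added by this series. */
static int vgic_v3_get_icc_reg(int vgic_fd, uint64_t sysreg_id, uint64_t *val)
{
	struct kvm_device_attr attr = {
		.group = KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS,
		.attr  = sysreg_id,
		.addr  = (uint64_t)(unsigned long)val,
	};

	return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
}

The matching KVM_SET_DEVICE_ATTR call drives the p->is_write path of the access handlers above.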
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index bebec370324f..05e785fc061d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
43#define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) 43#define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0)
44#define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) 44#define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0)
45#define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) 45#define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0)
46#define KVM_REG_MIPS_CP0_INTCTL MIPS_CP0_32(12, 1)
46#define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) 47#define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0)
47#define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0) 48#define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0)
48#define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0) 49#define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0)
@@ -64,7 +65,7 @@
64#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) 65#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7)
65 66
66 67
67#define KVM_MAX_VCPUS 1 68#define KVM_MAX_VCPUS 8
68#define KVM_USER_MEM_SLOTS 8 69#define KVM_USER_MEM_SLOTS 8
 69/* memory slots that are not exposed to userspace */ 70/* memory slots that are not exposed to userspace */
70#define KVM_PRIVATE_MEM_SLOTS 0 71#define KVM_PRIVATE_MEM_SLOTS 0
@@ -88,6 +89,7 @@
88 89
89#define KVM_GUEST_KUSEG 0x00000000UL 90#define KVM_GUEST_KUSEG 0x00000000UL
90#define KVM_GUEST_KSEG0 0x40000000UL 91#define KVM_GUEST_KSEG0 0x40000000UL
92#define KVM_GUEST_KSEG1 0x40000000UL
91#define KVM_GUEST_KSEG23 0x60000000UL 93#define KVM_GUEST_KSEG23 0x60000000UL
92#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000) 94#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000)
93#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) 95#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff)
@@ -104,7 +106,6 @@
104#define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23) 106#define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
105 107
106#define KVM_INVALID_PAGE 0xdeadbeef 108#define KVM_INVALID_PAGE 0xdeadbeef
107#define KVM_INVALID_INST 0xdeadbeef
108#define KVM_INVALID_ADDR 0xdeadbeef 109#define KVM_INVALID_ADDR 0xdeadbeef
109 110
110/* 111/*
@@ -121,8 +122,6 @@ static inline bool kvm_is_error_hva(unsigned long addr)
121 return IS_ERR_VALUE(addr); 122 return IS_ERR_VALUE(addr);
122} 123}
123 124
124extern atomic_t kvm_mips_instance;
125
126struct kvm_vm_stat { 125struct kvm_vm_stat {
127 ulong remote_tlb_flush; 126 ulong remote_tlb_flush;
128}; 127};
@@ -156,12 +155,8 @@ struct kvm_arch_memory_slot {
156}; 155};
157 156
158struct kvm_arch { 157struct kvm_arch {
159 /* Guest GVA->HPA page table */ 158 /* Guest physical mm */
160 unsigned long *guest_pmap; 159 struct mm_struct gpa_mm;
161 unsigned long guest_pmap_npages;
162
163 /* Wired host TLB used for the commpage */
164 int commpage_tlb;
165}; 160};
166 161
167#define N_MIPS_COPROC_REGS 32 162#define N_MIPS_COPROC_REGS 32
@@ -233,6 +228,7 @@ enum emulation_result {
233 EMULATE_FAIL, /* can't emulate this instruction */ 228 EMULATE_FAIL, /* can't emulate this instruction */
234 EMULATE_WAIT, /* WAIT instruction */ 229 EMULATE_WAIT, /* WAIT instruction */
235 EMULATE_PRIV_FAIL, 230 EMULATE_PRIV_FAIL,
231 EMULATE_EXCEPT, /* A guest exception has been generated */
236}; 232};
237 233
238#define mips3_paddr_to_tlbpfn(x) \ 234#define mips3_paddr_to_tlbpfn(x) \
@@ -250,6 +246,7 @@ enum emulation_result {
250#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) 246#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
251#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) 247#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1)
252#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) 248#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
249#define TLB_IS_DIRTY(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D)
253#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ 250#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
254 ((y) & VPN2_MASK & ~(x).tlb_mask)) 251 ((y) & VPN2_MASK & ~(x).tlb_mask))
255#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ 252#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
@@ -261,6 +258,17 @@ struct kvm_mips_tlb {
261 long tlb_lo[2]; 258 long tlb_lo[2];
262}; 259};
263 260
261#define KVM_NR_MEM_OBJS 4
262
263/*
264 * We don't want allocation failures within the mmu code, so we preallocate
265 * enough memory for a single page fault in a cache.
266 */
267struct kvm_mmu_memory_cache {
268 int nobjs;
269 void *objects[KVM_NR_MEM_OBJS];
270};
271
264#define KVM_MIPS_AUX_FPU 0x1 272#define KVM_MIPS_AUX_FPU 0x1
265#define KVM_MIPS_AUX_MSA 0x2 273#define KVM_MIPS_AUX_MSA 0x2
266 274
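The comment above describes the usual KVM memory-cache trick: refill the cache with GFP_KERNEL allocations before taking the MMU lock, then consume objects inside the lock where sleeping is not allowed. A hedged sketch of that pattern (modelled on the equivalent x86/ARM helpers; the exact MIPS helpers in mmu.c may differ in detail):

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min)
{
	void *page;

	/* Called in a sleepable context, before the MMU lock is taken. */
	while (cache->nobjs < min) {
		page = (void *)__get_free_page(GFP_KERNEL);
		if (!page)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = page;
	}
	return 0;
}

static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *cache)
{
	/* Called under the MMU lock; cannot fail thanks to the prior top-up. */
	return cache->objects[--cache->nobjs];
}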
@@ -275,6 +283,8 @@ struct kvm_vcpu_arch {
275 unsigned long host_cp0_badvaddr; 283 unsigned long host_cp0_badvaddr;
276 unsigned long host_cp0_epc; 284 unsigned long host_cp0_epc;
277 u32 host_cp0_cause; 285 u32 host_cp0_cause;
286 u32 host_cp0_badinstr;
287 u32 host_cp0_badinstrp;
278 288
279 /* GPRS */ 289 /* GPRS */
280 unsigned long gprs[32]; 290 unsigned long gprs[32];
@@ -318,20 +328,18 @@ struct kvm_vcpu_arch {
318 /* Bitmask of pending exceptions to be cleared */ 328 /* Bitmask of pending exceptions to be cleared */
319 unsigned long pending_exceptions_clr; 329 unsigned long pending_exceptions_clr;
320 330
321 /* Save/Restore the entryhi register when are are preempted/scheduled back in */
322 unsigned long preempt_entryhi;
323
324 /* S/W Based TLB for guest */ 331 /* S/W Based TLB for guest */
325 struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; 332 struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
326 333
327 /* Cached guest kernel/user ASIDs */ 334 /* Guest kernel/user [partial] mm */
328 u32 guest_user_asid[NR_CPUS];
329 u32 guest_kernel_asid[NR_CPUS];
330 struct mm_struct guest_kernel_mm, guest_user_mm; 335 struct mm_struct guest_kernel_mm, guest_user_mm;
331 336
332 /* Guest ASID of last user mode execution */ 337 /* Guest ASID of last user mode execution */
333 unsigned int last_user_gasid; 338 unsigned int last_user_gasid;
334 339
340 /* Cache some mmu pages needed inside spinlock regions */
341 struct kvm_mmu_memory_cache mmu_page_cache;
342
335 int last_sched_cpu; 343 int last_sched_cpu;
336 344
337 /* WAIT executed */ 345 /* WAIT executed */
@@ -339,14 +347,15 @@ struct kvm_vcpu_arch {
339 347
340 u8 fpu_enabled; 348 u8 fpu_enabled;
341 u8 msa_enabled; 349 u8 msa_enabled;
342 u8 kscratch_enabled;
343}; 350};
344 351
345 352
346#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) 353#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0])
347#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) 354#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
348#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) 355#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0])
356#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val))
349#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) 357#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0])
358#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val))
350#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) 359#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
351#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) 360#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
352#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) 361#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
@@ -522,9 +531,17 @@ struct kvm_mips_callbacks {
522 int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); 531 int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
523 int (*handle_fpe)(struct kvm_vcpu *vcpu); 532 int (*handle_fpe)(struct kvm_vcpu *vcpu);
524 int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); 533 int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
525 int (*vm_init)(struct kvm *kvm);
526 int (*vcpu_init)(struct kvm_vcpu *vcpu); 534 int (*vcpu_init)(struct kvm_vcpu *vcpu);
535 void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
527 int (*vcpu_setup)(struct kvm_vcpu *vcpu); 536 int (*vcpu_setup)(struct kvm_vcpu *vcpu);
537 void (*flush_shadow_all)(struct kvm *kvm);
538 /*
539 * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
540 * VZ root TLB, or T&E GVA page tables and corresponding root TLB
541 * mappings).
542 */
543 void (*flush_shadow_memslot)(struct kvm *kvm,
544 const struct kvm_memory_slot *slot);
528 gpa_t (*gva_to_gpa)(gva_t gva); 545 gpa_t (*gva_to_gpa)(gva_t gva);
529 void (*queue_timer_int)(struct kvm_vcpu *vcpu); 546 void (*queue_timer_int)(struct kvm_vcpu *vcpu);
530 void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); 547 void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -542,8 +559,10 @@ struct kvm_mips_callbacks {
542 const struct kvm_one_reg *reg, s64 *v); 559 const struct kvm_one_reg *reg, s64 *v);
543 int (*set_one_reg)(struct kvm_vcpu *vcpu, 560 int (*set_one_reg)(struct kvm_vcpu *vcpu,
544 const struct kvm_one_reg *reg, s64 v); 561 const struct kvm_one_reg *reg, s64 v);
545 int (*vcpu_get_regs)(struct kvm_vcpu *vcpu); 562 int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
546 int (*vcpu_set_regs)(struct kvm_vcpu *vcpu); 563 int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu);
564 int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
565 void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu);
547}; 566};
548extern struct kvm_mips_callbacks *kvm_mips_callbacks; 567extern struct kvm_mips_callbacks *kvm_mips_callbacks;
549int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); 568int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
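The renamed vcpu_load/vcpu_put and the new vcpu_run/vcpu_reenter hooks are invoked from the generic MIPS KVM code instead of open-coding ASID handling there. A rough, hedged sketch of the expected dispatch in arch/mips/kvm/mips.c (not shown in this excerpt; timer migration and other per-CPU housekeeping are omitted):

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	vcpu->cpu = cpu;
	/* Restore guest mappings/state for this physical CPU. */
	kvm_mips_callbacks->vcpu_load(vcpu, cpu);
	local_irq_restore(flags);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	unsigned long flags;

	local_irq_save(flags);
	/* Save state and detach from this physical CPU. */
	kvm_mips_callbacks->vcpu_put(vcpu, smp_processor_id());
	vcpu->cpu = -1;
	local_irq_restore(flags);
}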
@@ -556,6 +575,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
556/* Building of entry/exception code */ 575/* Building of entry/exception code */
557int kvm_mips_entry_setup(void); 576int kvm_mips_entry_setup(void);
558void *kvm_mips_build_vcpu_run(void *addr); 577void *kvm_mips_build_vcpu_run(void *addr);
578void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler);
559void *kvm_mips_build_exception(void *addr, void *handler); 579void *kvm_mips_build_exception(void *addr, void *handler);
560void *kvm_mips_build_exit(void *addr); 580void *kvm_mips_build_exit(void *addr);
561 581
@@ -580,54 +600,125 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
580u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); 600u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
581 601
582extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, 602extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
583 struct kvm_vcpu *vcpu); 603 struct kvm_vcpu *vcpu,
604 bool write_fault);
584 605
585extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, 606extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
586 struct kvm_vcpu *vcpu); 607 struct kvm_vcpu *vcpu);
587 608
588extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, 609extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
589 struct kvm_mips_tlb *tlb); 610 struct kvm_mips_tlb *tlb,
611 unsigned long gva,
612 bool write_fault);
590 613
591extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, 614extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
592 u32 *opc, 615 u32 *opc,
593 struct kvm_run *run, 616 struct kvm_run *run,
594 struct kvm_vcpu *vcpu); 617 struct kvm_vcpu *vcpu,
595 618 bool write_fault);
596extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
597 u32 *opc,
598 struct kvm_run *run,
599 struct kvm_vcpu *vcpu);
600 619
601extern void kvm_mips_dump_host_tlbs(void); 620extern void kvm_mips_dump_host_tlbs(void);
602extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); 621extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
603extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, 622extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi,
604 unsigned long entrylo0, 623 bool user, bool kernel);
605 unsigned long entrylo1,
606 int flush_dcache_mask);
607extern void kvm_mips_flush_host_tlb(int skip_kseg0);
608extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
609 624
610extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, 625extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu,
611 unsigned long entryhi); 626 unsigned long entryhi);
612extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); 627
613extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, 628void kvm_mips_suspend_mm(int cpu);
614 unsigned long gva); 629void kvm_mips_resume_mm(int cpu);
615extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, 630
616 struct kvm_vcpu *vcpu); 631/* MMU handling */
617extern void kvm_local_flush_tlb_all(void); 632
618extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); 633/**
619extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); 634 * enum kvm_mips_flush - Types of MMU flushes.
620extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); 635 * @KMF_USER: Flush guest user virtual memory mappings.
636 * Guest USeg only.
637 * @KMF_KERN: Flush guest kernel virtual memory mappings.
638 * Guest USeg and KSeg2/3.
639 * @KMF_GPA: Flush guest physical memory mappings.
640 * Also includes KSeg0 if KMF_KERN is set.
641 */
642enum kvm_mips_flush {
643 KMF_USER = 0x0,
644 KMF_KERN = 0x1,
645 KMF_GPA = 0x2,
646};
647void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags);
648bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
649int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
650pgd_t *kvm_pgd_alloc(void);
651void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
652void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
653 bool user);
654void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu);
655void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu);
656
657enum kvm_mips_fault_result {
658 KVM_MIPS_MAPPED = 0,
659 KVM_MIPS_GVA,
660 KVM_MIPS_GPA,
661 KVM_MIPS_TLB,
662 KVM_MIPS_TLBINV,
663 KVM_MIPS_TLBMOD,
664};
665enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
666 unsigned long gva,
667 bool write);
668
669#define KVM_ARCH_WANT_MMU_NOTIFIER
670int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
671int kvm_unmap_hva_range(struct kvm *kvm,
672 unsigned long start, unsigned long end);
673void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
674int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
675int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
676
677static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
678 unsigned long address)
679{
680}
621 681
622/* Emulation */ 682/* Emulation */
623u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); 683int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
624enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); 684enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
685int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
686int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
687
688/**
689 * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault.
690 * @vcpu: Virtual CPU.
691 *
692 * Returns: Whether the TLBL exception was likely due to an instruction
693 * fetch fault rather than a data load fault.
694 */
695static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu)
696{
697 unsigned long badvaddr = vcpu->host_cp0_badvaddr;
698 unsigned long epc = msk_isa16_mode(vcpu->pc);
699 u32 cause = vcpu->host_cp0_cause;
700
701 if (epc == badvaddr)
702 return true;
703
704 /*
705 * Branches may be 32-bit or 16-bit instructions.
706 * This isn't exact, but we don't really support MIPS16 or microMIPS yet
707 * in KVM anyway.
708 */
709 if ((cause & CAUSEF_BD) && badvaddr - epc <= 4)
710 return true;
711
712 return false;
713}
625 714
626extern enum emulation_result kvm_mips_emulate_inst(u32 cause, 715extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
627 u32 *opc, 716 u32 *opc,
628 struct kvm_run *run, 717 struct kvm_run *run,
629 struct kvm_vcpu *vcpu); 718 struct kvm_vcpu *vcpu);
630 719
720long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu);
721
631extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, 722extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
632 u32 *opc, 723 u32 *opc,
633 struct kvm_run *run, 724 struct kvm_run *run,
@@ -761,10 +852,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
761static inline void kvm_arch_free_memslot(struct kvm *kvm, 852static inline void kvm_arch_free_memslot(struct kvm *kvm,
762 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} 853 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
763static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} 854static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
764static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
765static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
766 struct kvm_memory_slot *slot) {}
767static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
768static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 855static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
769static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} 856static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
770static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} 857static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
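The kvm_arch_flush_shadow_all()/kvm_arch_flush_shadow_memslot() stubs removed here are expected to become real functions in arch/mips/kvm/mips.c, dropping the GPA page table entries and then letting the implementation flush any derived state through the new callbacks. A hedged sketch (locking and exact ranges are assumptions of the example):

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	/* Flush the whole guest physical address space... */
	kvm_mips_flush_gpa_pt(kvm, 0, ~0);
	/* ...then any cached GVA page tables / root TLB state derived from it. */
	kvm_mips_callbacks->flush_shadow_all(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	spin_lock(&kvm->mmu_lock);
	kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
			      slot->base_gfn + slot->npages - 1);
	kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
	spin_unlock(&kvm->mmu_lock);
}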
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index ddd57ade1aa8..2abf94f72c0a 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -29,9 +29,11 @@ do { \
29 } \ 29 } \
30} while (0) 30} while (0)
31 31
32extern void tlbmiss_handler_setup_pgd(unsigned long);
33
34/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */
32#define TLBMISS_HANDLER_SETUP_PGD(pgd) \ 35#define TLBMISS_HANDLER_SETUP_PGD(pgd) \
33do { \ 36do { \
34 extern void tlbmiss_handler_setup_pgd(unsigned long); \
35 tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \ 37 tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \
36 htw_set_pwbase((unsigned long)pgd); \ 38 htw_set_pwbase((unsigned long)pgd); \
37} while (0) 39} while (0)
@@ -97,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
97static inline void 99static inline void
98get_new_mmu_context(struct mm_struct *mm, unsigned long cpu) 100get_new_mmu_context(struct mm_struct *mm, unsigned long cpu)
99{ 101{
100 extern void kvm_local_flush_tlb_all(void);
101 unsigned long asid = asid_cache(cpu); 102 unsigned long asid = asid_cache(cpu);
102 103
103 if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) { 104 if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) {
104 if (cpu_has_vtag_icache) 105 if (cpu_has_vtag_icache)
105 flush_icache_all(); 106 flush_icache_all();
106#ifdef CONFIG_KVM
107 kvm_local_flush_tlb_all(); /* start new asid cycle */
108#else
109 local_flush_tlb_all(); /* start new asid cycle */ 107 local_flush_tlb_all(); /* start new asid cycle */
110#endif
111 if (!asid) /* fix version if needed */ 108 if (!asid) /* fix version if needed */
112 asid = asid_first_version(cpu); 109 asid = asid_first_version(cpu);
113 } 110 }
diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h
index 6985eb59b085..a8a0199bf760 100644
--- a/arch/mips/include/uapi/asm/kvm.h
+++ b/arch/mips/include/uapi/asm/kvm.h
@@ -19,6 +19,8 @@
19 * Some parts derived from the x86 version of this file. 19 * Some parts derived from the x86 version of this file.
20 */ 20 */
21 21
22#define __KVM_HAVE_READONLY_MEM
23
22/* 24/*
23 * for KVM_GET_REGS and KVM_SET_REGS 25 * for KVM_GET_REGS and KVM_SET_REGS
24 * 26 *
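Defining __KVM_HAVE_READONLY_MEM is what lets KVM advertise KVM_CAP_READONLY_MEM on MIPS, so userspace can register memory slots whose guest writes are forwarded as MMIO exits instead of being mapped writable. A hedged userspace sketch (vm_fd, the slot number and the host mapping are assumptions of the example):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: register a read-only slot, e.g. for a flash ROM image.
 * Guest stores to this range will now reach userspace as KVM_EXIT_MMIO. */
static int set_readonly_slot(int vm_fd, uint32_t slot, uint64_t gpa,
			     uint64_t size, void *host_mem)
{
	struct kvm_userspace_memory_region region = {
		.slot = slot,
		.flags = KVM_MEM_READONLY,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = (uint64_t)(unsigned long)host_mem,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}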
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 7c56d6b124d1..65067327db12 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -20,7 +20,9 @@ config KVM
20 select EXPORT_UASM 20 select EXPORT_UASM
21 select PREEMPT_NOTIFIERS 21 select PREEMPT_NOTIFIERS
22 select ANON_INODES 22 select ANON_INODES
23 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
23 select KVM_MMIO 24 select KVM_MMIO
25 select MMU_NOTIFIER
24 select SRCU 26 select SRCU
25 ---help--- 27 ---help---
26 Support for hosting Guest kernels. 28 Support for hosting Guest kernels.
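Selecting KVM_GENERIC_DIRTYLOG_READ_PROTECT means the generic dirty-log code calls an arch hook to write-protect the pages whose dirty bits it has just harvested; on MIPS that hook is expected to sit on top of the new kvm_mips_mkclean_gpa_pt() helper. A hedged sketch of the shape of that hook (the real body lives in arch/mips/kvm/mmu.c and may differ):

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset,
					     unsigned long mask)
{
	gfn_t base_gfn = slot->base_gfn + gfn_offset;
	gfn_t start = base_gfn + __ffs(mask);
	gfn_t end = base_gfn + __fls(mask);

	/* Make the written GPA range clean again so future guest writes
	 * fault and set their dirty bits. */
	kvm_mips_mkclean_gpa_pt(kvm, start, end);
}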
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 010cef240688..f8e772564d74 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
16#include <linux/uaccess.h>
16#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
17#include <linux/fs.h> 18#include <linux/fs.h>
18#include <linux/bootmem.h> 19#include <linux/bootmem.h>
@@ -29,28 +30,37 @@
29static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, 30static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
30 union mips_instruction replace) 31 union mips_instruction replace)
31{ 32{
32 unsigned long paddr, flags; 33 unsigned long vaddr = (unsigned long)opc;
33 void *vaddr; 34 int err;
34 35
35 if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { 36retry:
36 paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, 37 /* The GVA page table is still active so use the Linux TLB handlers */
37 (unsigned long)opc); 38 kvm_trap_emul_gva_lockless_begin(vcpu);
38 vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); 39 err = put_user(replace.word, opc);
39 vaddr += paddr & ~PAGE_MASK; 40 kvm_trap_emul_gva_lockless_end(vcpu);
40 memcpy(vaddr, (void *)&replace, sizeof(u32)); 41
41 local_flush_icache_range((unsigned long)vaddr, 42 if (unlikely(err)) {
42 (unsigned long)vaddr + 32); 43 /*
43 kunmap_atomic(vaddr); 44 * We write protect clean pages in GVA page table so normal
44 } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { 45 * Linux TLB mod handler doesn't silently dirty the page.
45 local_irq_save(flags); 46 * Its also possible we raced with a GVA invalidation.
46 memcpy((void *)opc, (void *)&replace, sizeof(u32)); 47 * Try to force the page to become dirty.
47 __local_flush_icache_user_range((unsigned long)opc, 48 */
48 (unsigned long)opc + 32); 49 err = kvm_trap_emul_gva_fault(vcpu, vaddr, true);
49 local_irq_restore(flags); 50 if (unlikely(err)) {
50 } else { 51 kvm_info("%s: Address unwriteable: %p\n",
51 kvm_err("%s: Invalid address: %p\n", __func__, opc); 52 __func__, opc);
52 return -EFAULT; 53 return -EFAULT;
54 }
55
56 /*
57 * Try again. This will likely trigger a TLB refill, which will
58 * fetch the new dirty entry from the GVA page table, which
59 * should then succeed.
60 */
61 goto retry;
53 } 62 }
63 __local_flush_icache_user_range(vaddr, vaddr + 4);
54 64
55 return 0; 65 return 0;
56} 66}
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index aa0937423e28..d40cfaad4529 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -38,23 +38,25 @@
 38 * Compute the return address and perform branch simulation, if required. 38 * Compute the return address and perform branch simulation, if required.
 39 * This function should only be called when a branch delay slot is active. 39 * This function should only be called when a branch delay slot is active.
40 */ 40 */
41unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, 41static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc,
42 unsigned long instpc) 42 unsigned long *out)
43{ 43{
44 unsigned int dspcontrol; 44 unsigned int dspcontrol;
45 union mips_instruction insn; 45 union mips_instruction insn;
46 struct kvm_vcpu_arch *arch = &vcpu->arch; 46 struct kvm_vcpu_arch *arch = &vcpu->arch;
47 long epc = instpc; 47 long epc = instpc;
48 long nextpc = KVM_INVALID_INST; 48 long nextpc;
49 int err;
49 50
50 if (epc & 3) 51 if (epc & 3) {
51 goto unaligned; 52 kvm_err("%s: unaligned epc\n", __func__);
53 return -EINVAL;
54 }
52 55
53 /* Read the instruction */ 56 /* Read the instruction */
54 insn.word = kvm_get_inst((u32 *) epc, vcpu); 57 err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word);
55 58 if (err)
56 if (insn.word == KVM_INVALID_INST) 59 return err;
57 return KVM_INVALID_INST;
58 60
59 switch (insn.i_format.opcode) { 61 switch (insn.i_format.opcode) {
60 /* jr and jalr are in r_format format. */ 62 /* jr and jalr are in r_format format. */
@@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
66 case jr_op: 68 case jr_op:
67 nextpc = arch->gprs[insn.r_format.rs]; 69 nextpc = arch->gprs[insn.r_format.rs];
68 break; 70 break;
71 default:
72 return -EINVAL;
69 } 73 }
70 break; 74 break;
71 75
@@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
114 nextpc = epc; 118 nextpc = epc;
115 break; 119 break;
116 case bposge32_op: 120 case bposge32_op:
117 if (!cpu_has_dsp) 121 if (!cpu_has_dsp) {
118 goto sigill; 122 kvm_err("%s: DSP branch but not DSP ASE\n",
123 __func__);
124 return -EINVAL;
125 }
119 126
120 dspcontrol = rddsp(0x01); 127 dspcontrol = rddsp(0x01);
121 128
@@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
125 epc += 8; 132 epc += 8;
126 nextpc = epc; 133 nextpc = epc;
127 break; 134 break;
135 default:
136 return -EINVAL;
128 } 137 }
129 break; 138 break;
130 139
@@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
189 /* And now the FPA/cp1 branch instructions. */ 198 /* And now the FPA/cp1 branch instructions. */
190 case cop1_op: 199 case cop1_op:
191 kvm_err("%s: unsupported cop1_op\n", __func__); 200 kvm_err("%s: unsupported cop1_op\n", __func__);
192 break; 201 return -EINVAL;
193 202
194#ifdef CONFIG_CPU_MIPSR6 203#ifdef CONFIG_CPU_MIPSR6
195 /* R6 added the following compact branches with forbidden slots */ 204 /* R6 added the following compact branches with forbidden slots */
@@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
198 /* only rt == 0 isn't compact branch */ 207 /* only rt == 0 isn't compact branch */
199 if (insn.i_format.rt != 0) 208 if (insn.i_format.rt != 0)
200 goto compact_branch; 209 goto compact_branch;
201 break; 210 return -EINVAL;
202 case pop10_op: 211 case pop10_op:
203 case pop30_op: 212 case pop30_op:
204 /* only rs == rt == 0 is reserved, rest are compact branches */ 213 /* only rs == rt == 0 is reserved, rest are compact branches */
205 if (insn.i_format.rs != 0 || insn.i_format.rt != 0) 214 if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
206 goto compact_branch; 215 goto compact_branch;
207 break; 216 return -EINVAL;
208 case pop66_op: 217 case pop66_op:
209 case pop76_op: 218 case pop76_op:
210 /* only rs == 0 isn't compact branch */ 219 /* only rs == 0 isn't compact branch */
211 if (insn.i_format.rs != 0) 220 if (insn.i_format.rs != 0)
212 goto compact_branch; 221 goto compact_branch;
213 break; 222 return -EINVAL;
214compact_branch: 223compact_branch:
215 /* 224 /*
216 * If we've hit an exception on the forbidden slot, then 225 * If we've hit an exception on the forbidden slot, then
@@ -221,42 +230,74 @@ compact_branch:
221 break; 230 break;
222#else 231#else
223compact_branch: 232compact_branch:
224 /* Compact branches not supported before R6 */ 233 /* Fall through - Compact branches not supported before R6 */
225 break;
226#endif 234#endif
235 default:
236 return -EINVAL;
227 } 237 }
228 238
229 return nextpc; 239 *out = nextpc;
230 240 return 0;
231unaligned:
232 kvm_err("%s: unaligned epc\n", __func__);
233 return nextpc;
234
235sigill:
236 kvm_err("%s: DSP branch but not DSP ASE\n", __func__);
237 return nextpc;
238} 241}
239 242
240enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) 243enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
241{ 244{
242 unsigned long branch_pc; 245 int err;
243 enum emulation_result er = EMULATE_DONE;
244 246
245 if (cause & CAUSEF_BD) { 247 if (cause & CAUSEF_BD) {
246 branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc); 248 err = kvm_compute_return_epc(vcpu, vcpu->arch.pc,
247 if (branch_pc == KVM_INVALID_INST) { 249 &vcpu->arch.pc);
248 er = EMULATE_FAIL; 250 if (err)
249 } else { 251 return EMULATE_FAIL;
250 vcpu->arch.pc = branch_pc; 252 } else {
251 kvm_debug("BD update_pc(): New PC: %#lx\n",
252 vcpu->arch.pc);
253 }
254 } else
255 vcpu->arch.pc += 4; 253 vcpu->arch.pc += 4;
254 }
256 255
257 kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc); 256 kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc);
258 257
259 return er; 258 return EMULATE_DONE;
259}
260
261/**
262 * kvm_get_badinstr() - Get bad instruction encoding.
263 * @opc: Guest pointer to faulting instruction.
264 * @vcpu: KVM VCPU information.
265 *
266 * Gets the instruction encoding of the faulting instruction, using the saved
267 * BadInstr register value if it exists, otherwise falling back to reading guest
268 * memory at @opc.
269 *
 270 * Returns: 0 on success (with the faulting instruction encoding in @out), else an error.
271 */
272int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
273{
274 if (cpu_has_badinstr) {
275 *out = vcpu->arch.host_cp0_badinstr;
276 return 0;
277 } else {
278 return kvm_get_inst(opc, vcpu, out);
279 }
280}
281
282/**
283 * kvm_get_badinstrp() - Get bad prior instruction encoding.
284 * @opc: Guest pointer to prior faulting instruction.
285 * @vcpu: KVM VCPU information.
286 *
287 * Gets the instruction encoding of the prior faulting instruction (the branch
288 * containing the delay slot which faulted), using the saved BadInstrP register
289 * value if it exists, otherwise falling back to reading guest memory at @opc.
290 *
 291 * Returns: 0 on success (with the prior faulting instruction encoding in @out), else an error.
292 */
293int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
294{
295 if (cpu_has_badinstrp) {
296 *out = vcpu->arch.host_cp0_badinstrp;
297 return 0;
298 } else {
299 return kvm_get_inst(opc, vcpu, out);
300 }
260} 301}
261 302
262/** 303/**
@@ -856,22 +897,30 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
856static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, 897static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
857 struct kvm_mips_tlb *tlb) 898 struct kvm_mips_tlb *tlb)
858{ 899{
900 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
901 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
859 int cpu, i; 902 int cpu, i;
860 bool user; 903 bool user;
861 904
862 /* No need to flush for entries which are already invalid */ 905 /* No need to flush for entries which are already invalid */
863 if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) 906 if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
864 return; 907 return;
908 /* Don't touch host kernel page tables or TLB mappings */
909 if ((unsigned long)tlb->tlb_hi > 0x7fffffff)
910 return;
865 /* User address space doesn't need flushing for KSeg2/3 changes */ 911 /* User address space doesn't need flushing for KSeg2/3 changes */
866 user = tlb->tlb_hi < KVM_GUEST_KSEG0; 912 user = tlb->tlb_hi < KVM_GUEST_KSEG0;
867 913
868 preempt_disable(); 914 preempt_disable();
869 915
916 /* Invalidate page table entries */
917 kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user);
918
870 /* 919 /*
871 * Probe the shadow host TLB for the entry being overwritten, if one 920 * Probe the shadow host TLB for the entry being overwritten, if one
872 * matches, invalidate it 921 * matches, invalidate it
873 */ 922 */
874 kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); 923 kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true);
875 924
876 /* Invalidate the whole ASID on other CPUs */ 925 /* Invalidate the whole ASID on other CPUs */
877 cpu = smp_processor_id(); 926 cpu = smp_processor_id();
@@ -879,8 +928,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
879 if (i == cpu) 928 if (i == cpu)
880 continue; 929 continue;
881 if (user) 930 if (user)
882 vcpu->arch.guest_user_asid[i] = 0; 931 cpu_context(i, user_mm) = 0;
883 vcpu->arch.guest_kernel_asid[i] = 0; 932 cpu_context(i, kern_mm) = 0;
884 } 933 }
885 934
886 preempt_enable(); 935 preempt_enable();
@@ -1017,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
1017 unsigned int mask = MIPS_CONF_M; 1066 unsigned int mask = MIPS_CONF_M;
1018 1067
1019 /* KScrExist */ 1068 /* KScrExist */
1020 mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; 1069 mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT;
1021 1070
1022 return mask; 1071 return mask;
1023} 1072}
@@ -1056,6 +1105,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
1056 struct kvm_vcpu *vcpu) 1105 struct kvm_vcpu *vcpu)
1057{ 1106{
1058 struct mips_coproc *cop0 = vcpu->arch.cop0; 1107 struct mips_coproc *cop0 = vcpu->arch.cop0;
1108 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
1059 enum emulation_result er = EMULATE_DONE; 1109 enum emulation_result er = EMULATE_DONE;
1060 u32 rt, rd, sel; 1110 u32 rt, rd, sel;
1061 unsigned long curr_pc; 1111 unsigned long curr_pc;
@@ -1150,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
1150 er = EMULATE_FAIL; 1200 er = EMULATE_FAIL;
1151 break; 1201 break;
1152 } 1202 }
1153#define C0_EBASE_CORE_MASK 0xff
1154 if ((rd == MIPS_CP0_PRID) && (sel == 1)) { 1203 if ((rd == MIPS_CP0_PRID) && (sel == 1)) {
1155 /* Preserve CORE number */ 1204 /*
1156 kvm_change_c0_guest_ebase(cop0, 1205 * Preserve core number, and keep the exception
1157 ~(C0_EBASE_CORE_MASK), 1206 * base in guest KSeg0.
1207 */
1208 kvm_change_c0_guest_ebase(cop0, 0x1ffff000,
1158 vcpu->arch.gprs[rt]); 1209 vcpu->arch.gprs[rt]);
1159 kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
1160 kvm_read_c0_guest_ebase(cop0));
1161 } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { 1210 } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
1162 u32 nasid = 1211 u32 nasid =
1163 vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; 1212 vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
@@ -1169,6 +1218,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
1169 nasid); 1218 nasid);
1170 1219
1171 /* 1220 /*
1221 * Flush entries from the GVA page
1222 * tables.
1223 * Guest user page table will get
1224 * flushed lazily on re-entry to guest
1225 * user if the guest ASID actually
1226 * changes.
1227 */
1228 kvm_mips_flush_gva_pt(kern_mm->pgd,
1229 KMF_KERN);
1230
1231 /*
1172 * Regenerate/invalidate kernel MMU 1232 * Regenerate/invalidate kernel MMU
1173 * context. 1233 * context.
1174 * The user MMU context will be 1234 * The user MMU context will be
@@ -1178,13 +1238,10 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
1178 */ 1238 */
1179 preempt_disable(); 1239 preempt_disable();
1180 cpu = smp_processor_id(); 1240 cpu = smp_processor_id();
1181 kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, 1241 get_new_mmu_context(kern_mm, cpu);
1182 cpu, vcpu);
1183 vcpu->arch.guest_kernel_asid[cpu] =
1184 vcpu->arch.guest_kernel_mm.context.asid[cpu];
1185 for_each_possible_cpu(i) 1242 for_each_possible_cpu(i)
1186 if (i != cpu) 1243 if (i != cpu)
1187 vcpu->arch.guest_kernel_asid[i] = 0; 1244 cpu_context(i, kern_mm) = 0;
1188 preempt_enable(); 1245 preempt_enable();
1189 } 1246 }
1190 kvm_write_c0_guest_entryhi(cop0, 1247 kvm_write_c0_guest_entryhi(cop0,
@@ -1639,12 +1696,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
1639 return er; 1696 return er;
1640} 1697}
1641 1698
1699static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
1700 unsigned long curr_pc,
1701 unsigned long addr,
1702 struct kvm_run *run,
1703 struct kvm_vcpu *vcpu,
1704 u32 cause)
1705{
1706 int err;
1707
1708 for (;;) {
1709 /* Carefully attempt the cache operation */
1710 kvm_trap_emul_gva_lockless_begin(vcpu);
1711 err = fn(addr);
1712 kvm_trap_emul_gva_lockless_end(vcpu);
1713
1714 if (likely(!err))
1715 return EMULATE_DONE;
1716
1717 /*
1718 * Try to handle the fault and retry, maybe we just raced with a
1719 * GVA invalidation.
1720 */
1721 switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) {
1722 case KVM_MIPS_GVA:
1723 case KVM_MIPS_GPA:
1724 /* bad virtual or physical address */
1725 return EMULATE_FAIL;
1726 case KVM_MIPS_TLB:
1727 /* no matching guest TLB */
1728 vcpu->arch.host_cp0_badvaddr = addr;
1729 vcpu->arch.pc = curr_pc;
1730 kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu);
1731 return EMULATE_EXCEPT;
1732 case KVM_MIPS_TLBINV:
1733 /* invalid matching guest TLB */
1734 vcpu->arch.host_cp0_badvaddr = addr;
1735 vcpu->arch.pc = curr_pc;
1736 kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu);
1737 return EMULATE_EXCEPT;
1738 default:
1739 break;
1740 };
1741 }
1742}
1743
1642enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, 1744enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
1643 u32 *opc, u32 cause, 1745 u32 *opc, u32 cause,
1644 struct kvm_run *run, 1746 struct kvm_run *run,
1645 struct kvm_vcpu *vcpu) 1747 struct kvm_vcpu *vcpu)
1646{ 1748{
1647 struct mips_coproc *cop0 = vcpu->arch.cop0;
1648 enum emulation_result er = EMULATE_DONE; 1749 enum emulation_result er = EMULATE_DONE;
1649 u32 cache, op_inst, op, base; 1750 u32 cache, op_inst, op, base;
1650 s16 offset; 1751 s16 offset;
@@ -1701,80 +1802,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
1701 goto done; 1802 goto done;
1702 } 1803 }
1703 1804
1704 preempt_disable();
1705 if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
1706 if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
1707 kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
1708 kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
1709 __func__, va, vcpu, read_c0_entryhi());
1710 er = EMULATE_FAIL;
1711 preempt_enable();
1712 goto done;
1713 }
1714 } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
1715 KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
1716 int index;
1717
1718 /* If an entry already exists then skip */
1719 if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0)
1720 goto skip_fault;
1721
1722 /*
1723 * If address not in the guest TLB, then give the guest a fault,
1724 * the resulting handler will do the right thing
1725 */
1726 index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
1727 (kvm_read_c0_guest_entryhi
1728 (cop0) & KVM_ENTRYHI_ASID));
1729
1730 if (index < 0) {
1731 vcpu->arch.host_cp0_badvaddr = va;
1732 vcpu->arch.pc = curr_pc;
1733 er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
1734 vcpu);
1735 preempt_enable();
1736 goto dont_update_pc;
1737 } else {
1738 struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index];
1739 /*
1740 * Check if the entry is valid, if not then setup a TLB
1741 * invalid exception to the guest
1742 */
1743 if (!TLB_IS_VALID(*tlb, va)) {
1744 vcpu->arch.host_cp0_badvaddr = va;
1745 vcpu->arch.pc = curr_pc;
1746 er = kvm_mips_emulate_tlbinv_ld(cause, NULL,
1747 run, vcpu);
1748 preempt_enable();
1749 goto dont_update_pc;
1750 }
1751 /*
1752 * We fault an entry from the guest tlb to the
1753 * shadow host TLB
1754 */
1755 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
1756 kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
1757 __func__, va, index, vcpu,
1758 read_c0_entryhi());
1759 er = EMULATE_FAIL;
1760 preempt_enable();
1761 goto done;
1762 }
1763 }
1764 } else {
1765 kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
1766 cache, op, base, arch->gprs[base], offset);
1767 er = EMULATE_FAIL;
1768 preempt_enable();
1769 goto done;
1770
1771 }
1772
1773skip_fault:
1774 /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ 1805 /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */
1775 if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { 1806 if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) {
1776 flush_dcache_line(va); 1807 /*
1777 1808 * Perform the dcache part of icache synchronisation on the
1809 * guest's behalf.
1810 */
1811 er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
1812 curr_pc, va, run, vcpu, cause);
1813 if (er != EMULATE_DONE)
1814 goto done;
1778#ifdef CONFIG_KVM_MIPS_DYN_TRANS 1815#ifdef CONFIG_KVM_MIPS_DYN_TRANS
1779 /* 1816 /*
1780 * Replace the CACHE instruction, with a SYNCI, not the same, 1817 * Replace the CACHE instruction, with a SYNCI, not the same,
@@ -1783,8 +1820,15 @@ skip_fault:
1783 kvm_mips_trans_cache_va(inst, opc, vcpu); 1820 kvm_mips_trans_cache_va(inst, opc, vcpu);
1784#endif 1821#endif
1785 } else if (op_inst == Hit_Invalidate_I) { 1822 } else if (op_inst == Hit_Invalidate_I) {
1786 flush_dcache_line(va); 1823 /* Perform the icache synchronisation on the guest's behalf */
1787 flush_icache_line(va); 1824 er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
1825 curr_pc, va, run, vcpu, cause);
1826 if (er != EMULATE_DONE)
1827 goto done;
1828 er = kvm_mips_guest_cache_op(protected_flush_icache_line,
1829 curr_pc, va, run, vcpu, cause);
1830 if (er != EMULATE_DONE)
1831 goto done;
1788 1832
1789#ifdef CONFIG_KVM_MIPS_DYN_TRANS 1833#ifdef CONFIG_KVM_MIPS_DYN_TRANS
1790 /* Replace the CACHE instruction, with a SYNCI */ 1834 /* Replace the CACHE instruction, with a SYNCI */
@@ -1796,17 +1840,13 @@ skip_fault:
1796 er = EMULATE_FAIL; 1840 er = EMULATE_FAIL;
1797 } 1841 }
1798 1842
1799 preempt_enable();
1800done: 1843done:
1801 /* Rollback PC only if emulation was unsuccessful */ 1844 /* Rollback PC only if emulation was unsuccessful */
1802 if (er == EMULATE_FAIL) 1845 if (er == EMULATE_FAIL)
1803 vcpu->arch.pc = curr_pc; 1846 vcpu->arch.pc = curr_pc;
1804 1847 /* Guest exception needs guest to resume */
1805dont_update_pc: 1848 if (er == EMULATE_EXCEPT)
1806 /* 1849 er = EMULATE_DONE;
1807 * This is for exceptions whose emulation updates the PC, so do not
1808 * overwrite the PC under any circumstances
1809 */
1810 1850
1811 return er; 1851 return er;
1812} 1852}
@@ -1817,12 +1857,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
1817{ 1857{
1818 union mips_instruction inst; 1858 union mips_instruction inst;
1819 enum emulation_result er = EMULATE_DONE; 1859 enum emulation_result er = EMULATE_DONE;
1860 int err;
1820 1861
1821 /* Fetch the instruction. */ 1862 /* Fetch the instruction. */
1822 if (cause & CAUSEF_BD) 1863 if (cause & CAUSEF_BD)
1823 opc += 1; 1864 opc += 1;
1824 1865 err = kvm_get_badinstr(opc, vcpu, &inst.word);
1825 inst.word = kvm_get_inst(opc, vcpu); 1866 if (err)
1867 return EMULATE_FAIL;
1826 1868
1827 switch (inst.r_format.opcode) { 1869 switch (inst.r_format.opcode) {
1828 case cop0_op: 1870 case cop0_op:
@@ -1874,6 +1916,22 @@ unknown:
1874 return er; 1916 return er;
1875} 1917}
1876 1918
1919/**
1920 * kvm_mips_guest_exception_base() - Find guest exception vector base address.
1921 *
1922 * Returns: The base address of the current guest exception vector, taking
1923 * both Guest.CP0_Status.BEV and Guest.CP0_EBase into account.
1924 */
1925long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu)
1926{
1927 struct mips_coproc *cop0 = vcpu->arch.cop0;
1928
1929 if (kvm_read_c0_guest_status(cop0) & ST0_BEV)
1930 return KVM_GUEST_CKSEG1ADDR(0x1fc00200);
1931 else
1932 return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE;
1933}
1934
1877enum emulation_result kvm_mips_emulate_syscall(u32 cause, 1935enum emulation_result kvm_mips_emulate_syscall(u32 cause,
1878 u32 *opc, 1936 u32 *opc,
1879 struct kvm_run *run, 1937 struct kvm_run *run,
@@ -1899,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause,
1899 (EXCCODE_SYS << CAUSEB_EXCCODE)); 1957 (EXCCODE_SYS << CAUSEB_EXCCODE));
1900 1958
1901 /* Set PC to the exception entry point */ 1959 /* Set PC to the exception entry point */
1902 arch->pc = KVM_GUEST_KSEG0 + 0x180; 1960 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
1903 1961
1904 } else { 1962 } else {
1905 kvm_err("Trying to deliver SYSCALL when EXL is already set\n"); 1963 kvm_err("Trying to deliver SYSCALL when EXL is already set\n");
@@ -1933,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
1933 arch->pc); 1991 arch->pc);
1934 1992
1935 /* set pc to the exception entry point */ 1993 /* set pc to the exception entry point */
1936 arch->pc = KVM_GUEST_KSEG0 + 0x0; 1994 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
1937 1995
1938 } else { 1996 } else {
1939 kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", 1997 kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
1940 arch->pc); 1998 arch->pc);
1941 1999
1942 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2000 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
1943 } 2001 }
1944 2002
1945 kvm_change_c0_guest_cause(cop0, (0xff), 2003 kvm_change_c0_guest_cause(cop0, (0xff),
@@ -1949,8 +2007,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
1949 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2007 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
1950 /* XXXKYMA: is the context register used by linux??? */ 2008 /* XXXKYMA: is the context register used by linux??? */
1951 kvm_write_c0_guest_entryhi(cop0, entryhi); 2009 kvm_write_c0_guest_entryhi(cop0, entryhi);
1952 /* Blow away the shadow host TLBs */
1953 kvm_mips_flush_host_tlb(1);
1954 2010
1955 return EMULATE_DONE; 2011 return EMULATE_DONE;
1956} 2012}
@@ -1978,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
1978 2034
1979 kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n", 2035 kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n",
1980 arch->pc); 2036 arch->pc);
1981
1982 /* set pc to the exception entry point */
1983 arch->pc = KVM_GUEST_KSEG0 + 0x180;
1984
1985 } else { 2037 } else {
1986 kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", 2038 kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
1987 arch->pc); 2039 arch->pc);
1988 arch->pc = KVM_GUEST_KSEG0 + 0x180;
1989 } 2040 }
1990 2041
2042 /* set pc to the exception entry point */
2043 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2044
1991 kvm_change_c0_guest_cause(cop0, (0xff), 2045 kvm_change_c0_guest_cause(cop0, (0xff),
1992 (EXCCODE_TLBL << CAUSEB_EXCCODE)); 2046 (EXCCODE_TLBL << CAUSEB_EXCCODE));
1993 2047
@@ -1995,8 +2049,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
1995 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2049 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
1996 /* XXXKYMA: is the context register used by linux??? */ 2050 /* XXXKYMA: is the context register used by linux??? */
1997 kvm_write_c0_guest_entryhi(cop0, entryhi); 2051 kvm_write_c0_guest_entryhi(cop0, entryhi);
1998 /* Blow away the shadow host TLBs */
1999 kvm_mips_flush_host_tlb(1);
2000 2052
2001 return EMULATE_DONE; 2053 return EMULATE_DONE;
2002} 2054}
@@ -2025,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
2025 arch->pc); 2077 arch->pc);
2026 2078
2027 /* Set PC to the exception entry point */ 2079 /* Set PC to the exception entry point */
2028 arch->pc = KVM_GUEST_KSEG0 + 0x0; 2080 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
2029 } else { 2081 } else {
2030 kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", 2082 kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
2031 arch->pc); 2083 arch->pc);
2032 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2084 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2033 } 2085 }
2034 2086
2035 kvm_change_c0_guest_cause(cop0, (0xff), 2087 kvm_change_c0_guest_cause(cop0, (0xff),
@@ -2039,8 +2091,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
2039 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2091 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
2040 /* XXXKYMA: is the context register used by linux??? */ 2092 /* XXXKYMA: is the context register used by linux??? */
2041 kvm_write_c0_guest_entryhi(cop0, entryhi); 2093 kvm_write_c0_guest_entryhi(cop0, entryhi);
2042 /* Blow away the shadow host TLBs */
2043 kvm_mips_flush_host_tlb(1);
2044 2094
2045 return EMULATE_DONE; 2095 return EMULATE_DONE;
2046} 2096}
@@ -2067,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
2067 2117
2068 kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n", 2118 kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n",
2069 arch->pc); 2119 arch->pc);
2070
2071 /* Set PC to the exception entry point */
2072 arch->pc = KVM_GUEST_KSEG0 + 0x180;
2073 } else { 2120 } else {
2074 kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", 2121 kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
2075 arch->pc); 2122 arch->pc);
2076 arch->pc = KVM_GUEST_KSEG0 + 0x180;
2077 } 2123 }
2078 2124
2125 /* Set PC to the exception entry point */
2126 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2127
2079 kvm_change_c0_guest_cause(cop0, (0xff), 2128 kvm_change_c0_guest_cause(cop0, (0xff),
2080 (EXCCODE_TLBS << CAUSEB_EXCCODE)); 2129 (EXCCODE_TLBS << CAUSEB_EXCCODE));
2081 2130
@@ -2083,41 +2132,10 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
2083 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2132 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
2084 /* XXXKYMA: is the context register used by linux??? */ 2133 /* XXXKYMA: is the context register used by linux??? */
2085 kvm_write_c0_guest_entryhi(cop0, entryhi); 2134 kvm_write_c0_guest_entryhi(cop0, entryhi);
2086 /* Blow away the shadow host TLBs */
2087 kvm_mips_flush_host_tlb(1);
2088 2135
2089 return EMULATE_DONE; 2136 return EMULATE_DONE;
2090} 2137}
2091 2138
2092/* TLBMOD: store into address matching TLB with Dirty bit off */
2093enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
2094 struct kvm_run *run,
2095 struct kvm_vcpu *vcpu)
2096{
2097 enum emulation_result er = EMULATE_DONE;
2098#ifdef DEBUG
2099 struct mips_coproc *cop0 = vcpu->arch.cop0;
2100 unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
2101 (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
2102 int index;
2103
2104 /* If address not in the guest TLB, then we are in trouble */
2105 index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
2106 if (index < 0) {
2107 /* XXXKYMA Invalidate and retry */
2108 kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr);
2109 kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n",
2110 __func__, entryhi);
2111 kvm_mips_dump_guest_tlbs(vcpu);
2112 kvm_mips_dump_host_tlbs();
2113 return EMULATE_FAIL;
2114 }
2115#endif
2116
2117 er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
2118 return er;
2119}
2120
2121enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, 2139enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
2122 u32 *opc, 2140 u32 *opc,
2123 struct kvm_run *run, 2141 struct kvm_run *run,
@@ -2140,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
2140 2158
2141 kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n", 2159 kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n",
2142 arch->pc); 2160 arch->pc);
2143
2144 arch->pc = KVM_GUEST_KSEG0 + 0x180;
2145 } else { 2161 } else {
2146 kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n", 2162 kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n",
2147 arch->pc); 2163 arch->pc);
2148 arch->pc = KVM_GUEST_KSEG0 + 0x180;
2149 } 2164 }
2150 2165
2166 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2167
2151 kvm_change_c0_guest_cause(cop0, (0xff), 2168 kvm_change_c0_guest_cause(cop0, (0xff),
2152 (EXCCODE_MOD << CAUSEB_EXCCODE)); 2169 (EXCCODE_MOD << CAUSEB_EXCCODE));
2153 2170
@@ -2155,8 +2172,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
2155 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2172 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
2156 /* XXXKYMA: is the context register used by linux??? */ 2173 /* XXXKYMA: is the context register used by linux??? */
2157 kvm_write_c0_guest_entryhi(cop0, entryhi); 2174 kvm_write_c0_guest_entryhi(cop0, entryhi);
2158 /* Blow away the shadow host TLBs */
2159 kvm_mips_flush_host_tlb(1);
2160 2175
2161 return EMULATE_DONE; 2176 return EMULATE_DONE;
2162} 2177}
@@ -2181,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
2181 2196
2182 } 2197 }
2183 2198
2184 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2199 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2185 2200
2186 kvm_change_c0_guest_cause(cop0, (0xff), 2201 kvm_change_c0_guest_cause(cop0, (0xff),
2187 (EXCCODE_CPU << CAUSEB_EXCCODE)); 2202 (EXCCODE_CPU << CAUSEB_EXCCODE));
@@ -2215,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
2215 (EXCCODE_RI << CAUSEB_EXCCODE)); 2230 (EXCCODE_RI << CAUSEB_EXCCODE));
2216 2231
2217 /* Set PC to the exception entry point */ 2232 /* Set PC to the exception entry point */
2218 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2233 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2219 2234
2220 } else { 2235 } else {
2221 kvm_err("Trying to deliver RI when EXL is already set\n"); 2236 kvm_err("Trying to deliver RI when EXL is already set\n");
@@ -2250,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
2250 (EXCCODE_BP << CAUSEB_EXCCODE)); 2265 (EXCCODE_BP << CAUSEB_EXCCODE));
2251 2266
2252 /* Set PC to the exception entry point */ 2267 /* Set PC to the exception entry point */
2253 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2268 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2254 2269
2255 } else { 2270 } else {
2256 kvm_err("Trying to deliver BP when EXL is already set\n"); 2271 kvm_err("Trying to deliver BP when EXL is already set\n");
@@ -2285,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
2285 (EXCCODE_TR << CAUSEB_EXCCODE)); 2300 (EXCCODE_TR << CAUSEB_EXCCODE));
2286 2301
2287 /* Set PC to the exception entry point */ 2302 /* Set PC to the exception entry point */
2288 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2303 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2289 2304
2290 } else { 2305 } else {
2291 kvm_err("Trying to deliver TRAP when EXL is already set\n"); 2306 kvm_err("Trying to deliver TRAP when EXL is already set\n");
@@ -2320,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
2320 (EXCCODE_MSAFPE << CAUSEB_EXCCODE)); 2335 (EXCCODE_MSAFPE << CAUSEB_EXCCODE));
2321 2336
2322 /* Set PC to the exception entry point */ 2337 /* Set PC to the exception entry point */
2323 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2338 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2324 2339
2325 } else { 2340 } else {
2326 kvm_err("Trying to deliver MSAFPE when EXL is already set\n"); 2341 kvm_err("Trying to deliver MSAFPE when EXL is already set\n");
@@ -2355,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
2355 (EXCCODE_FPE << CAUSEB_EXCCODE)); 2370 (EXCCODE_FPE << CAUSEB_EXCCODE));
2356 2371
2357 /* Set PC to the exception entry point */ 2372 /* Set PC to the exception entry point */
2358 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2373 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2359 2374
2360 } else { 2375 } else {
2361 kvm_err("Trying to deliver FPE when EXL is already set\n"); 2376 kvm_err("Trying to deliver FPE when EXL is already set\n");
@@ -2390,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
2390 (EXCCODE_MSADIS << CAUSEB_EXCCODE)); 2405 (EXCCODE_MSADIS << CAUSEB_EXCCODE));
2391 2406
2392 /* Set PC to the exception entry point */ 2407 /* Set PC to the exception entry point */
2393 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2408 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2394 2409
2395 } else { 2410 } else {
2396 kvm_err("Trying to deliver MSADIS when EXL is already set\n"); 2411 kvm_err("Trying to deliver MSADIS when EXL is already set\n");
@@ -2409,6 +2424,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
2409 enum emulation_result er = EMULATE_DONE; 2424 enum emulation_result er = EMULATE_DONE;
2410 unsigned long curr_pc; 2425 unsigned long curr_pc;
2411 union mips_instruction inst; 2426 union mips_instruction inst;
2427 int err;
2412 2428
2413 /* 2429 /*
2414 * Update PC and hold onto current PC in case there is 2430 * Update PC and hold onto current PC in case there is
@@ -2422,11 +2438,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
2422 /* Fetch the instruction. */ 2438 /* Fetch the instruction. */
2423 if (cause & CAUSEF_BD) 2439 if (cause & CAUSEF_BD)
2424 opc += 1; 2440 opc += 1;
2425 2441 err = kvm_get_badinstr(opc, vcpu, &inst.word);
2426 inst.word = kvm_get_inst(opc, vcpu); 2442 if (err) {
2427 2443 kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err);
2428 if (inst.word == KVM_INVALID_INST) {
2429 kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
2430 return EMULATE_FAIL; 2444 return EMULATE_FAIL;
2431 } 2445 }
2432 2446
@@ -2557,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause,
2557 (exccode << CAUSEB_EXCCODE)); 2571 (exccode << CAUSEB_EXCCODE));
2558 2572
2559 /* Set PC to the exception entry point */ 2573 /* Set PC to the exception entry point */
2560 arch->pc = KVM_GUEST_KSEG0 + 0x180; 2574 arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
2561 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); 2575 kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
2562 2576
2563 kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n", 2577 kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n",
@@ -2670,7 +2684,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause,
2670enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, 2684enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
2671 u32 *opc, 2685 u32 *opc,
2672 struct kvm_run *run, 2686 struct kvm_run *run,
2673 struct kvm_vcpu *vcpu) 2687 struct kvm_vcpu *vcpu,
2688 bool write_fault)
2674{ 2689{
2675 enum emulation_result er = EMULATE_DONE; 2690 enum emulation_result er = EMULATE_DONE;
2676 u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; 2691 u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
@@ -2726,7 +2741,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
2726 * OK we have a Guest TLB entry, now inject it into the 2741 * OK we have a Guest TLB entry, now inject it into the
2727 * shadow host TLB 2742 * shadow host TLB
2728 */ 2743 */
2729 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { 2744 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va,
2745 write_fault)) {
2730 kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", 2746 kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
2731 __func__, va, index, vcpu, 2747 __func__, va, index, vcpu,
2732 read_c0_entryhi()); 2748 read_c0_entryhi());
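
Throughout the emulate.c hunks above, the fixed KVM_GUEST_KSEG0 + 0x180/0x200 exception vectors are replaced by kvm_mips_guest_exception_base(vcpu) plus the offset, so exception delivery follows the guest's own view of its exception base instead of a hard-coded KSEG0 address. The helper itself is defined outside the part of the diff shown here; the following is only a sketch of the kind of logic it implies, and the accessor, mask and boot-vector constants used are assumptions rather than quotes from the patch:

/*
 * Sketch only: derive the guest exception vector base.  Accessor and
 * constant names below are illustrative assumptions.
 */
static inline unsigned long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu)
{
	struct mips_coproc *cop0 = vcpu->arch.cop0;

	if (kvm_read_c0_guest_status(cop0) & ST0_BEV)
		/* boot-time vectors: 0xbfc00200 + offset */
		return KVM_GUEST_CKSEG1ADDR(0x1fc00200);

	return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE;
}
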
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index e92fb190e2d6..c5b254c4d0da 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -12,8 +12,11 @@
12 */ 12 */
13 13
14#include <linux/kvm_host.h> 14#include <linux/kvm_host.h>
15#include <linux/log2.h>
16#include <asm/mmu_context.h>
15#include <asm/msa.h> 17#include <asm/msa.h>
16#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/tlbex.h>
17#include <asm/uasm.h> 20#include <asm/uasm.h>
18 21
19/* Register names */ 22/* Register names */
@@ -50,6 +53,8 @@
50/* Some CP0 registers */ 53/* Some CP0 registers */
51#define C0_HWRENA 7, 0 54#define C0_HWRENA 7, 0
52#define C0_BADVADDR 8, 0 55#define C0_BADVADDR 8, 0
56#define C0_BADINSTR 8, 1
57#define C0_BADINSTRP 8, 2
53#define C0_ENTRYHI 10, 0 58#define C0_ENTRYHI 10, 0
54#define C0_STATUS 12, 0 59#define C0_STATUS 12, 0
55#define C0_CAUSE 13, 0 60#define C0_CAUSE 13, 0
@@ -89,6 +94,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr);
89static void *kvm_mips_build_ret_to_guest(void *addr); 94static void *kvm_mips_build_ret_to_guest(void *addr);
90static void *kvm_mips_build_ret_to_host(void *addr); 95static void *kvm_mips_build_ret_to_host(void *addr);
91 96
97/*
98 * The version of this function in tlbex.c uses current_cpu_type(), but for KVM
99 * we assume symmetry.
100 */
101static int c0_kscratch(void)
102{
103 switch (boot_cpu_type()) {
104 case CPU_XLP:
105 case CPU_XLR:
106 return 22;
107 default:
108 return 31;
109 }
110}
111
92/** 112/**
93 * kvm_mips_entry_setup() - Perform global setup for entry code. 113 * kvm_mips_entry_setup() - Perform global setup for entry code.
94 * 114 *
@@ -103,18 +123,21 @@ int kvm_mips_entry_setup(void)
103 * We prefer to use KScratchN registers if they are available over the 123 * We prefer to use KScratchN registers if they are available over the
104 * defaults above, which may not work on all cores. 124 * defaults above, which may not work on all cores.
105 */ 125 */
106 unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; 126 unsigned int kscratch_mask = cpu_data[0].kscratch_mask;
127
128 if (pgd_reg != -1)
129 kscratch_mask &= ~BIT(pgd_reg);
107 130
108 /* Pick a scratch register for storing VCPU */ 131 /* Pick a scratch register for storing VCPU */
109 if (kscratch_mask) { 132 if (kscratch_mask) {
110 scratch_vcpu[0] = 31; 133 scratch_vcpu[0] = c0_kscratch();
111 scratch_vcpu[1] = ffs(kscratch_mask) - 1; 134 scratch_vcpu[1] = ffs(kscratch_mask) - 1;
112 kscratch_mask &= ~BIT(scratch_vcpu[1]); 135 kscratch_mask &= ~BIT(scratch_vcpu[1]);
113 } 136 }
114 137
115 /* Pick a scratch register to use as a temp for saving state */ 138 /* Pick a scratch register to use as a temp for saving state */
116 if (kscratch_mask) { 139 if (kscratch_mask) {
117 scratch_tmp[0] = 31; 140 scratch_tmp[0] = c0_kscratch();
118 scratch_tmp[1] = ffs(kscratch_mask) - 1; 141 scratch_tmp[1] = ffs(kscratch_mask) - 1;
119 kscratch_mask &= ~BIT(scratch_tmp[1]); 142 kscratch_mask &= ~BIT(scratch_tmp[1]);
120 } 143 }
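
A worked example of the scratch register selection above (numbers are illustrative): with cpu_data[0].kscratch_mask = 0xfc (KScratch selects 2-7 implemented) and pgd_reg = 2 already claimed by the host TLB handlers, the mask becomes 0xf8, so the VCPU pointer lands in CP0 register c0_kscratch() select 3 (ffs(0xf8) - 1) and the temporary scratch in select 4. On most cores c0_kscratch() is register 31; NetLogic XLP/XLR use register 22, which is why the hard-coded 31 had to go.
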
@@ -130,7 +153,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
130 UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); 153 UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
131 154
132 /* Save the temp scratch register value in cp0_cause of stack frame */ 155 /* Save the temp scratch register value in cp0_cause of stack frame */
133 if (scratch_tmp[0] == 31) { 156 if (scratch_tmp[0] == c0_kscratch()) {
134 UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); 157 UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
135 UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); 158 UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
136 } 159 }
@@ -146,7 +169,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
146 UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); 169 UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
147 UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); 170 UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
148 171
149 if (scratch_tmp[0] == 31) { 172 if (scratch_tmp[0] == c0_kscratch()) {
150 UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); 173 UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
151 UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); 174 UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
152 } 175 }
@@ -286,23 +309,26 @@ static void *kvm_mips_build_enter_guest(void *addr)
286 uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); 309 uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
287 uasm_i_xori(&p, T0, T0, KSU_USER); 310 uasm_i_xori(&p, T0, T0, KSU_USER);
288 uasm_il_bnez(&p, &r, T0, label_kernel_asid); 311 uasm_il_bnez(&p, &r, T0, label_kernel_asid);
289 UASM_i_ADDIU(&p, T1, K1, 312 UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
290 offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); 313 guest_kernel_mm.context.asid));
291 /* else user */ 314 /* else user */
292 UASM_i_ADDIU(&p, T1, K1, 315 UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
293 offsetof(struct kvm_vcpu_arch, guest_user_asid)); 316 guest_user_mm.context.asid));
294 uasm_l_kernel_asid(&l, p); 317 uasm_l_kernel_asid(&l, p);
295 318
296 /* t1: contains the base of the ASID array, need to get the cpu id */ 319 /* t1: contains the base of the ASID array, need to get the cpu id */
297 /* smp_processor_id */ 320 /* smp_processor_id */
298 uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); 321 uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
299 /* x4 */ 322 /* index the ASID array */
300 uasm_i_sll(&p, T2, T2, 2); 323 uasm_i_sll(&p, T2, T2, ilog2(sizeof(long)));
301 UASM_i_ADDU(&p, T3, T1, T2); 324 UASM_i_ADDU(&p, T3, T1, T2);
302 uasm_i_lw(&p, K0, 0, T3); 325 UASM_i_LW(&p, K0, 0, T3);
303#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE 326#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
304 /* x sizeof(struct cpuinfo_mips)/4 */ 327 /*
305 uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); 328 * reuse ASID array offset
329 * cpuinfo_mips is a multiple of sizeof(long)
330 */
331 uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long));
306 uasm_i_mul(&p, T2, T2, T3); 332 uasm_i_mul(&p, T2, T2, T3);
307 333
308 UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); 334 UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
@@ -312,7 +338,20 @@ static void *kvm_mips_build_enter_guest(void *addr)
312#else 338#else
313 uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); 339 uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
314#endif 340#endif
315 uasm_i_mtc0(&p, K0, C0_ENTRYHI); 341
342 /*
343 * Set up KVM T&E GVA pgd.
344 * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
345 * - call tlbmiss_handler_setup_pgd(mm->pgd)
346 * - but skips write into CP0_PWBase for now
347 */
348 UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) -
349 (int)offsetof(struct mm_struct, context.asid), T1);
350
351 UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
352 uasm_i_jalr(&p, RA, T9);
353 uasm_i_mtc0(&p, K0, C0_ENTRYHI);
354
316 uasm_i_ehb(&p); 355 uasm_i_ehb(&p);
317 356
318 /* Disable RDHWR access */ 357 /* Disable RDHWR access */
@@ -348,6 +387,80 @@ static void *kvm_mips_build_enter_guest(void *addr)
348} 387}
349 388
350/** 389/**
390 * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler.
391 * @addr: Address to start writing code.
392 * @handler: Address of common handler (within range of @addr).
393 *
394 * Assemble TLB refill exception fast path handler for guest execution.
395 *
396 * Returns: Next address after end of written function.
397 */
398void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler)
399{
400 u32 *p = addr;
401 struct uasm_label labels[2];
402 struct uasm_reloc relocs[2];
403 struct uasm_label *l = labels;
404 struct uasm_reloc *r = relocs;
405
406 memset(labels, 0, sizeof(labels));
407 memset(relocs, 0, sizeof(relocs));
408
409 /* Save guest k1 into scratch register */
410 UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
411
412 /* Get the VCPU pointer from the VCPU scratch register */
413 UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
414
415 /* Save guest k0 into VCPU structure */
416 UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
417
418 /*
419 * Some of the common tlbex code uses current_cpu_type(). For KVM we
420 * assume symmetry and just disable preemption to silence the warning.
421 */
422 preempt_disable();
423
424 /*
425 * Now for the actual refill bit. A lot of this can be common with the
426 * Linux TLB refill handler, however we don't need to handle so many
427 * cases. We only need to handle user mode refills, and user mode runs
428 * with 32-bit addressing.
429 *
430 * Therefore the branch to label_vmalloc generated by build_get_pmde64()
431 * that isn't resolved should never actually get taken and is harmless
432 * to leave in place for now.
433 */
434
435#ifdef CONFIG_64BIT
436 build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */
437#else
438 build_get_pgde32(&p, K0, K1); /* get pgd in K1 */
439#endif
440
441 /* we don't support huge pages yet */
442
443 build_get_ptep(&p, K0, K1);
444 build_update_entries(&p, K0, K1);
445 build_tlb_write_entry(&p, &l, &r, tlb_random);
446
447 preempt_enable();
448
449 /* Get the VCPU pointer from the VCPU scratch register again */
450 UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
451
452 /* Restore the guest's k0/k1 registers */
453 UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
454 uasm_i_ehb(&p);
455 UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
456
457 /* Jump to guest */
458 uasm_i_eret(&p);
459
460 return p;
461}
462
463/**
351 * kvm_mips_build_exception() - Assemble first level guest exception handler. 464 * kvm_mips_build_exception() - Assemble first level guest exception handler.
352 * @addr: Address to start writing code. 465 * @addr: Address to start writing code.
353 * @handler: Address of common handler (within range of @addr). 466 * @handler: Address of common handler (within range of @addr).
@@ -468,6 +581,18 @@ void *kvm_mips_build_exit(void *addr)
468 uasm_i_mfc0(&p, K0, C0_CAUSE); 581 uasm_i_mfc0(&p, K0, C0_CAUSE);
469 uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); 582 uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
470 583
584 if (cpu_has_badinstr) {
585 uasm_i_mfc0(&p, K0, C0_BADINSTR);
586 uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
587 host_cp0_badinstr), K1);
588 }
589
590 if (cpu_has_badinstrp) {
591 uasm_i_mfc0(&p, K0, C0_BADINSTRP);
592 uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
593 host_cp0_badinstrp), K1);
594 }
595
471 /* Now restore the host state just enough to run the handlers */ 596 /* Now restore the host state just enough to run the handlers */
472 597
473 /* Switch EBASE to the one used by Linux */ 598 /* Switch EBASE to the one used by Linux */
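
The two cpu_has_badinstr/cpu_has_badinstrp blocks above capture the faulting instruction word at exit time, which is what lets kvm_mips_handle_ri() in the emulate.c hunk earlier switch from kvm_get_inst() to kvm_get_badinstr(). A rough sketch of the fallback such a helper implies (the name and out-parameter signature match the caller shown above; the body itself is an assumption, not quoted from the patch):

int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
{
	if (cpu_has_badinstr) {
		/* Prefer the word latched by hardware in CP0_BadInstr */
		*out = vcpu->arch.host_cp0_badinstr;
		return 0;
	}
	/* Assumed fallback: fetch the instruction from guest memory */
	return kvm_get_inst(opc, vcpu, out);
}
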
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index e88403b3dcdd..aa0a1a00faf6 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
183 (exccode << CAUSEB_EXCCODE)); 183 (exccode << CAUSEB_EXCCODE));
184 184
185 /* XXXSL Set PC to the interrupt exception entry point */ 185 /* XXXSL Set PC to the interrupt exception entry point */
186 arch->pc = kvm_mips_guest_exception_base(vcpu);
186 if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV) 187 if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV)
187 arch->pc = KVM_GUEST_KSEG0 + 0x200; 188 arch->pc += 0x200;
188 else 189 else
189 arch->pc = KVM_GUEST_KSEG0 + 0x180; 190 arch->pc += 0x180;
190 191
191 clear_bit(priority, &vcpu->arch.pending_exceptions); 192 clear_bit(priority, &vcpu->arch.pending_exceptions);
192 } 193 }
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 29ec9ab3fd55..ed81e5ac1426 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -22,6 +22,7 @@
22#include <asm/page.h> 22#include <asm/page.h>
23#include <asm/cacheflush.h> 23#include <asm/cacheflush.h>
24#include <asm/mmu_context.h> 24#include <asm/mmu_context.h>
25#include <asm/pgalloc.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26 27
27#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
@@ -63,18 +64,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
63 {NULL} 64 {NULL}
64}; 65};
65 66
66static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu)
67{
68 int i;
69
70 for_each_possible_cpu(i) {
71 vcpu->arch.guest_kernel_asid[i] = 0;
72 vcpu->arch.guest_user_asid[i] = 0;
73 }
74
75 return 0;
76}
77
78/* 67/*
 79 * XXXKYMA: We are simulating a processor that has the WII bit set in 68 * XXXKYMA: We are simulating a processor that has the WII bit set in
 79 * XXXKYMA: We are simulating a processor that has the WII bit set in 68 * XXXKYMA: We are simulating a processor that has the WII bit set in
80 * Config7, so we are "runnable" if interrupts are pending 69 * Config7, so we are "runnable" if interrupts are pending
@@ -104,39 +93,12 @@ void kvm_arch_check_processor_compat(void *rtn)
104 *(int *)rtn = 0; 93 *(int *)rtn = 0;
105} 94}
106 95
107static void kvm_mips_init_tlbs(struct kvm *kvm)
108{
109 unsigned long wired;
110
111 /*
112 * Add a wired entry to the TLB, it is used to map the commpage to
113 * the Guest kernel
114 */
115 wired = read_c0_wired();
116 write_c0_wired(wired + 1);
117 mtc0_tlbw_hazard();
118 kvm->arch.commpage_tlb = wired;
119
120 kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(),
121 kvm->arch.commpage_tlb);
122}
123
124static void kvm_mips_init_vm_percpu(void *arg)
125{
126 struct kvm *kvm = (struct kvm *)arg;
127
128 kvm_mips_init_tlbs(kvm);
129 kvm_mips_callbacks->vm_init(kvm);
130
131}
132
133int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 96int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
134{ 97{
135 if (atomic_inc_return(&kvm_mips_instance) == 1) { 98 /* Allocate page table to map GPA -> RPA */
136 kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n", 99 kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
137 __func__); 100 if (!kvm->arch.gpa_mm.pgd)
138 on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1); 101 return -ENOMEM;
139 }
140 102
141 return 0; 103 return 0;
142} 104}
@@ -156,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
156 unsigned int i; 118 unsigned int i;
157 struct kvm_vcpu *vcpu; 119 struct kvm_vcpu *vcpu;
158 120
159 /* Put the pages we reserved for the guest pmap */
160 for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
161 if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
162 kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
163 }
164 kfree(kvm->arch.guest_pmap);
165
166 kvm_for_each_vcpu(i, vcpu, kvm) { 121 kvm_for_each_vcpu(i, vcpu, kvm) {
167 kvm_arch_vcpu_free(vcpu); 122 kvm_arch_vcpu_free(vcpu);
168 } 123 }
@@ -177,25 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
177 mutex_unlock(&kvm->lock); 132 mutex_unlock(&kvm->lock);
178} 133}
179 134
180static void kvm_mips_uninit_tlbs(void *arg) 135static void kvm_mips_free_gpa_pt(struct kvm *kvm)
181{ 136{
182 /* Restore wired count */ 137 /* It should always be safe to remove after flushing the whole range */
183 write_c0_wired(0); 138 WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0));
184 mtc0_tlbw_hazard(); 139 pgd_free(NULL, kvm->arch.gpa_mm.pgd);
185 /* Clear out all the TLBs */
186 kvm_local_flush_tlb_all();
187} 140}
188 141
189void kvm_arch_destroy_vm(struct kvm *kvm) 142void kvm_arch_destroy_vm(struct kvm *kvm)
190{ 143{
191 kvm_mips_free_vcpus(kvm); 144 kvm_mips_free_vcpus(kvm);
192 145 kvm_mips_free_gpa_pt(kvm);
193 /* If this is the last instance, restore wired count */
194 if (atomic_dec_return(&kvm_mips_instance) == 0) {
195 kvm_debug("%s: last KVM instance, restoring TLB parameters\n",
196 __func__);
197 on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1);
198 }
199} 146}
200 147
201long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, 148long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl,
@@ -210,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
210 return 0; 157 return 0;
211} 158}
212 159
160void kvm_arch_flush_shadow_all(struct kvm *kvm)
161{
162 /* Flush whole GPA */
163 kvm_mips_flush_gpa_pt(kvm, 0, ~0);
164
165 /* Let implementation do the rest */
166 kvm_mips_callbacks->flush_shadow_all(kvm);
167}
168
169void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
170 struct kvm_memory_slot *slot)
171{
172 /*
173 * The slot has been made invalid (ready for moving or deletion), so we
174 * need to ensure that it can no longer be accessed by any guest VCPUs.
175 */
176
177 spin_lock(&kvm->mmu_lock);
178 /* Flush slot from GPA */
179 kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
180 slot->base_gfn + slot->npages - 1);
181 /* Let implementation do the rest */
182 kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
183 spin_unlock(&kvm->mmu_lock);
184}
185
213int kvm_arch_prepare_memory_region(struct kvm *kvm, 186int kvm_arch_prepare_memory_region(struct kvm *kvm,
214 struct kvm_memory_slot *memslot, 187 struct kvm_memory_slot *memslot,
215 const struct kvm_userspace_memory_region *mem, 188 const struct kvm_userspace_memory_region *mem,
@@ -224,35 +197,32 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
224 const struct kvm_memory_slot *new, 197 const struct kvm_memory_slot *new,
225 enum kvm_mr_change change) 198 enum kvm_mr_change change)
226{ 199{
227 unsigned long npages = 0; 200 int needs_flush;
228 int i;
229 201
230 kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", 202 kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
231 __func__, kvm, mem->slot, mem->guest_phys_addr, 203 __func__, kvm, mem->slot, mem->guest_phys_addr,
232 mem->memory_size, mem->userspace_addr); 204 mem->memory_size, mem->userspace_addr);
233 205
234 /* Setup Guest PMAP table */ 206 /*
235 if (!kvm->arch.guest_pmap) { 207 * If dirty page logging is enabled, write protect all pages in the slot
236 if (mem->slot == 0) 208 * ready for dirty logging.
237 npages = mem->memory_size >> PAGE_SHIFT; 209 *
238 210 * There is no need to do this in any of the following cases:
239 if (npages) { 211 * CREATE: No dirty mappings will already exist.
240 kvm->arch.guest_pmap_npages = npages; 212 * MOVE/DELETE: The old mappings will already have been cleaned up by
241 kvm->arch.guest_pmap = 213 * kvm_arch_flush_shadow_memslot()
242 kzalloc(npages * sizeof(unsigned long), GFP_KERNEL); 214 */
243 215 if (change == KVM_MR_FLAGS_ONLY &&
244 if (!kvm->arch.guest_pmap) { 216 (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
245 kvm_err("Failed to allocate guest PMAP\n"); 217 new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
246 return; 218 spin_lock(&kvm->mmu_lock);
247 } 219 /* Write protect GPA page table entries */
248 220 needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
249 kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", 221 new->base_gfn + new->npages - 1);
250 npages, kvm->arch.guest_pmap); 222 /* Let implementation do the rest */
251 223 if (needs_flush)
252 /* Now setup the page table */ 224 kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
253 for (i = 0; i < npages; i++) 225 spin_unlock(&kvm->mmu_lock);
254 kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE;
255 }
256 } 226 }
257} 227}
258 228
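
The KVM_MR_FLAGS_ONLY branch above is what makes enabling dirty logging on an existing slot take effect: the GPA page table entries are made clean (write-protected) so the first guest write to each page faults and gets marked dirty. For reference, the generic userspace side that flips the flag re-registers the same slot with only the flags changed, roughly like this (vm_fd, mem and mem_size are assumed to exist; this is not MIPS-specific code from the patch):

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* enable dirty logging */
		.guest_phys_addr = 0,
		.memory_size = mem_size,
		.userspace_addr = (unsigned long)mem,
	};
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
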
@@ -276,7 +246,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end)
276struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) 246struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
277{ 247{
278 int err, size; 248 int err, size;
279 void *gebase, *p, *handler; 249 void *gebase, *p, *handler, *refill_start, *refill_end;
280 int i; 250 int i;
281 251
282 struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 252 struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -329,8 +299,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
329 /* Build guest exception vectors dynamically in unmapped memory */ 299 /* Build guest exception vectors dynamically in unmapped memory */
330 handler = gebase + 0x2000; 300 handler = gebase + 0x2000;
331 301
332 /* TLB Refill, EXL = 0 */ 302 /* TLB refill */
333 kvm_mips_build_exception(gebase, handler); 303 refill_start = gebase;
304 refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler);
334 305
335 /* General Exception Entry point */ 306 /* General Exception Entry point */
336 kvm_mips_build_exception(gebase + 0x180, handler); 307 kvm_mips_build_exception(gebase + 0x180, handler);
@@ -356,6 +327,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
356 pr_debug("#include <asm/regdef.h>\n"); 327 pr_debug("#include <asm/regdef.h>\n");
357 pr_debug("\n"); 328 pr_debug("\n");
358 dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); 329 dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
330 dump_handler("kvm_tlb_refill", refill_start, refill_end);
359 dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); 331 dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
360 dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); 332 dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
361 333
@@ -406,6 +378,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
406 378
407 kvm_mips_dump_stats(vcpu); 379 kvm_mips_dump_stats(vcpu);
408 380
381 kvm_mmu_free_memory_caches(vcpu);
409 kfree(vcpu->arch.guest_ebase); 382 kfree(vcpu->arch.guest_ebase);
410 kfree(vcpu->arch.kseg0_commpage); 383 kfree(vcpu->arch.kseg0_commpage);
411 kfree(vcpu); 384 kfree(vcpu);
@@ -422,37 +395,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
422 return -ENOIOCTLCMD; 395 return -ENOIOCTLCMD;
423} 396}
424 397
425/* Must be called with preemption disabled, just before entering guest */
426static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
427{
428 struct mips_coproc *cop0 = vcpu->arch.cop0;
429 int i, cpu = smp_processor_id();
430 unsigned int gasid;
431
432 /*
433 * Lazy host ASID regeneration for guest user mode.
434 * If the guest ASID has changed since the last guest usermode
435 * execution, regenerate the host ASID so as to invalidate stale TLB
436 * entries.
437 */
438 if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
439 gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
440 if (gasid != vcpu->arch.last_user_gasid) {
441 kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
442 vcpu);
443 vcpu->arch.guest_user_asid[cpu] =
444 vcpu->arch.guest_user_mm.context.asid[cpu];
445 for_each_possible_cpu(i)
446 if (i != cpu)
447 vcpu->arch.guest_user_asid[cpu] = 0;
448 vcpu->arch.last_user_gasid = gasid;
449 }
450 }
451}
452
453int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) 398int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
454{ 399{
455 int r = 0; 400 int r = -EINTR;
456 sigset_t sigsaved; 401 sigset_t sigsaved;
457 402
458 if (vcpu->sigset_active) 403 if (vcpu->sigset_active)
@@ -464,31 +409,30 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
464 vcpu->mmio_needed = 0; 409 vcpu->mmio_needed = 0;
465 } 410 }
466 411
412 if (run->immediate_exit)
413 goto out;
414
467 lose_fpu(1); 415 lose_fpu(1);
468 416
469 local_irq_disable(); 417 local_irq_disable();
470 /* Check if we have any exceptions/interrupts pending */
471 kvm_mips_deliver_interrupts(vcpu,
472 kvm_read_c0_guest_cause(vcpu->arch.cop0));
473
474 guest_enter_irqoff(); 418 guest_enter_irqoff();
475
476 /* Disable hardware page table walking while in guest */
477 htw_stop();
478
479 trace_kvm_enter(vcpu); 419 trace_kvm_enter(vcpu);
480 420
481 kvm_mips_check_asids(vcpu); 421 /*
482 422 * Make sure the read of VCPU requests in vcpu_run() callback is not
483 r = vcpu->arch.vcpu_run(run, vcpu); 423 * reordered ahead of the write to vcpu->mode, or we could miss a TLB
484 trace_kvm_out(vcpu); 424 * flush request while the requester sees the VCPU as outside of guest
425 * mode and not needing an IPI.
426 */
427 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
485 428
486 /* Re-enable HTW before enabling interrupts */ 429 r = kvm_mips_callbacks->vcpu_run(run, vcpu);
487 htw_start();
488 430
431 trace_kvm_out(vcpu);
489 guest_exit_irqoff(); 432 guest_exit_irqoff();
490 local_irq_enable(); 433 local_irq_enable();
491 434
435out:
492 if (vcpu->sigset_active) 436 if (vcpu->sigset_active)
493 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 437 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
494 438
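
Two things are worth noting in the kvm_arch_vcpu_ioctl_run() rewrite above: the new run->immediate_exit check implements the race-free exit from KVM_RUN without POSIX signals, and the smp_store_mb() on vcpu->mode publishes "entering guest" before the vcpu_run callback reads pending VCPU requests. From userspace, the immediate-exit path is used roughly as follows (vcpu_fd and the mmap'd run structure are assumed to be set up already; handle_kick() is a hypothetical placeholder):

	run->immediate_exit = 1;	/* ask KVM_RUN to bounce straight back */
	ret = ioctl(vcpu_fd, KVM_RUN, 0);
	if (ret < 0 && errno == EINTR)
		handle_kick();		/* returned without entering the guest */
	run->immediate_exit = 0;	/* clear before the next real KVM_RUN */
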
@@ -580,33 +524,6 @@ static u64 kvm_mips_get_one_regs[] = {
580 KVM_REG_MIPS_LO, 524 KVM_REG_MIPS_LO,
581#endif 525#endif
582 KVM_REG_MIPS_PC, 526 KVM_REG_MIPS_PC,
583
584 KVM_REG_MIPS_CP0_INDEX,
585 KVM_REG_MIPS_CP0_CONTEXT,
586 KVM_REG_MIPS_CP0_USERLOCAL,
587 KVM_REG_MIPS_CP0_PAGEMASK,
588 KVM_REG_MIPS_CP0_WIRED,
589 KVM_REG_MIPS_CP0_HWRENA,
590 KVM_REG_MIPS_CP0_BADVADDR,
591 KVM_REG_MIPS_CP0_COUNT,
592 KVM_REG_MIPS_CP0_ENTRYHI,
593 KVM_REG_MIPS_CP0_COMPARE,
594 KVM_REG_MIPS_CP0_STATUS,
595 KVM_REG_MIPS_CP0_CAUSE,
596 KVM_REG_MIPS_CP0_EPC,
597 KVM_REG_MIPS_CP0_PRID,
598 KVM_REG_MIPS_CP0_CONFIG,
599 KVM_REG_MIPS_CP0_CONFIG1,
600 KVM_REG_MIPS_CP0_CONFIG2,
601 KVM_REG_MIPS_CP0_CONFIG3,
602 KVM_REG_MIPS_CP0_CONFIG4,
603 KVM_REG_MIPS_CP0_CONFIG5,
604 KVM_REG_MIPS_CP0_CONFIG7,
605 KVM_REG_MIPS_CP0_ERROREPC,
606
607 KVM_REG_MIPS_COUNT_CTL,
608 KVM_REG_MIPS_COUNT_RESUME,
609 KVM_REG_MIPS_COUNT_HZ,
610}; 527};
611 528
612static u64 kvm_mips_get_one_regs_fpu[] = { 529static u64 kvm_mips_get_one_regs_fpu[] = {
@@ -619,15 +536,6 @@ static u64 kvm_mips_get_one_regs_msa[] = {
619 KVM_REG_MIPS_MSA_CSR, 536 KVM_REG_MIPS_MSA_CSR,
620}; 537};
621 538
622static u64 kvm_mips_get_one_regs_kscratch[] = {
623 KVM_REG_MIPS_CP0_KSCRATCH1,
624 KVM_REG_MIPS_CP0_KSCRATCH2,
625 KVM_REG_MIPS_CP0_KSCRATCH3,
626 KVM_REG_MIPS_CP0_KSCRATCH4,
627 KVM_REG_MIPS_CP0_KSCRATCH5,
628 KVM_REG_MIPS_CP0_KSCRATCH6,
629};
630
631static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) 539static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
632{ 540{
633 unsigned long ret; 541 unsigned long ret;
@@ -641,7 +549,6 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
641 } 549 }
642 if (kvm_mips_guest_can_have_msa(&vcpu->arch)) 550 if (kvm_mips_guest_can_have_msa(&vcpu->arch))
643 ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; 551 ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
644 ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
645 ret += kvm_mips_callbacks->num_regs(vcpu); 552 ret += kvm_mips_callbacks->num_regs(vcpu);
646 553
647 return ret; 554 return ret;
@@ -694,16 +601,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
694 } 601 }
695 } 602 }
696 603
697 for (i = 0; i < 6; ++i) {
698 if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
699 continue;
700
701 if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
702 sizeof(kvm_mips_get_one_regs_kscratch[i])))
703 return -EFAULT;
704 ++indices;
705 }
706
707 return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); 604 return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
708} 605}
709 606
@@ -794,95 +691,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
794 v = fpu->msacsr; 691 v = fpu->msacsr;
795 break; 692 break;
796 693
797 /* Co-processor 0 registers */
798 case KVM_REG_MIPS_CP0_INDEX:
799 v = (long)kvm_read_c0_guest_index(cop0);
800 break;
801 case KVM_REG_MIPS_CP0_CONTEXT:
802 v = (long)kvm_read_c0_guest_context(cop0);
803 break;
804 case KVM_REG_MIPS_CP0_USERLOCAL:
805 v = (long)kvm_read_c0_guest_userlocal(cop0);
806 break;
807 case KVM_REG_MIPS_CP0_PAGEMASK:
808 v = (long)kvm_read_c0_guest_pagemask(cop0);
809 break;
810 case KVM_REG_MIPS_CP0_WIRED:
811 v = (long)kvm_read_c0_guest_wired(cop0);
812 break;
813 case KVM_REG_MIPS_CP0_HWRENA:
814 v = (long)kvm_read_c0_guest_hwrena(cop0);
815 break;
816 case KVM_REG_MIPS_CP0_BADVADDR:
817 v = (long)kvm_read_c0_guest_badvaddr(cop0);
818 break;
819 case KVM_REG_MIPS_CP0_ENTRYHI:
820 v = (long)kvm_read_c0_guest_entryhi(cop0);
821 break;
822 case KVM_REG_MIPS_CP0_COMPARE:
823 v = (long)kvm_read_c0_guest_compare(cop0);
824 break;
825 case KVM_REG_MIPS_CP0_STATUS:
826 v = (long)kvm_read_c0_guest_status(cop0);
827 break;
828 case KVM_REG_MIPS_CP0_CAUSE:
829 v = (long)kvm_read_c0_guest_cause(cop0);
830 break;
831 case KVM_REG_MIPS_CP0_EPC:
832 v = (long)kvm_read_c0_guest_epc(cop0);
833 break;
834 case KVM_REG_MIPS_CP0_PRID:
835 v = (long)kvm_read_c0_guest_prid(cop0);
836 break;
837 case KVM_REG_MIPS_CP0_CONFIG:
838 v = (long)kvm_read_c0_guest_config(cop0);
839 break;
840 case KVM_REG_MIPS_CP0_CONFIG1:
841 v = (long)kvm_read_c0_guest_config1(cop0);
842 break;
843 case KVM_REG_MIPS_CP0_CONFIG2:
844 v = (long)kvm_read_c0_guest_config2(cop0);
845 break;
846 case KVM_REG_MIPS_CP0_CONFIG3:
847 v = (long)kvm_read_c0_guest_config3(cop0);
848 break;
849 case KVM_REG_MIPS_CP0_CONFIG4:
850 v = (long)kvm_read_c0_guest_config4(cop0);
851 break;
852 case KVM_REG_MIPS_CP0_CONFIG5:
853 v = (long)kvm_read_c0_guest_config5(cop0);
854 break;
855 case KVM_REG_MIPS_CP0_CONFIG7:
856 v = (long)kvm_read_c0_guest_config7(cop0);
857 break;
858 case KVM_REG_MIPS_CP0_ERROREPC:
859 v = (long)kvm_read_c0_guest_errorepc(cop0);
860 break;
861 case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
862 idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
863 if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
864 return -EINVAL;
865 switch (idx) {
866 case 2:
867 v = (long)kvm_read_c0_guest_kscratch1(cop0);
868 break;
869 case 3:
870 v = (long)kvm_read_c0_guest_kscratch2(cop0);
871 break;
872 case 4:
873 v = (long)kvm_read_c0_guest_kscratch3(cop0);
874 break;
875 case 5:
876 v = (long)kvm_read_c0_guest_kscratch4(cop0);
877 break;
878 case 6:
879 v = (long)kvm_read_c0_guest_kscratch5(cop0);
880 break;
881 case 7:
882 v = (long)kvm_read_c0_guest_kscratch6(cop0);
883 break;
884 }
885 break;
886 /* registers to be handled specially */ 694 /* registers to be handled specially */
887 default: 695 default:
888 ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); 696 ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
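
The large block of CP0 cases removed here (and the matching block removed from kvm_mips_set_reg() in the next hunk) is not meant as a userspace-visible change: those register IDs now fall through to the "registers to be handled specially" default, where the kvm_mips_callbacks implementation takes over, so the ONE_REG interface keeps working as before. For example, reading the guest Status register from userspace is still (vcpu_fd assumed):

	uint64_t status;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_MIPS_CP0_STATUS,
		.addr = (unsigned long)&status,
	};
	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
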
@@ -1014,68 +822,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
1014 fpu->msacsr = v; 822 fpu->msacsr = v;
1015 break; 823 break;
1016 824
1017 /* Co-processor 0 registers */
1018 case KVM_REG_MIPS_CP0_INDEX:
1019 kvm_write_c0_guest_index(cop0, v);
1020 break;
1021 case KVM_REG_MIPS_CP0_CONTEXT:
1022 kvm_write_c0_guest_context(cop0, v);
1023 break;
1024 case KVM_REG_MIPS_CP0_USERLOCAL:
1025 kvm_write_c0_guest_userlocal(cop0, v);
1026 break;
1027 case KVM_REG_MIPS_CP0_PAGEMASK:
1028 kvm_write_c0_guest_pagemask(cop0, v);
1029 break;
1030 case KVM_REG_MIPS_CP0_WIRED:
1031 kvm_write_c0_guest_wired(cop0, v);
1032 break;
1033 case KVM_REG_MIPS_CP0_HWRENA:
1034 kvm_write_c0_guest_hwrena(cop0, v);
1035 break;
1036 case KVM_REG_MIPS_CP0_BADVADDR:
1037 kvm_write_c0_guest_badvaddr(cop0, v);
1038 break;
1039 case KVM_REG_MIPS_CP0_ENTRYHI:
1040 kvm_write_c0_guest_entryhi(cop0, v);
1041 break;
1042 case KVM_REG_MIPS_CP0_STATUS:
1043 kvm_write_c0_guest_status(cop0, v);
1044 break;
1045 case KVM_REG_MIPS_CP0_EPC:
1046 kvm_write_c0_guest_epc(cop0, v);
1047 break;
1048 case KVM_REG_MIPS_CP0_PRID:
1049 kvm_write_c0_guest_prid(cop0, v);
1050 break;
1051 case KVM_REG_MIPS_CP0_ERROREPC:
1052 kvm_write_c0_guest_errorepc(cop0, v);
1053 break;
1054 case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
1055 idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
1056 if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
1057 return -EINVAL;
1058 switch (idx) {
1059 case 2:
1060 kvm_write_c0_guest_kscratch1(cop0, v);
1061 break;
1062 case 3:
1063 kvm_write_c0_guest_kscratch2(cop0, v);
1064 break;
1065 case 4:
1066 kvm_write_c0_guest_kscratch3(cop0, v);
1067 break;
1068 case 5:
1069 kvm_write_c0_guest_kscratch4(cop0, v);
1070 break;
1071 case 6:
1072 kvm_write_c0_guest_kscratch5(cop0, v);
1073 break;
1074 case 7:
1075 kvm_write_c0_guest_kscratch6(cop0, v);
1076 break;
1077 }
1078 break;
1079 /* registers to be handled specially */ 825 /* registers to be handled specially */
1080 default: 826 default:
1081 return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); 827 return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
@@ -1144,18 +890,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
1144 return -E2BIG; 890 return -E2BIG;
1145 return kvm_mips_copy_reg_indices(vcpu, user_list->reg); 891 return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
1146 } 892 }
1147 case KVM_NMI:
1148 /* Treat the NMI as a CPU reset */
1149 r = kvm_mips_reset_vcpu(vcpu);
1150 break;
1151 case KVM_INTERRUPT: 893 case KVM_INTERRUPT:
1152 { 894 {
1153 struct kvm_mips_interrupt irq; 895 struct kvm_mips_interrupt irq;
1154 896
1155 r = -EFAULT;
1156 if (copy_from_user(&irq, argp, sizeof(irq))) 897 if (copy_from_user(&irq, argp, sizeof(irq)))
1157 goto out; 898 return -EFAULT;
1158
1159 kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, 899 kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
1160 irq.irq); 900 irq.irq);
1161 901
@@ -1165,56 +905,57 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
1165 case KVM_ENABLE_CAP: { 905 case KVM_ENABLE_CAP: {
1166 struct kvm_enable_cap cap; 906 struct kvm_enable_cap cap;
1167 907
1168 r = -EFAULT;
1169 if (copy_from_user(&cap, argp, sizeof(cap))) 908 if (copy_from_user(&cap, argp, sizeof(cap)))
1170 goto out; 909 return -EFAULT;
1171 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); 910 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
1172 break; 911 break;
1173 } 912 }
1174 default: 913 default:
1175 r = -ENOIOCTLCMD; 914 r = -ENOIOCTLCMD;
1176 } 915 }
1177
1178out:
1179 return r; 916 return r;
1180} 917}
1181 918
1182/* Get (and clear) the dirty memory log for a memory slot. */ 919/**
920 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
921 * @kvm: kvm instance
922 * @log: slot id and address to which we copy the log
923 *
924 * Steps 1-4 below provide general overview of dirty page logging. See
925 * kvm_get_dirty_log_protect() function description for additional details.
926 *
927 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
928 * always flush the TLB (step 4) even if previous step failed and the dirty
929 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
930 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
931 * writes will be marked dirty for next log read.
932 *
933 * 1. Take a snapshot of the bit and clear it if needed.
934 * 2. Write protect the corresponding page.
935 * 3. Copy the snapshot to the userspace.
936 * 4. Flush TLB's if needed.
937 */
1183int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) 938int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1184{ 939{
1185 struct kvm_memslots *slots; 940 struct kvm_memslots *slots;
1186 struct kvm_memory_slot *memslot; 941 struct kvm_memory_slot *memslot;
1187 unsigned long ga, ga_end; 942 bool is_dirty = false;
1188 int is_dirty = 0;
1189 int r; 943 int r;
1190 unsigned long n;
1191 944
1192 mutex_lock(&kvm->slots_lock); 945 mutex_lock(&kvm->slots_lock);
1193 946
1194 r = kvm_get_dirty_log(kvm, log, &is_dirty); 947 r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
1195 if (r)
1196 goto out;
1197 948
1198 /* If nothing is dirty, don't bother messing with page tables. */
1199 if (is_dirty) { 949 if (is_dirty) {
1200 slots = kvm_memslots(kvm); 950 slots = kvm_memslots(kvm);
1201 memslot = id_to_memslot(slots, log->slot); 951 memslot = id_to_memslot(slots, log->slot);
1202 952
1203 ga = memslot->base_gfn << PAGE_SHIFT; 953 /* Let implementation handle TLB/GVA invalidation */
1204 ga_end = ga + (memslot->npages << PAGE_SHIFT); 954 kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
1205
1206 kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga,
1207 ga_end);
1208
1209 n = kvm_dirty_bitmap_bytes(memslot);
1210 memset(memslot->dirty_bitmap, 0, n);
1211 } 955 }
1212 956
1213 r = 0;
1214out:
1215 mutex_unlock(&kvm->slots_lock); 957 mutex_unlock(&kvm->slots_lock);
1216 return r; 958 return r;
1217
1218} 959}
1219 960
1220long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) 961long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
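
With kvm_get_dirty_log_protect() doing the heavy lifting, the MIPS dirty-log ioctl above now behaves like the other architectures: the snapshot is copied out, the logged pages are write-protected again, and the implementation flushes any derived GVA/TLB mappings. The userspace call is unchanged (vm_fd and a suitably sized bitmap are assumed):

	struct kvm_dirty_log log = {
		.slot = 0,
		.dirty_bitmap = bitmap,	/* one bit per page in the slot */
	};
	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
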
@@ -1282,11 +1023,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
1282 switch (ext) { 1023 switch (ext) {
1283 case KVM_CAP_ONE_REG: 1024 case KVM_CAP_ONE_REG:
1284 case KVM_CAP_ENABLE_CAP: 1025 case KVM_CAP_ENABLE_CAP:
1026 case KVM_CAP_READONLY_MEM:
1027 case KVM_CAP_SYNC_MMU:
1028 case KVM_CAP_IMMEDIATE_EXIT:
1285 r = 1; 1029 r = 1;
1286 break; 1030 break;
1287 case KVM_CAP_COALESCED_MMIO: 1031 case KVM_CAP_COALESCED_MMIO:
1288 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 1032 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1289 break; 1033 break;
1034 case KVM_CAP_NR_VCPUS:
1035 r = num_online_cpus();
1036 break;
1037 case KVM_CAP_MAX_VCPUS:
1038 r = KVM_MAX_VCPUS;
1039 break;
1290 case KVM_CAP_MIPS_FPU: 1040 case KVM_CAP_MIPS_FPU:
1291 /* We don't handle systems with inconsistent cpu_has_fpu */ 1041 /* We don't handle systems with inconsistent cpu_has_fpu */
1292 r = !!raw_cpu_has_fpu; 1042 r = !!raw_cpu_has_fpu;
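
The new capability returns above are what userspace probes before relying on the corresponding features; KVM_CAP_READONLY_MEM in particular is what allows marking a slot read-only so guest writes to it exit as MMIO. A minimal probe looks like this (kvm_fd is assumed to be an open /dev/kvm descriptor, and region refers to a kvm_userspace_memory_region as in the earlier memslot sketch):

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_READONLY_MEM) > 0)
		region.flags |= KVM_MEM_READONLY;	/* writes exit as MMIO */
	ncpus = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
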
@@ -1400,13 +1150,23 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
1400 1150
1401int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 1151int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1402{ 1152{
1403 kvm_mips_callbacks->vcpu_init(vcpu); 1153 int err;
1154
1155 err = kvm_mips_callbacks->vcpu_init(vcpu);
1156 if (err)
1157 return err;
1158
1404 hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, 1159 hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
1405 HRTIMER_MODE_REL); 1160 HRTIMER_MODE_REL);
1406 vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; 1161 vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
1407 return 0; 1162 return 0;
1408} 1163}
1409 1164
1165void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
1166{
1167 kvm_mips_callbacks->vcpu_uninit(vcpu);
1168}
1169
1410int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, 1170int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
1411 struct kvm_translation *tr) 1171 struct kvm_translation *tr)
1412{ 1172{
@@ -1440,8 +1200,11 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1440 u32 __user *opc = (u32 __user *) vcpu->arch.pc; 1200 u32 __user *opc = (u32 __user *) vcpu->arch.pc;
1441 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; 1201 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
1442 enum emulation_result er = EMULATE_DONE; 1202 enum emulation_result er = EMULATE_DONE;
1203 u32 inst;
1443 int ret = RESUME_GUEST; 1204 int ret = RESUME_GUEST;
1444 1205
1206 vcpu->mode = OUTSIDE_GUEST_MODE;
1207
1445 /* re-enable HTW before enabling interrupts */ 1208 /* re-enable HTW before enabling interrupts */
1446 htw_start(); 1209 htw_start();
1447 1210
@@ -1564,8 +1327,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1564 break; 1327 break;
1565 1328
1566 default: 1329 default:
1330 if (cause & CAUSEF_BD)
1331 opc += 1;
1332 inst = 0;
1333 kvm_get_badinstr(opc, vcpu, &inst);
1567 kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", 1334 kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n",
1568 exccode, opc, kvm_get_inst(opc, vcpu), badvaddr, 1335 exccode, opc, inst, badvaddr,
1569 kvm_read_c0_guest_status(vcpu->arch.cop0)); 1336 kvm_read_c0_guest_status(vcpu->arch.cop0));
1570 kvm_arch_vcpu_dump_regs(vcpu); 1337 kvm_arch_vcpu_dump_regs(vcpu);
1571 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 1338 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1593,7 +1360,15 @@ skip_emul:
1593 if (ret == RESUME_GUEST) { 1360 if (ret == RESUME_GUEST) {
1594 trace_kvm_reenter(vcpu); 1361 trace_kvm_reenter(vcpu);
1595 1362
1596 kvm_mips_check_asids(vcpu); 1363 /*
1364 * Make sure the read of VCPU requests in vcpu_reenter()
1365 * callback is not reordered ahead of the write to vcpu->mode,
1366 * or we could miss a TLB flush request while the requester sees
1367 * the VCPU as outside of guest mode and not needing an IPI.
1368 */
1369 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
1370
1371 kvm_mips_callbacks->vcpu_reenter(run, vcpu);
1597 1372
1598 /* 1373 /*
1599 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context 1374 * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
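
The smp_store_mb() added before vcpu_reenter() (and the matching one before vcpu_run() earlier) pairs with the requester side of VCPU requests: whoever queues a request must then observe whether the target is in (or entering) guest mode and kick it if so. Roughly, and heavily simplified from the generic KVM pattern rather than quoted from this patch:

	/* Requester side (sketch): */
	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);	/* publish the request */
	smp_mb__after_atomic();				/* order vs. mode read */
	if (READ_ONCE(vcpu->mode) == IN_GUEST_MODE)
		kvm_vcpu_kick(vcpu);			/* force a guest exit */
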
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 3b677c851be0..cb0faade311e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -11,86 +11,995 @@
11 11
12#include <linux/highmem.h> 12#include <linux/highmem.h>
13#include <linux/kvm_host.h> 13#include <linux/kvm_host.h>
14#include <linux/uaccess.h>
14#include <asm/mmu_context.h> 15#include <asm/mmu_context.h>
16#include <asm/pgalloc.h>
15 17
16static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) 18/*
19 * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
20 * for which pages need to be cached.
21 */
22#if defined(__PAGETABLE_PMD_FOLDED)
23#define KVM_MMU_CACHE_MIN_PAGES 1
24#else
25#define KVM_MMU_CACHE_MIN_PAGES 2
26#endif
27
28static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
29 int min, int max)
17{ 30{
18 int cpu = smp_processor_id(); 31 void *page;
32
33 BUG_ON(max > KVM_NR_MEM_OBJS);
34 if (cache->nobjs >= min)
35 return 0;
36 while (cache->nobjs < max) {
37 page = (void *)__get_free_page(GFP_KERNEL);
38 if (!page)
39 return -ENOMEM;
40 cache->objects[cache->nobjs++] = page;
41 }
42 return 0;
43}
19 44
20 return vcpu->arch.guest_kernel_asid[cpu] & 45static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
21 cpu_asid_mask(&cpu_data[cpu]); 46{
47 while (mc->nobjs)
48 free_page((unsigned long)mc->objects[--mc->nobjs]);
22} 49}
23 50
24static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) 51static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
25{ 52{
26 int cpu = smp_processor_id(); 53 void *p;
27 54
28 return vcpu->arch.guest_user_asid[cpu] & 55 BUG_ON(!mc || !mc->nobjs);
29 cpu_asid_mask(&cpu_data[cpu]); 56 p = mc->objects[--mc->nobjs];
57 return p;
30} 58}
31 59
32static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) 60void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
33{ 61{
34 int srcu_idx, err = 0; 62 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
35 kvm_pfn_t pfn; 63}
64
65/**
66 * kvm_pgd_init() - Initialise KVM GPA page directory.
67 * @page: Pointer to page directory (PGD) for KVM GPA.
68 *
69 * Initialise a KVM GPA page directory with pointers to the invalid table, i.e.
70 * representing no mappings. This is similar to pgd_init(), however it
71 * initialises all the page directory pointers, not just the ones corresponding
72 * to the userland address space (since it is for the guest physical address
73 * space rather than a virtual address space).
74 */
75static void kvm_pgd_init(void *page)
76{
77 unsigned long *p, *end;
78 unsigned long entry;
79
80#ifdef __PAGETABLE_PMD_FOLDED
81 entry = (unsigned long)invalid_pte_table;
82#else
83 entry = (unsigned long)invalid_pmd_table;
84#endif
85
86 p = (unsigned long *)page;
87 end = p + PTRS_PER_PGD;
88
89 do {
90 p[0] = entry;
91 p[1] = entry;
92 p[2] = entry;
93 p[3] = entry;
94 p[4] = entry;
95 p += 8;
96 p[-3] = entry;
97 p[-2] = entry;
98 p[-1] = entry;
99 } while (p != end);
100}
101
102/**
103 * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
104 *
105 * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
106 * to host physical page mappings.
107 *
108 * Returns: Pointer to new KVM GPA page directory.
109 * NULL on allocation failure.
110 */
111pgd_t *kvm_pgd_alloc(void)
112{
113 pgd_t *ret;
114
115 ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER);
116 if (ret)
117 kvm_pgd_init(ret);
118
119 return ret;
120}
121
122/**
123 * kvm_mips_walk_pgd() - Walk page table with optional allocation.
124 * @pgd: Page directory pointer.
125 * @addr: Address to index page table using.
126 * @cache: MMU page cache to allocate new page tables from, or NULL.
127 *
128 * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
129 * address @addr. If page tables don't exist for @addr, they will be created
130 * from the MMU cache if @cache is not NULL.
131 *
132 * Returns: Pointer to pte_t corresponding to @addr.
133 * NULL if a page table doesn't exist for @addr and !@cache.
134 * NULL if a page table allocation failed.
135 */
136static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
137 unsigned long addr)
138{
139 pud_t *pud;
140 pmd_t *pmd;
141
142 pgd += pgd_index(addr);
143 if (pgd_none(*pgd)) {
144 /* Not used on MIPS yet */
145 BUG();
146 return NULL;
147 }
148 pud = pud_offset(pgd, addr);
149 if (pud_none(*pud)) {
150 pmd_t *new_pmd;
151
152 if (!cache)
153 return NULL;
154 new_pmd = mmu_memory_cache_alloc(cache);
155 pmd_init((unsigned long)new_pmd,
156 (unsigned long)invalid_pte_table);
157 pud_populate(NULL, pud, new_pmd);
158 }
159 pmd = pmd_offset(pud, addr);
160 if (pmd_none(*pmd)) {
161 pte_t *new_pte;
162
163 if (!cache)
164 return NULL;
165 new_pte = mmu_memory_cache_alloc(cache);
166 clear_page(new_pte);
167 pmd_populate_kernel(NULL, pmd, new_pte);
168 }
169 return pte_offset(pmd, addr);
170}
171
172/* Caller must hold kvm->mm_lock */
173static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm,
174 struct kvm_mmu_memory_cache *cache,
175 unsigned long addr)
176{
177 return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
178}
179
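
/*
 * Sketch of the expected usage, not code from this patch: the walker above
 * allocates missing levels from the supplied kvm_mmu_memory_cache, so the
 * cache must be topped up *before* kvm->mmu_lock is taken, since no sleeping
 * allocation is possible under the spinlock.  Fault-handling details elided.
 */
static int example_handle_gpa_fault(struct kvm_vcpu *vcpu, unsigned long gpa)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_page_cache;
	pte_t *ptep;
	int err;

	/* May sleep: do this before taking mmu_lock */
	err = mmu_topup_memory_cache(cache, KVM_MMU_CACHE_MIN_PAGES,
				     KVM_NR_MEM_OBJS);
	if (err)
		return err;

	spin_lock(&kvm->mmu_lock);
	ptep = kvm_mips_pte_for_gpa(kvm, cache, gpa);
	/* ... install the host pfn into *ptep while still holding the lock ... */
	spin_unlock(&kvm->mmu_lock);

	return 0;
}
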
180/*
181 * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}.
182 * Flush a range of guest physical address space from the VM's GPA page tables.
183 */
184
185static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
186 unsigned long end_gpa)
187{
188 int i_min = __pte_offset(start_gpa);
189 int i_max = __pte_offset(end_gpa);
190 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
191 int i;
192
193 for (i = i_min; i <= i_max; ++i) {
194 if (!pte_present(pte[i]))
195 continue;
196
197 set_pte(pte + i, __pte(0));
198 }
199 return safe_to_remove;
200}
201
202static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
203 unsigned long end_gpa)
204{
205 pte_t *pte;
206 unsigned long end = ~0ul;
207 int i_min = __pmd_offset(start_gpa);
208 int i_max = __pmd_offset(end_gpa);
209 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
210 int i;
211
212 for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
213 if (!pmd_present(pmd[i]))
214 continue;
215
216 pte = pte_offset(pmd + i, 0);
217 if (i == i_max)
218 end = end_gpa;
219
220 if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) {
221 pmd_clear(pmd + i);
222 pte_free_kernel(NULL, pte);
223 } else {
224 safe_to_remove = false;
225 }
226 }
227 return safe_to_remove;
228}
229
230static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
231 unsigned long end_gpa)
232{
233 pmd_t *pmd;
234 unsigned long end = ~0ul;
235 int i_min = __pud_offset(start_gpa);
236 int i_max = __pud_offset(end_gpa);
237 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
238 int i;
239
240 for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
241 if (!pud_present(pud[i]))
242 continue;
243
244 pmd = pmd_offset(pud + i, 0);
245 if (i == i_max)
246 end = end_gpa;
247
248 if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) {
249 pud_clear(pud + i);
250 pmd_free(NULL, pmd);
251 } else {
252 safe_to_remove = false;
253 }
254 }
255 return safe_to_remove;
256}
257
258static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
259 unsigned long end_gpa)
260{
261 pud_t *pud;
262 unsigned long end = ~0ul;
263 int i_min = pgd_index(start_gpa);
264 int i_max = pgd_index(end_gpa);
265 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
266 int i;
267
268 for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
269 if (!pgd_present(pgd[i]))
270 continue;
271
272 pud = pud_offset(pgd + i, 0);
273 if (i == i_max)
274 end = end_gpa;
275
276 if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) {
277 pgd_clear(pgd + i);
278 pud_free(NULL, pud);
279 } else {
280 safe_to_remove = false;
281 }
282 }
283 return safe_to_remove;
284}
285
286/**
287 * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses.
288 * @kvm: KVM pointer.
289 * @start_gfn: Guest frame number of first page in GPA range to flush.
290 * @end_gfn: Guest frame number of last page in GPA range to flush.
291 *
292 * Flushes a range of GPA mappings from the GPA page tables.
293 *
294 * The caller must hold the @kvm->mmu_lock spinlock.
295 *
 296 * Returns:	Whether it's safe to remove the top-level page directory because
297 * all lower levels have been removed.
298 */
299bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
300{
301 return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
302 start_gfn << PAGE_SHIFT,
303 end_gfn << PAGE_SHIFT);
304}
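/*
 * Illustrative sketch, not part of the patch: flushing every GPA mapping
 * backing a memory slot with the helper above; both gfn bounds are inclusive
 * at page granularity. The function name is hypothetical and the caller is
 * assumed to hold kvm->mmu_lock.
 */
static void example_flush_memslot_gpa(struct kvm *kvm,
				      struct kvm_memory_slot *slot)
{
	kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
			      slot->base_gfn + slot->npages - 1);
}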
305
306#define BUILD_PTE_RANGE_OP(name, op) \
307static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start, \
308 unsigned long end) \
309{ \
310 int ret = 0; \
311 int i_min = __pte_offset(start); \
312 int i_max = __pte_offset(end); \
313 int i; \
314 pte_t old, new; \
315 \
316 for (i = i_min; i <= i_max; ++i) { \
317 if (!pte_present(pte[i])) \
318 continue; \
319 \
320 old = pte[i]; \
321 new = op(old); \
322 if (pte_val(new) == pte_val(old)) \
323 continue; \
324 set_pte(pte + i, new); \
325 ret = 1; \
326 } \
327 return ret; \
328} \
329 \
330/* returns true if anything was done */ \
331static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start, \
332 unsigned long end) \
333{ \
334 int ret = 0; \
335 pte_t *pte; \
336 unsigned long cur_end = ~0ul; \
337 int i_min = __pmd_offset(start); \
338 int i_max = __pmd_offset(end); \
339 int i; \
340 \
341 for (i = i_min; i <= i_max; ++i, start = 0) { \
342 if (!pmd_present(pmd[i])) \
343 continue; \
344 \
345 pte = pte_offset(pmd + i, 0); \
346 if (i == i_max) \
347 cur_end = end; \
348 \
349 ret |= kvm_mips_##name##_pte(pte, start, cur_end); \
350 } \
351 return ret; \
352} \
353 \
354static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start, \
355 unsigned long end) \
356{ \
357 int ret = 0; \
358 pmd_t *pmd; \
359 unsigned long cur_end = ~0ul; \
360 int i_min = __pud_offset(start); \
361 int i_max = __pud_offset(end); \
362 int i; \
363 \
364 for (i = i_min; i <= i_max; ++i, start = 0) { \
365 if (!pud_present(pud[i])) \
366 continue; \
367 \
368 pmd = pmd_offset(pud + i, 0); \
369 if (i == i_max) \
370 cur_end = end; \
371 \
372 ret |= kvm_mips_##name##_pmd(pmd, start, cur_end); \
373 } \
374 return ret; \
375} \
376 \
377static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start, \
378 unsigned long end) \
379{ \
380 int ret = 0; \
381 pud_t *pud; \
382 unsigned long cur_end = ~0ul; \
383 int i_min = pgd_index(start); \
384 int i_max = pgd_index(end); \
385 int i; \
386 \
387 for (i = i_min; i <= i_max; ++i, start = 0) { \
388 if (!pgd_present(pgd[i])) \
389 continue; \
390 \
391 pud = pud_offset(pgd + i, 0); \
392 if (i == i_max) \
393 cur_end = end; \
394 \
395 ret |= kvm_mips_##name##_pud(pud, start, cur_end); \
396 } \
397 return ret; \
398}
399
400/*
401 * kvm_mips_mkclean_gpa_pt.
402 * Mark a range of guest physical address space clean (writes fault) in the VM's
403 * GPA page table to allow dirty page tracking.
404 */
36 405
 37 	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)	 406 BUILD_PTE_RANGE_OP(mkclean, pte_mkclean)
407
408/**
409 * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
410 * @kvm: KVM pointer.
411 * @start_gfn: Guest frame number of first page in GPA range to flush.
412 * @end_gfn: Guest frame number of last page in GPA range to flush.
413 *
414 * Make a range of GPA mappings clean so that guest writes will fault and
415 * trigger dirty page logging.
416 *
417 * The caller must hold the @kvm->mmu_lock spinlock.
418 *
419 * Returns: Whether any GPA mappings were modified, which would require
 420 *		derived mappings (GVA page tables & TLB entries) to be
421 * invalidated.
422 */
423int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
424{
425 return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd,
426 start_gfn << PAGE_SHIFT,
427 end_gfn << PAGE_SHIFT);
428}
429
430/**
431 * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
432 * @kvm: The KVM pointer
433 * @slot: The memory slot associated with mask
434 * @gfn_offset: The gfn offset in memory slot
435 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
436 * slot to be write protected
437 *
 438 * Walks the bits set in @mask and write-protects the associated PTEs. The
 439 * caller must acquire @kvm->mmu_lock.
440 */
441void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
442 struct kvm_memory_slot *slot,
443 gfn_t gfn_offset, unsigned long mask)
444{
445 gfn_t base_gfn = slot->base_gfn + gfn_offset;
446 gfn_t start = base_gfn + __ffs(mask);
447 gfn_t end = base_gfn + __fls(mask);
448
449 kvm_mips_mkclean_gpa_pt(kvm, start, end);
450}
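/*
 * Worked example: with gfn_offset == 0 and mask == 0x3c (bits 2..5 set),
 * __ffs(mask) == 2 and __fls(mask) == 5, so gfns base_gfn + 2 through
 * base_gfn + 5 are made clean. Any clear bits between the lowest and highest
 * set bit are swept along too, since only the extremes of @mask are used.
 */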
451
452/*
453 * kvm_mips_mkold_gpa_pt.
454 * Mark a range of guest physical address space old (all accesses fault) in the
455 * VM's GPA page table to allow detection of commonly used pages.
456 */
457
458BUILD_PTE_RANGE_OP(mkold, pte_mkold)
459
460static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
461 gfn_t end_gfn)
462{
463 return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd,
464 start_gfn << PAGE_SHIFT,
465 end_gfn << PAGE_SHIFT);
466}
467
468static int handle_hva_to_gpa(struct kvm *kvm,
469 unsigned long start,
470 unsigned long end,
471 int (*handler)(struct kvm *kvm, gfn_t gfn,
472 gpa_t gfn_end,
473 struct kvm_memory_slot *memslot,
474 void *data),
475 void *data)
476{
477 struct kvm_memslots *slots;
478 struct kvm_memory_slot *memslot;
479 int ret = 0;
480
481 slots = kvm_memslots(kvm);
482
483 /* we only care about the pages that the guest sees */
484 kvm_for_each_memslot(memslot, slots) {
485 unsigned long hva_start, hva_end;
486 gfn_t gfn, gfn_end;
487
488 hva_start = max(start, memslot->userspace_addr);
489 hva_end = min(end, memslot->userspace_addr +
490 (memslot->npages << PAGE_SHIFT));
491 if (hva_start >= hva_end)
492 continue;
493
494 /*
495 * {gfn(page) | page intersects with [hva_start, hva_end)} =
496 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
497 */
498 gfn = hva_to_gfn_memslot(hva_start, memslot);
499 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
500
501 ret |= handler(kvm, gfn, gfn_end, memslot, data);
502 }
503
504 return ret;
505}
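/*
 * Illustrative sketch, not part of the patch: the shape of a handler passed
 * to handle_hva_to_gpa(). It runs once per memslot intersecting the HVA
 * range, with the matching gfn bounds and an opaque @data cookie. The
 * function name and the counter cookie are hypothetical.
 */
static int example_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
			   struct kvm_memory_slot *memslot, void *data)
{
	unsigned long *counter = data;

	*counter += gfn_end - gfn;	/* size of the clipped range */
	return 0;	/* return values are OR'd together by handle_hva_to_gpa() */
}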
506
507
508static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
509 struct kvm_memory_slot *memslot, void *data)
510{
511 kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
512 return 1;
513}
514
515int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
516{
517 unsigned long end = hva + PAGE_SIZE;
518
519 handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
520
521 kvm_mips_callbacks->flush_shadow_all(kvm);
522 return 0;
523}
524
525int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
526{
527 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
528
529 kvm_mips_callbacks->flush_shadow_all(kvm);
530 return 0;
531}
532
533static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
534 struct kvm_memory_slot *memslot, void *data)
535{
536 gpa_t gpa = gfn << PAGE_SHIFT;
537 pte_t hva_pte = *(pte_t *)data;
538 pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
539 pte_t old_pte;
540
541 if (!gpa_pte)
542 return 0;
543
544 /* Mapping may need adjusting depending on memslot flags */
545 old_pte = *gpa_pte;
546 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
547 hva_pte = pte_mkclean(hva_pte);
548 else if (memslot->flags & KVM_MEM_READONLY)
549 hva_pte = pte_wrprotect(hva_pte);
550
551 set_pte(gpa_pte, hva_pte);
552
553 /* Replacing an absent or old page doesn't need flushes */
554 if (!pte_present(old_pte) || !pte_young(old_pte))
38 return 0; 555 return 0;
39 556
557 /* Pages swapped, aged, moved, or cleaned require flushes */
558 return !pte_present(hva_pte) ||
559 !pte_young(hva_pte) ||
560 pte_pfn(old_pte) != pte_pfn(hva_pte) ||
561 (pte_dirty(old_pte) && !pte_dirty(hva_pte));
562}
563
564void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
565{
566 unsigned long end = hva + PAGE_SIZE;
567 int ret;
568
569 ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
570 if (ret)
571 kvm_mips_callbacks->flush_shadow_all(kvm);
572}
573
574static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
575 struct kvm_memory_slot *memslot, void *data)
576{
577 return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
578}
579
580static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
581 struct kvm_memory_slot *memslot, void *data)
582{
583 gpa_t gpa = gfn << PAGE_SHIFT;
584 pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
585
586 if (!gpa_pte)
587 return 0;
588 return pte_young(*gpa_pte);
589}
590
591int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
592{
593 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
594}
595
596int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
597{
598 return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
599}
600
601/**
602 * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
603 * @vcpu: VCPU pointer.
604 * @gpa: Guest physical address of fault.
605 * @write_fault: Whether the fault was due to a write.
606 * @out_entry: New PTE for @gpa (written on success unless NULL).
607 * @out_buddy: New PTE for @gpa's buddy (written on success unless
608 * NULL).
609 *
610 * Perform fast path GPA fault handling, doing all that can be done without
 611 * calling into the KVM core. This handles marking old pages young (for idle page
612 * tracking), and dirtying of clean pages (for dirty page logging).
613 *
614 * Returns: 0 on success, in which case we can update derived mappings and
615 * resume guest execution.
616 * -EFAULT on failure due to absent GPA mapping or write to
617 * read-only page, in which case KVM must be consulted.
618 */
619static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
620 bool write_fault,
621 pte_t *out_entry, pte_t *out_buddy)
622{
623 struct kvm *kvm = vcpu->kvm;
624 gfn_t gfn = gpa >> PAGE_SHIFT;
625 pte_t *ptep;
626 kvm_pfn_t pfn = 0; /* silence bogus GCC warning */
627 bool pfn_valid = false;
628 int ret = 0;
629
630 spin_lock(&kvm->mmu_lock);
631
632 /* Fast path - just check GPA page table for an existing entry */
633 ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
634 if (!ptep || !pte_present(*ptep)) {
635 ret = -EFAULT;
636 goto out;
637 }
638
639 /* Track access to pages marked old */
640 if (!pte_young(*ptep)) {
641 set_pte(ptep, pte_mkyoung(*ptep));
642 pfn = pte_pfn(*ptep);
643 pfn_valid = true;
644 /* call kvm_set_pfn_accessed() after unlock */
645 }
646 if (write_fault && !pte_dirty(*ptep)) {
647 if (!pte_write(*ptep)) {
648 ret = -EFAULT;
649 goto out;
650 }
651
652 /* Track dirtying of writeable pages */
653 set_pte(ptep, pte_mkdirty(*ptep));
654 pfn = pte_pfn(*ptep);
655 mark_page_dirty(kvm, gfn);
656 kvm_set_pfn_dirty(pfn);
657 }
658
659 if (out_entry)
660 *out_entry = *ptep;
661 if (out_buddy)
662 *out_buddy = *ptep_buddy(ptep);
663
664out:
665 spin_unlock(&kvm->mmu_lock);
666 if (pfn_valid)
667 kvm_set_pfn_accessed(pfn);
668 return ret;
669}
670
671/**
672 * kvm_mips_map_page() - Map a guest physical page.
673 * @vcpu: VCPU pointer.
674 * @gpa: Guest physical address of fault.
675 * @write_fault: Whether the fault was due to a write.
676 * @out_entry: New PTE for @gpa (written on success unless NULL).
677 * @out_buddy: New PTE for @gpa's buddy (written on success unless
678 * NULL).
679 *
680 * Handle GPA faults by creating a new GPA mapping (or updating an existing
681 * one).
682 *
683 * This takes care of marking pages young or dirty (idle/dirty page tracking),
684 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
685 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
686 * caller.
687 *
688 * Returns: 0 on success, in which case the caller may use the @out_entry
689 * and @out_buddy PTEs to update derived mappings and resume guest
690 * execution.
691 * -EFAULT if there is no memory region at @gpa or a write was
692 * attempted to a read-only memory region. This is usually handled
693 * as an MMIO access.
694 */
695static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
696 bool write_fault,
697 pte_t *out_entry, pte_t *out_buddy)
698{
699 struct kvm *kvm = vcpu->kvm;
700 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
701 gfn_t gfn = gpa >> PAGE_SHIFT;
702 int srcu_idx, err;
703 kvm_pfn_t pfn;
704 pte_t *ptep, entry, old_pte;
705 bool writeable;
706 unsigned long prot_bits;
707 unsigned long mmu_seq;
708
709 /* Try the fast path to handle old / clean pages */
40 srcu_idx = srcu_read_lock(&kvm->srcu); 710 srcu_idx = srcu_read_lock(&kvm->srcu);
41 pfn = gfn_to_pfn(kvm, gfn); 711 err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry,
712 out_buddy);
713 if (!err)
714 goto out;
42 715
716 /* We need a minimum of cached pages ready for page table creation */
717 err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
718 KVM_NR_MEM_OBJS);
719 if (err)
720 goto out;
721
722retry:
723 /*
 724	 * Used to check for invalidations in progress of the pfn that is
 725	 * returned by gfn_to_pfn_prot() below.
726 */
727 mmu_seq = kvm->mmu_notifier_seq;
728 /*
729 * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
730 * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
731 * risk the page we get a reference to getting unmapped before we have a
732 * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
733 *
734 * This smp_rmb() pairs with the effective smp_wmb() of the combination
735 * of the pte_unmap_unlock() after the PTE is zapped, and the
736 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
737 * mmu_notifier_seq is incremented.
738 */
739 smp_rmb();
740
741 /* Slow path - ask KVM core whether we can access this GPA */
742 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable);
43 if (is_error_noslot_pfn(pfn)) { 743 if (is_error_noslot_pfn(pfn)) {
44 kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
45 err = -EFAULT; 744 err = -EFAULT;
46 goto out; 745 goto out;
47 } 746 }
48 747
49 kvm->arch.guest_pmap[gfn] = pfn; 748 spin_lock(&kvm->mmu_lock);
749 /* Check if an invalidation has taken place since we got pfn */
750 if (mmu_notifier_retry(kvm, mmu_seq)) {
751 /*
752 * This can happen when mappings are changed asynchronously, but
753 * also synchronously if a COW is triggered by
754 * gfn_to_pfn_prot().
755 */
756 spin_unlock(&kvm->mmu_lock);
757 kvm_release_pfn_clean(pfn);
758 goto retry;
759 }
760
761 /* Ensure page tables are allocated */
762 ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa);
763
764 /* Set up the PTE */
765 prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default;
766 if (writeable) {
767 prot_bits |= _PAGE_WRITE;
768 if (write_fault) {
769 prot_bits |= __WRITEABLE;
770 mark_page_dirty(kvm, gfn);
771 kvm_set_pfn_dirty(pfn);
772 }
773 }
774 entry = pfn_pte(pfn, __pgprot(prot_bits));
775
776 /* Write the PTE */
777 old_pte = *ptep;
778 set_pte(ptep, entry);
779
780 err = 0;
781 if (out_entry)
782 *out_entry = *ptep;
783 if (out_buddy)
784 *out_buddy = *ptep_buddy(ptep);
785
786 spin_unlock(&kvm->mmu_lock);
787 kvm_release_pfn_clean(pfn);
788 kvm_set_pfn_accessed(pfn);
50out: 789out:
51 srcu_read_unlock(&kvm->srcu, srcu_idx); 790 srcu_read_unlock(&kvm->srcu, srcu_idx);
52 return err; 791 return err;
53} 792}
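/*
 * Note on the retry protocol above: mmu_notifier_seq is sampled before
 * gfn_to_pfn_prot(), smp_rmb() orders the two reads, and once mmu_lock is
 * held mmu_notifier_retry() rejects the pfn if an invalidation ran in
 * between, sending the code back to the retry label with the pfn released.
 */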
54 793
55/* Translate guest KSEG0 addresses to Host PA */ 794static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu,
56unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, 795 unsigned long addr)
57 unsigned long gva)
58{ 796{
59 gfn_t gfn; 797 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
60 unsigned long offset = gva & ~PAGE_MASK; 798 pgd_t *pgdp;
61 struct kvm *kvm = vcpu->kvm; 799 int ret;
800
801 /* We need a minimum of cached pages ready for page table creation */
802 ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
803 KVM_NR_MEM_OBJS);
804 if (ret)
805 return NULL;
806
807 if (KVM_GUEST_KERNEL_MODE(vcpu))
808 pgdp = vcpu->arch.guest_kernel_mm.pgd;
809 else
810 pgdp = vcpu->arch.guest_user_mm.pgd;
811
812 return kvm_mips_walk_pgd(pgdp, memcache, addr);
813}
62 814
63 if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { 815void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
64 kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, 816 bool user)
65 __builtin_return_address(0), gva); 817{
66 return KVM_INVALID_PAGE; 818 pgd_t *pgdp;
819 pte_t *ptep;
820
821 addr &= PAGE_MASK << 1;
822
823 pgdp = vcpu->arch.guest_kernel_mm.pgd;
824 ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
825 if (ptep) {
826 ptep[0] = pfn_pte(0, __pgprot(0));
827 ptep[1] = pfn_pte(0, __pgprot(0));
828 }
829
830 if (user) {
831 pgdp = vcpu->arch.guest_user_mm.pgd;
832 ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
833 if (ptep) {
834 ptep[0] = pfn_pte(0, __pgprot(0));
835 ptep[1] = pfn_pte(0, __pgprot(0));
836 }
67 } 837 }
838}
68 839
69 gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); 840/*
841 * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}.
 842 * Flush a range of guest virtual address space from the VM's GVA page tables.
843 */
70 844
71 if (gfn >= kvm->arch.guest_pmap_npages) { 845static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva,
72 kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, 846 unsigned long end_gva)
73 gva); 847{
74 return KVM_INVALID_PAGE; 848 int i_min = __pte_offset(start_gva);
849 int i_max = __pte_offset(end_gva);
850 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
851 int i;
852
853 /*
854 * There's no freeing to do, so there's no point clearing individual
855 * entries unless only part of the last level page table needs flushing.
856 */
857 if (safe_to_remove)
858 return true;
859
860 for (i = i_min; i <= i_max; ++i) {
861 if (!pte_present(pte[i]))
862 continue;
863
864 set_pte(pte + i, __pte(0));
75 } 865 }
866 return false;
867}
76 868
77 if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) 869static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva,
78 return KVM_INVALID_ADDR; 870 unsigned long end_gva)
871{
872 pte_t *pte;
873 unsigned long end = ~0ul;
874 int i_min = __pmd_offset(start_gva);
875 int i_max = __pmd_offset(end_gva);
876 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
877 int i;
878
879 for (i = i_min; i <= i_max; ++i, start_gva = 0) {
880 if (!pmd_present(pmd[i]))
881 continue;
882
883 pte = pte_offset(pmd + i, 0);
884 if (i == i_max)
885 end = end_gva;
886
887 if (kvm_mips_flush_gva_pte(pte, start_gva, end)) {
888 pmd_clear(pmd + i);
889 pte_free_kernel(NULL, pte);
890 } else {
891 safe_to_remove = false;
892 }
893 }
894 return safe_to_remove;
895}
79 896
80 return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; 897static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva,
898 unsigned long end_gva)
899{
900 pmd_t *pmd;
901 unsigned long end = ~0ul;
902 int i_min = __pud_offset(start_gva);
903 int i_max = __pud_offset(end_gva);
904 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
905 int i;
906
907 for (i = i_min; i <= i_max; ++i, start_gva = 0) {
908 if (!pud_present(pud[i]))
909 continue;
910
911 pmd = pmd_offset(pud + i, 0);
912 if (i == i_max)
913 end = end_gva;
914
915 if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) {
916 pud_clear(pud + i);
917 pmd_free(NULL, pmd);
918 } else {
919 safe_to_remove = false;
920 }
921 }
922 return safe_to_remove;
923}
924
925static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva,
926 unsigned long end_gva)
927{
928 pud_t *pud;
929 unsigned long end = ~0ul;
930 int i_min = pgd_index(start_gva);
931 int i_max = pgd_index(end_gva);
932 bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
933 int i;
934
935 for (i = i_min; i <= i_max; ++i, start_gva = 0) {
936 if (!pgd_present(pgd[i]))
937 continue;
938
939 pud = pud_offset(pgd + i, 0);
940 if (i == i_max)
941 end = end_gva;
942
943 if (kvm_mips_flush_gva_pud(pud, start_gva, end)) {
944 pgd_clear(pgd + i);
945 pud_free(NULL, pud);
946 } else {
947 safe_to_remove = false;
948 }
949 }
950 return safe_to_remove;
951}
952
953void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags)
954{
955 if (flags & KMF_GPA) {
956 /* all of guest virtual address space could be affected */
957 if (flags & KMF_KERN)
958 /* useg, kseg0, seg2/3 */
959 kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff);
960 else
961 /* useg */
962 kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
963 } else {
964 /* useg */
965 kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
966
967 /* kseg2/3 */
968 if (flags & KMF_KERN)
969 kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff);
970 }
971}
972
973static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte)
974{
975 /*
976 * Don't leak writeable but clean entries from GPA page tables. We don't
977 * want the normal Linux tlbmod handler to handle dirtying when KVM
978 * accesses guest memory.
979 */
980 if (!pte_dirty(pte))
981 pte = pte_wrprotect(pte);
982
983 return pte;
984}
985
986static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
987{
988 /* Guest EntryLo overrides host EntryLo */
989 if (!(entrylo & ENTRYLO_D))
990 pte = pte_mkclean(pte);
991
992 return kvm_mips_gpa_pte_to_gva_unmapped(pte);
81} 993}
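/*
 * Worked example: a GPA PTE that is writable and dirty, combined with a
 * guest EntryLo whose D bit is clear, comes out clean and write-protected in
 * the GVA tables, so a guest store traps with a TLB modified exception that
 * can be relayed to the guest (see the trap_emul.c hunk further below).
 */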
82 994
83/* XXXKYMA: Must be called with interrupts disabled */ 995/* XXXKYMA: Must be called with interrupts disabled */
84int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, 996int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
85 struct kvm_vcpu *vcpu) 997 struct kvm_vcpu *vcpu,
998 bool write_fault)
86{ 999{
87 gfn_t gfn; 1000 unsigned long gpa;
88 kvm_pfn_t pfn0, pfn1; 1001 pte_t pte_gpa[2], *ptep_gva;
89 unsigned long vaddr = 0; 1002 int idx;
90 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
91 struct kvm *kvm = vcpu->kvm;
92 const int flush_dcache_mask = 0;
93 int ret;
94 1003
95 if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { 1004 if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
96 kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); 1005 kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
@@ -98,49 +1007,39 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
98 return -1; 1007 return -1;
99 } 1008 }
100 1009
101 gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); 1010 /* Get the GPA page table entry */
102 if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { 1011 gpa = KVM_GUEST_CPHYSADDR(badvaddr);
103 kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, 1012 idx = (badvaddr >> PAGE_SHIFT) & 1;
104 gfn, badvaddr); 1013 if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx],
105 kvm_mips_dump_host_tlbs(); 1014 &pte_gpa[!idx]) < 0)
106 return -1; 1015 return -1;
107 }
108 vaddr = badvaddr & (PAGE_MASK << 1);
109 1016
110 if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) 1017 /* Get the GVA page table entry */
1018 ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE);
1019 if (!ptep_gva) {
1020 kvm_err("No ptep for gva %lx\n", badvaddr);
111 return -1; 1021 return -1;
1022 }
112 1023
113 if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) 1024 /* Copy a pair of entries from GPA page table to GVA page table */
114 return -1; 1025 ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]);
115 1026 ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]);
116 pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
117 pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
118
119 entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
120 ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
121 ENTRYLO_D | ENTRYLO_V;
122 entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
123 ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
124 ENTRYLO_D | ENTRYLO_V;
125
126 preempt_disable();
127 entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
128 ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
129 flush_dcache_mask);
130 preempt_enable();
131 1027
132 return ret; 1028 /* Invalidate this entry in the TLB, guest kernel ASID only */
1029 kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
1030 return 0;
133} 1031}
134 1032
135int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, 1033int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
136 struct kvm_mips_tlb *tlb) 1034 struct kvm_mips_tlb *tlb,
1035 unsigned long gva,
1036 bool write_fault)
137{ 1037{
138 unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
139 struct kvm *kvm = vcpu->kvm; 1038 struct kvm *kvm = vcpu->kvm;
140 kvm_pfn_t pfn0, pfn1;
141 gfn_t gfn0, gfn1;
142 long tlb_lo[2]; 1039 long tlb_lo[2];
143 int ret; 1040 pte_t pte_gpa[2], *ptep_buddy, *ptep_gva;
1041 unsigned int idx = TLB_LO_IDX(*tlb, gva);
1042 bool kernel = KVM_GUEST_KERNEL_MODE(vcpu);
144 1043
145 tlb_lo[0] = tlb->tlb_lo[0]; 1044 tlb_lo[0] = tlb->tlb_lo[0];
146 tlb_lo[1] = tlb->tlb_lo[1]; 1045 tlb_lo[1] = tlb->tlb_lo[1];
@@ -149,70 +1048,64 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
149 * The commpage address must not be mapped to anything else if the guest 1048 * The commpage address must not be mapped to anything else if the guest
150 * TLB contains entries nearby, or commpage accesses will break. 1049 * TLB contains entries nearby, or commpage accesses will break.
151 */ 1050 */
152 if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & 1051 if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1)))
153 VPN2_MASK & (PAGE_MASK << 1))) 1052 tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0;
154 tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
155
156 gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
157 gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
158 if (gfn0 >= kvm->arch.guest_pmap_npages ||
159 gfn1 >= kvm->arch.guest_pmap_npages) {
160 kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
161 __func__, gfn0, gfn1, tlb->tlb_hi);
162 kvm_mips_dump_guest_tlbs(vcpu);
163 return -1;
164 }
165 1053
166 if (kvm_mips_map_page(kvm, gfn0) < 0) 1054 /* Get the GPA page table entry */
1055 if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]),
1056 write_fault, &pte_gpa[idx], NULL) < 0)
167 return -1; 1057 return -1;
168 1058
169 if (kvm_mips_map_page(kvm, gfn1) < 0) 1059 /* And its GVA buddy's GPA page table entry if it also exists */
1060 pte_gpa[!idx] = pfn_pte(0, __pgprot(0));
1061 if (tlb_lo[!idx] & ENTRYLO_V) {
1062 spin_lock(&kvm->mmu_lock);
1063 ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL,
1064 mips3_tlbpfn_to_paddr(tlb_lo[!idx]));
1065 if (ptep_buddy)
1066 pte_gpa[!idx] = *ptep_buddy;
1067 spin_unlock(&kvm->mmu_lock);
1068 }
1069
1070 /* Get the GVA page table entry pair */
1071 ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE);
1072 if (!ptep_gva) {
1073 kvm_err("No ptep for gva %lx\n", gva);
170 return -1; 1074 return -1;
1075 }
171 1076
172 pfn0 = kvm->arch.guest_pmap[gfn0]; 1077 /* Copy a pair of entries from GPA page table to GVA page table */
173 pfn1 = kvm->arch.guest_pmap[gfn1]; 1078 ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]);
1079 ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]);
174 1080
175 /* Get attributes from the Guest TLB */ 1081 /* Invalidate this entry in the TLB, current guest mode ASID only */
176 entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | 1082 kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel);
177 ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
178 (tlb_lo[0] & ENTRYLO_D) |
179 (tlb_lo[0] & ENTRYLO_V);
180 entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
181 ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
182 (tlb_lo[1] & ENTRYLO_D) |
183 (tlb_lo[1] & ENTRYLO_V);
184 1083
185 kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, 1084 kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
186 tlb->tlb_lo[0], tlb->tlb_lo[1]); 1085 tlb->tlb_lo[0], tlb->tlb_lo[1]);
187 1086
188 preempt_disable(); 1087 return 0;
189 entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
190 kvm_mips_get_kernel_asid(vcpu) :
191 kvm_mips_get_user_asid(vcpu));
192 ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
193 tlb->tlb_mask);
194 preempt_enable();
195
196 return ret;
197} 1088}
198 1089
199void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, 1090int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
200 struct kvm_vcpu *vcpu) 1091 struct kvm_vcpu *vcpu)
201{ 1092{
202 unsigned long asid = asid_cache(cpu); 1093 kvm_pfn_t pfn;
203 1094 pte_t *ptep;
204 asid += cpu_asid_inc();
205 if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
206 if (cpu_has_vtag_icache)
207 flush_icache_all();
208
209 kvm_local_flush_tlb_all(); /* start new asid cycle */
210 1095
211 if (!asid) /* fix version if needed */ 1096 ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr);
212 asid = asid_first_version(cpu); 1097 if (!ptep) {
1098 kvm_err("No ptep for commpage %lx\n", badvaddr);
1099 return -1;
213 } 1100 }
214 1101
215 cpu_context(cpu, mm) = asid_cache(cpu) = asid; 1102 pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
1103 /* Also set valid and dirty, so refill handler doesn't have to */
1104 *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED)));
1105
1106 /* Invalidate this entry in the TLB, guest kernel ASID only */
1107 kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
1108 return 0;
216} 1109}
217 1110
218/** 1111/**
@@ -235,42 +1128,13 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
235/* Restore ASID once we are scheduled back after preemption */ 1128/* Restore ASID once we are scheduled back after preemption */
236void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1129void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
237{ 1130{
238 unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
239 unsigned long flags; 1131 unsigned long flags;
240 int newasid = 0;
241 1132
242 kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); 1133 kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
243 1134
244 /* Allocate new kernel and user ASIDs if needed */
245
246 local_irq_save(flags); 1135 local_irq_save(flags);
247 1136
248 if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & 1137 vcpu->cpu = cpu;
249 asid_version_mask(cpu)) {
250 kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
251 vcpu->arch.guest_kernel_asid[cpu] =
252 vcpu->arch.guest_kernel_mm.context.asid[cpu];
253 newasid++;
254
255 kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
256 cpu_context(cpu, current->mm));
257 kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
258 cpu, vcpu->arch.guest_kernel_asid[cpu]);
259 }
260
261 if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
262 asid_version_mask(cpu)) {
263 kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
264 vcpu->arch.guest_user_asid[cpu] =
265 vcpu->arch.guest_user_mm.context.asid[cpu];
266 newasid++;
267
268 kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
269 cpu_context(cpu, current->mm));
270 kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
271 vcpu->arch.guest_user_asid[cpu]);
272 }
273
274 if (vcpu->arch.last_sched_cpu != cpu) { 1138 if (vcpu->arch.last_sched_cpu != cpu) {
275 kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", 1139 kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
276 vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); 1140 vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
@@ -282,42 +1146,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
282 kvm_mips_migrate_count(vcpu); 1146 kvm_mips_migrate_count(vcpu);
283 } 1147 }
284 1148
285 if (!newasid) {
286 /*
287 * If we preempted while the guest was executing, then reload
288 * the pre-empted ASID
289 */
290 if (current->flags & PF_VCPU) {
291 write_c0_entryhi(vcpu->arch.
292 preempt_entryhi & asid_mask);
293 ehb();
294 }
295 } else {
296 /* New ASIDs were allocated for the VM */
297
298 /*
299 * Were we in guest context? If so then the pre-empted ASID is
300 * no longer valid, we need to set it to what it should be based
301 * on the mode of the Guest (Kernel/User)
302 */
303 if (current->flags & PF_VCPU) {
304 if (KVM_GUEST_KERNEL_MODE(vcpu))
305 write_c0_entryhi(vcpu->arch.
306 guest_kernel_asid[cpu] &
307 asid_mask);
308 else
309 write_c0_entryhi(vcpu->arch.
310 guest_user_asid[cpu] &
311 asid_mask);
312 ehb();
313 }
314 }
315
316 /* restore guest state to registers */ 1149 /* restore guest state to registers */
317 kvm_mips_callbacks->vcpu_set_regs(vcpu); 1150 kvm_mips_callbacks->vcpu_load(vcpu, cpu);
318 1151
319 local_irq_restore(flags); 1152 local_irq_restore(flags);
320
321} 1153}
322 1154
323/* ASID can change if another task is scheduled during preemption */ 1155/* ASID can change if another task is scheduled during preemption */
@@ -329,75 +1161,90 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
329 local_irq_save(flags); 1161 local_irq_save(flags);
330 1162
331 cpu = smp_processor_id(); 1163 cpu = smp_processor_id();
332
333 vcpu->arch.preempt_entryhi = read_c0_entryhi();
334 vcpu->arch.last_sched_cpu = cpu; 1164 vcpu->arch.last_sched_cpu = cpu;
1165 vcpu->cpu = -1;
335 1166
336 /* save guest state in registers */ 1167 /* save guest state in registers */
337 kvm_mips_callbacks->vcpu_get_regs(vcpu); 1168 kvm_mips_callbacks->vcpu_put(vcpu, cpu);
338
339 if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
340 asid_version_mask(cpu))) {
341 kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
342 cpu_context(cpu, current->mm));
343 drop_mmu_context(current->mm, cpu);
344 }
345 write_c0_entryhi(cpu_asid(cpu, current->mm));
346 ehb();
347 1169
348 local_irq_restore(flags); 1170 local_irq_restore(flags);
349} 1171}
350 1172
351u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) 1173/**
1174 * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault.
1175 * @vcpu: Virtual CPU.
1176 * @gva: Guest virtual address to be accessed.
1177 * @write: True if write attempted (must be dirtied and made writable).
1178 *
1179 * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and
1180 * dirtying the page if @write so that guest instructions can be modified.
1181 *
1182 * Returns: KVM_MIPS_MAPPED on success.
1183 * KVM_MIPS_GVA if bad guest virtual address.
1184 * KVM_MIPS_GPA if bad guest physical address.
1185 * KVM_MIPS_TLB if guest TLB not present.
1186 * KVM_MIPS_TLBINV if guest TLB present but not valid.
1187 * KVM_MIPS_TLBMOD if guest TLB read only.
1188 */
1189enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
1190 unsigned long gva,
1191 bool write)
352{ 1192{
353 struct mips_coproc *cop0 = vcpu->arch.cop0; 1193 struct mips_coproc *cop0 = vcpu->arch.cop0;
354 unsigned long paddr, flags, vpn2, asid; 1194 struct kvm_mips_tlb *tlb;
355 unsigned long va = (unsigned long)opc;
356 void *vaddr;
357 u32 inst;
358 int index; 1195 int index;
359 1196
360 if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || 1197 if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) {
361 KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { 1198 if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0)
362 local_irq_save(flags); 1199 return KVM_MIPS_GPA;
363 index = kvm_mips_host_tlb_lookup(vcpu, va); 1200 } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) ||
364 if (index >= 0) { 1201 KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) {
365 inst = *(opc); 1202 /* Address should be in the guest TLB */
366 } else { 1203 index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) |
367 vpn2 = va & VPN2_MASK; 1204 (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID));
368 asid = kvm_read_c0_guest_entryhi(cop0) & 1205 if (index < 0)
369 KVM_ENTRYHI_ASID; 1206 return KVM_MIPS_TLB;
370 index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); 1207 tlb = &vcpu->arch.guest_tlb[index];
371 if (index < 0) { 1208
372 kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", 1209 /* Entry should be valid, and dirty for writes */
373 __func__, opc, vcpu, read_c0_entryhi()); 1210 if (!TLB_IS_VALID(*tlb, gva))
374 kvm_mips_dump_host_tlbs(); 1211 return KVM_MIPS_TLBINV;
375 kvm_mips_dump_guest_tlbs(vcpu); 1212 if (write && !TLB_IS_DIRTY(*tlb, gva))
376 local_irq_restore(flags); 1213 return KVM_MIPS_TLBMOD;
377 return KVM_INVALID_INST; 1214
378 } 1215 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write))
379 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, 1216 return KVM_MIPS_GPA;
380 &vcpu->arch.guest_tlb[index])) {
381 kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
382 __func__, opc, index, vcpu,
383 read_c0_entryhi());
384 kvm_mips_dump_guest_tlbs(vcpu);
385 local_irq_restore(flags);
386 return KVM_INVALID_INST;
387 }
388 inst = *(opc);
389 }
390 local_irq_restore(flags);
391 } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
392 paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
393 vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
394 vaddr += paddr & ~PAGE_MASK;
395 inst = *(u32 *)vaddr;
396 kunmap_atomic(vaddr);
397 } else { 1217 } else {
398 kvm_err("%s: illegal address: %p\n", __func__, opc); 1218 return KVM_MIPS_GVA;
399 return KVM_INVALID_INST;
400 } 1219 }
401 1220
402 return inst; 1221 return KVM_MIPS_MAPPED;
1222}
1223
1224int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
1225{
1226 int err;
1227
1228retry:
1229 kvm_trap_emul_gva_lockless_begin(vcpu);
1230 err = get_user(*out, opc);
1231 kvm_trap_emul_gva_lockless_end(vcpu);
1232
1233 if (unlikely(err)) {
1234 /*
1235 * Try to handle the fault, maybe we just raced with a GVA
1236 * invalidation.
1237 */
1238 err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc,
1239 false);
1240 if (unlikely(err)) {
1241 kvm_err("%s: illegal address: %p\n",
1242 __func__, opc);
1243 return -EFAULT;
1244 }
1245
1246 /* Hopefully it'll work now */
1247 goto retry;
1248 }
1249 return 0;
403} 1250}
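/*
 * Note on the pattern above: the instruction fetch is first attempted with a
 * plain get_user() inside kvm_trap_emul_gva_lockless_begin()/_end(); only if
 * that faults does kvm_get_inst() fall back to kvm_trap_emul_gva_fault() to
 * (re)establish the GVA mapping, then retry the read.
 */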
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 254377d8e0b9..2819eb793345 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -33,28 +33,20 @@
33#define KVM_GUEST_PC_TLB 0 33#define KVM_GUEST_PC_TLB 0
34#define KVM_GUEST_SP_TLB 1 34#define KVM_GUEST_SP_TLB 1
35 35
36atomic_t kvm_mips_instance;
37EXPORT_SYMBOL_GPL(kvm_mips_instance);
38
39static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) 36static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
40{ 37{
38 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
41 int cpu = smp_processor_id(); 39 int cpu = smp_processor_id();
42 40
43 return vcpu->arch.guest_kernel_asid[cpu] & 41 return cpu_asid(cpu, kern_mm);
44 cpu_asid_mask(&cpu_data[cpu]);
45} 42}
46 43
47static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) 44static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
48{ 45{
46 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
49 int cpu = smp_processor_id(); 47 int cpu = smp_processor_id();
50 48
51 return vcpu->arch.guest_user_asid[cpu] & 49 return cpu_asid(cpu, user_mm);
52 cpu_asid_mask(&cpu_data[cpu]);
53}
54
55inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
56{
57 return vcpu->kvm->arch.commpage_tlb;
58} 50}
59 51
 60/* Structure defining a TLB entry data set. */ 52/* Structure defining a TLB entry data set. */
@@ -104,109 +96,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
104} 96}
105EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); 97EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
106 98
107/* XXXKYMA: Must be called with interrupts disabled */
108/* set flush_dcache_mask == 0 if no dcache flush required */
109int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
110 unsigned long entrylo0, unsigned long entrylo1,
111 int flush_dcache_mask)
112{
113 unsigned long flags;
114 unsigned long old_entryhi;
115 int idx;
116
117 local_irq_save(flags);
118
119 old_entryhi = read_c0_entryhi();
120 write_c0_entryhi(entryhi);
121 mtc0_tlbw_hazard();
122
123 tlb_probe();
124 tlb_probe_hazard();
125 idx = read_c0_index();
126
127 if (idx > current_cpu_data.tlbsize) {
128 kvm_err("%s: Invalid Index: %d\n", __func__, idx);
129 kvm_mips_dump_host_tlbs();
130 local_irq_restore(flags);
131 return -1;
132 }
133
134 write_c0_entrylo0(entrylo0);
135 write_c0_entrylo1(entrylo1);
136 mtc0_tlbw_hazard();
137
138 if (idx < 0)
139 tlb_write_random();
140 else
141 tlb_write_indexed();
142 tlbw_use_hazard();
143
144 kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n",
145 vcpu->arch.pc, idx, read_c0_entryhi(),
146 read_c0_entrylo0(), read_c0_entrylo1());
147
148 /* Flush D-cache */
149 if (flush_dcache_mask) {
150 if (entrylo0 & ENTRYLO_V) {
151 ++vcpu->stat.flush_dcache_exits;
152 flush_data_cache_page((entryhi & VPN2_MASK) &
153 ~flush_dcache_mask);
154 }
155 if (entrylo1 & ENTRYLO_V) {
156 ++vcpu->stat.flush_dcache_exits;
157 flush_data_cache_page(((entryhi & VPN2_MASK) &
158 ~flush_dcache_mask) |
159 (0x1 << PAGE_SHIFT));
160 }
161 }
162
163 /* Restore old ASID */
164 write_c0_entryhi(old_entryhi);
165 mtc0_tlbw_hazard();
166 local_irq_restore(flags);
167 return 0;
168}
169EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
170
171int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
172 struct kvm_vcpu *vcpu)
173{
174 kvm_pfn_t pfn;
175 unsigned long flags, old_entryhi = 0, vaddr = 0;
176 unsigned long entrylo[2] = { 0, 0 };
177 unsigned int pair_idx;
178
179 pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
180 pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
181 entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
182 ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
183 ENTRYLO_D | ENTRYLO_V;
184
185 local_irq_save(flags);
186
187 old_entryhi = read_c0_entryhi();
188 vaddr = badvaddr & (PAGE_MASK << 1);
189 write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
190 write_c0_entrylo0(entrylo[0]);
191 write_c0_entrylo1(entrylo[1]);
192 write_c0_index(kvm_mips_get_commpage_asid(vcpu));
193 mtc0_tlbw_hazard();
194 tlb_write_indexed();
195 tlbw_use_hazard();
196
197 kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
198 vcpu->arch.pc, read_c0_index(), read_c0_entryhi(),
199 read_c0_entrylo0(), read_c0_entrylo1());
200
201 /* Restore old ASID */
202 write_c0_entryhi(old_entryhi);
203 mtc0_tlbw_hazard();
204 local_irq_restore(flags);
205
206 return 0;
207}
208EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
209
210int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) 99int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
211{ 100{
212 int i; 101 int i;
@@ -228,51 +117,11 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
228} 117}
229EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup); 118EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup);
230 119
231int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) 120static int _kvm_mips_host_tlb_inv(unsigned long entryhi)
232{
233 unsigned long old_entryhi, flags;
234 int idx;
235
236 local_irq_save(flags);
237
238 old_entryhi = read_c0_entryhi();
239
240 if (KVM_GUEST_KERNEL_MODE(vcpu))
241 write_c0_entryhi((vaddr & VPN2_MASK) |
242 kvm_mips_get_kernel_asid(vcpu));
243 else {
244 write_c0_entryhi((vaddr & VPN2_MASK) |
245 kvm_mips_get_user_asid(vcpu));
246 }
247
248 mtc0_tlbw_hazard();
249
250 tlb_probe();
251 tlb_probe_hazard();
252 idx = read_c0_index();
253
254 /* Restore old ASID */
255 write_c0_entryhi(old_entryhi);
256 mtc0_tlbw_hazard();
257
258 local_irq_restore(flags);
259
260 kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx);
261
262 return idx;
263}
264EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup);
265
266int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
267{ 121{
268 int idx; 122 int idx;
269 unsigned long flags, old_entryhi;
270
271 local_irq_save(flags);
272
273 old_entryhi = read_c0_entryhi();
274 123
275 write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu)); 124 write_c0_entryhi(entryhi);
276 mtc0_tlbw_hazard(); 125 mtc0_tlbw_hazard();
277 126
278 tlb_probe(); 127 tlb_probe();
@@ -282,7 +131,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
282 if (idx >= current_cpu_data.tlbsize) 131 if (idx >= current_cpu_data.tlbsize)
283 BUG(); 132 BUG();
284 133
285 if (idx > 0) { 134 if (idx >= 0) {
286 write_c0_entryhi(UNIQUE_ENTRYHI(idx)); 135 write_c0_entryhi(UNIQUE_ENTRYHI(idx));
287 write_c0_entrylo0(0); 136 write_c0_entrylo0(0);
288 write_c0_entrylo1(0); 137 write_c0_entrylo1(0);
@@ -292,93 +141,75 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
292 tlbw_use_hazard(); 141 tlbw_use_hazard();
293 } 142 }
294 143
295 write_c0_entryhi(old_entryhi); 144 return idx;
296 mtc0_tlbw_hazard();
297
298 local_irq_restore(flags);
299
300 if (idx > 0)
301 kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__,
302 (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx);
303
304 return 0;
305} 145}
306EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
307 146
308void kvm_mips_flush_host_tlb(int skip_kseg0) 147int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
148 bool user, bool kernel)
309{ 149{
310 unsigned long flags; 150 int idx_user, idx_kernel;
311 unsigned long old_entryhi, entryhi; 151 unsigned long flags, old_entryhi;
312 unsigned long old_pagemask;
313 int entry = 0;
314 int maxentry = current_cpu_data.tlbsize;
315 152
316 local_irq_save(flags); 153 local_irq_save(flags);
317 154
318 old_entryhi = read_c0_entryhi(); 155 old_entryhi = read_c0_entryhi();
319 old_pagemask = read_c0_pagemask();
320
321 /* Blast 'em all away. */
322 for (entry = 0; entry < maxentry; entry++) {
323 write_c0_index(entry);
324
325 if (skip_kseg0) {
326 mtc0_tlbr_hazard();
327 tlb_read();
328 tlb_read_hazard();
329
330 entryhi = read_c0_entryhi();
331 156
332 /* Don't blow away guest kernel entries */ 157 if (user)
333 if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) 158 idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
334 continue; 159 kvm_mips_get_user_asid(vcpu));
335 160 if (kernel)
336 write_c0_pagemask(old_pagemask); 161 idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
337 } 162 kvm_mips_get_kernel_asid(vcpu));
338
339 /* Make sure all entries differ. */
340 write_c0_entryhi(UNIQUE_ENTRYHI(entry));
341 write_c0_entrylo0(0);
342 write_c0_entrylo1(0);
343 mtc0_tlbw_hazard();
344
345 tlb_write_indexed();
346 tlbw_use_hazard();
347 }
348 163
349 write_c0_entryhi(old_entryhi); 164 write_c0_entryhi(old_entryhi);
350 write_c0_pagemask(old_pagemask);
351 mtc0_tlbw_hazard(); 165 mtc0_tlbw_hazard();
352 166
353 local_irq_restore(flags); 167 local_irq_restore(flags);
168
169 if (user && idx_user >= 0)
170 kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n",
171 __func__, (va & VPN2_MASK) |
172 kvm_mips_get_user_asid(vcpu), idx_user);
173 if (kernel && idx_kernel >= 0)
174 kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n",
175 __func__, (va & VPN2_MASK) |
176 kvm_mips_get_kernel_asid(vcpu), idx_kernel);
177
178 return 0;
354} 179}
355EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); 180EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
356 181
357void kvm_local_flush_tlb_all(void) 182/**
183 * kvm_mips_suspend_mm() - Suspend the active mm.
 184 * @cpu:	The CPU we're running on.
185 *
186 * Suspend the active_mm, ready for a switch to a KVM guest virtual address
187 * space. This is left active for the duration of guest context, including time
188 * with interrupts enabled, so we need to be careful not to confuse e.g. cache
189 * management IPIs.
190 *
191 * kvm_mips_resume_mm() should be called before context switching to a different
192 * process so we don't need to worry about reference counting.
193 *
194 * This needs to be in static kernel code to avoid exporting init_mm.
195 */
196void kvm_mips_suspend_mm(int cpu)
358{ 197{
359 unsigned long flags; 198 cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm));
360 unsigned long old_ctx; 199 current->active_mm = &init_mm;
361 int entry = 0; 200}
362 201EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm);
363 local_irq_save(flags);
364 /* Save old context and create impossible VPN2 value */
365 old_ctx = read_c0_entryhi();
366 write_c0_entrylo0(0);
367 write_c0_entrylo1(0);
368
369 /* Blast 'em all away. */
370 while (entry < current_cpu_data.tlbsize) {
371 /* Make sure all entries differ. */
372 write_c0_entryhi(UNIQUE_ENTRYHI(entry));
373 write_c0_index(entry);
374 mtc0_tlbw_hazard();
375 tlb_write_indexed();
376 tlbw_use_hazard();
377 entry++;
378 }
379 write_c0_entryhi(old_ctx);
380 mtc0_tlbw_hazard();
381 202
382 local_irq_restore(flags); 203/**
204 * kvm_mips_resume_mm() - Resume the current process mm.
205 * @cpu The CPU we're running on.
206 *
207 * Resume the mm of the current process, after a switch back from a KVM guest
208 * virtual address space (see kvm_mips_suspend_mm()).
209 */
210void kvm_mips_resume_mm(int cpu)
211{
212 cpumask_set_cpu(cpu, mm_cpumask(current->mm));
213 current->active_mm = current->mm;
383} 214}
384EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); 215EXPORT_SYMBOL_GPL(kvm_mips_resume_mm);
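/*
 * Illustrative sketch, not part of the patch: the two helpers above are meant
 * to bracket guest context, roughly
 *
 *	kvm_mips_suspend_mm(cpu);
 *	... run in the guest virtual address space ...
 *	kvm_mips_resume_mm(cpu);
 *
 * so that cache management IPIs for the suspended mm are not sent to this
 * CPU while it is running the guest.
 */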
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 3b20441f2beb..b1fa53b252ea 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -11,9 +11,11 @@
11 11
12#include <linux/errno.h> 12#include <linux/errno.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/vmalloc.h>
15
16#include <linux/kvm_host.h> 14#include <linux/kvm_host.h>
15#include <linux/uaccess.h>
16#include <linux/vmalloc.h>
17#include <asm/mmu_context.h>
18#include <asm/pgalloc.h>
17 19
18#include "interrupt.h" 20#include "interrupt.h"
19 21
@@ -21,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
21{ 23{
22 gpa_t gpa; 24 gpa_t gpa;
23 gva_t kseg = KSEGX(gva); 25 gva_t kseg = KSEGX(gva);
26 gva_t gkseg = KVM_GUEST_KSEGX(gva);
24 27
25 if ((kseg == CKSEG0) || (kseg == CKSEG1)) 28 if ((kseg == CKSEG0) || (kseg == CKSEG1))
26 gpa = CPHYSADDR(gva); 29 gpa = CPHYSADDR(gva);
30 else if (gkseg == KVM_GUEST_KSEG0)
31 gpa = KVM_GUEST_CPHYSADDR(gva);
27 else { 32 else {
28 kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva); 33 kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva);
29 kvm_mips_dump_host_tlbs(); 34 kvm_mips_dump_host_tlbs();
@@ -83,48 +88,134 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
83 return ret; 88 return ret;
84} 89}
85 90
91static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run,
92 struct kvm_vcpu *vcpu)
93{
94 enum emulation_result er;
95 union mips_instruction inst;
96 int err;
97
98 /* A code fetch fault doesn't count as an MMIO */
99 if (kvm_is_ifetch_fault(&vcpu->arch)) {
100 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
101 return RESUME_HOST;
102 }
103
104 /* Fetch the instruction. */
105 if (cause & CAUSEF_BD)
106 opc += 1;
107 err = kvm_get_badinstr(opc, vcpu, &inst.word);
108 if (err) {
109 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
110 return RESUME_HOST;
111 }
112
113 /* Emulate the load */
114 er = kvm_mips_emulate_load(inst, cause, run, vcpu);
115 if (er == EMULATE_FAIL) {
116 kvm_err("Emulate load from MMIO space failed\n");
117 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
118 } else {
119 run->exit_reason = KVM_EXIT_MMIO;
120 }
121 return RESUME_HOST;
122}
123
124static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run,
125 struct kvm_vcpu *vcpu)
126{
127 enum emulation_result er;
128 union mips_instruction inst;
129 int err;
130
131 /* Fetch the instruction. */
132 if (cause & CAUSEF_BD)
133 opc += 1;
134 err = kvm_get_badinstr(opc, vcpu, &inst.word);
135 if (err) {
136 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
137 return RESUME_HOST;
138 }
139
140 /* Emulate the store */
141 er = kvm_mips_emulate_store(inst, cause, run, vcpu);
142 if (er == EMULATE_FAIL) {
143 kvm_err("Emulate store to MMIO space failed\n");
144 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
145 } else {
146 run->exit_reason = KVM_EXIT_MMIO;
147 }
148 return RESUME_HOST;
149}
150
151static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run,
152 struct kvm_vcpu *vcpu, bool store)
153{
154 if (store)
155 return kvm_mips_bad_store(cause, opc, run, vcpu);
156 else
157 return kvm_mips_bad_load(cause, opc, run, vcpu);
158}
159
86static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) 160static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
87{ 161{
162 struct mips_coproc *cop0 = vcpu->arch.cop0;
88 struct kvm_run *run = vcpu->run; 163 struct kvm_run *run = vcpu->run;
89 u32 __user *opc = (u32 __user *) vcpu->arch.pc; 164 u32 __user *opc = (u32 __user *) vcpu->arch.pc;
90 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; 165 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
91 u32 cause = vcpu->arch.host_cp0_cause; 166 u32 cause = vcpu->arch.host_cp0_cause;
92 enum emulation_result er = EMULATE_DONE; 167 struct kvm_mips_tlb *tlb;
93 int ret = RESUME_GUEST; 168 unsigned long entryhi;
169 int index;
94 170
95 if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 171 if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
96 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { 172 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
97 kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", 173 /*
98 cause, opc, badvaddr); 174 * First find the mapping in the guest TLB. If the failure to
99 er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); 175 * write was due to the guest TLB, it should be up to the guest
176 * to handle it.
177 */
178 entryhi = (badvaddr & VPN2_MASK) |
179 (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
180 index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
100 181
101 if (er == EMULATE_DONE) 182 /*
102 ret = RESUME_GUEST; 183 * These should never happen.
103 else { 184 * They would indicate stale host TLB entries.
185 */
186 if (unlikely(index < 0)) {
104 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 187 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
105 ret = RESUME_HOST; 188 return RESUME_HOST;
106 } 189 }
107 } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { 190 tlb = vcpu->arch.guest_tlb + index;
191 if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) {
192 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
193 return RESUME_HOST;
194 }
195
108 /* 196 /*
109 * XXXKYMA: The guest kernel does not expect to get this fault 197 * Guest entry not dirty? That would explain the TLB modified
110 * when we are not using HIGHMEM. Need to address this in a 198 * exception. Relay that on to the guest so it can handle it.
111 * HIGHMEM kernel
112 */ 199 */
113 kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", 200 if (!TLB_IS_DIRTY(*tlb, badvaddr)) {
114 cause, opc, badvaddr); 201 kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
115 kvm_mips_dump_host_tlbs(); 202 return RESUME_GUEST;
116 kvm_arch_vcpu_dump_regs(vcpu); 203 }
117 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 204
118 ret = RESUME_HOST; 205 if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr,
206 true))
207 /* Not writable, needs handling as MMIO */
208 return kvm_mips_bad_store(cause, opc, run, vcpu);
209 return RESUME_GUEST;
210 } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
211 if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0)
212 /* Not writable, needs handling as MMIO */
213 return kvm_mips_bad_store(cause, opc, run, vcpu);
214 return RESUME_GUEST;
119 } else { 215 } else {
120 kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", 216 /* host kernel addresses are all handled as MMIO */
121 cause, opc, badvaddr); 217 return kvm_mips_bad_store(cause, opc, run, vcpu);
122 kvm_mips_dump_host_tlbs();
123 kvm_arch_vcpu_dump_regs(vcpu);
124 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
125 ret = RESUME_HOST;
126 } 218 }
127 return ret;
128} 219}
129 220
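The rewritten TLB-modified handler above replaces the old unconditional internal error with a decision ladder: a missing or invalid guest TLB entry indicates a stale host TLB entry (internal error), a clean entry means the guest itself must see the TLB Modified exception, and a dirty entry that still cannot be mapped writable is handed to MMIO store emulation. A condensed stand-alone model of that ladder, with the guest TLB lookup and the KVM mapping helpers reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

enum action { INTERNAL_ERROR, DELIVER_TLBMOD_TO_GUEST, RESUME, EMULATE_MMIO_STORE };

struct guest_tlb_entry { bool present, valid, dirty; };

/* Mirror of the ladder in kvm_trap_emul_handle_tlb_mod() for a mapped address. */
static enum action handle_tlb_mod(const struct guest_tlb_entry *e,
				  bool host_mapped_writable)
{
	if (!e->present || !e->valid)
		return INTERNAL_ERROR;          /* stale host TLB entry */
	if (!e->dirty)
		return DELIVER_TLBMOD_TO_GUEST; /* guest must mark the page dirty */
	if (!host_mapped_writable)
		return EMULATE_MMIO_STORE;      /* not backed by writable RAM */
	return RESUME;                          /* host mapping fixed up, re-run guest */
}

int main(void)
{
	struct guest_tlb_entry clean = { true, true, false };
	struct guest_tlb_entry dirty = { true, true, true };

	printf("%d %d %d\n",
	       handle_tlb_mod(&clean, true),
	       handle_tlb_mod(&dirty, true),
	       handle_tlb_mod(&dirty, false));
	return 0;
}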
130static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) 221static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
@@ -157,7 +248,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
157 * into the shadow host TLB 248 * into the shadow host TLB
158 */ 249 */
159 250
160 er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); 251 er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store);
161 if (er == EMULATE_DONE) 252 if (er == EMULATE_DONE)
162 ret = RESUME_GUEST; 253 ret = RESUME_GUEST;
163 else { 254 else {
@@ -169,29 +260,15 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
169 * All KSEG0 faults are handled by KVM, as the guest kernel does 260 * All KSEG0 faults are handled by KVM, as the guest kernel does
170 * not expect to ever get them 261 * not expect to ever get them
171 */ 262 */
172 if (kvm_mips_handle_kseg0_tlb_fault 263 if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0)
173 (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { 264 ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
174 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
175 ret = RESUME_HOST;
176 }
177 } else if (KVM_GUEST_KERNEL_MODE(vcpu) 265 } else if (KVM_GUEST_KERNEL_MODE(vcpu)
178 && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { 266 && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
179 /* 267 /*
180 * With EVA we may get a TLB exception instead of an address 268 * With EVA we may get a TLB exception instead of an address
181 * error when the guest performs MMIO to KSeg1 addresses. 269 * error when the guest performs MMIO to KSeg1 addresses.
182 */ 270 */
183 kvm_debug("Emulate %s MMIO space\n", 271 ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
184 store ? "Store to" : "Load from");
185 er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
186 if (er == EMULATE_FAIL) {
187 kvm_err("Emulate %s MMIO space failed\n",
188 store ? "Store to" : "Load from");
189 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
190 ret = RESUME_HOST;
191 } else {
192 run->exit_reason = KVM_EXIT_MMIO;
193 ret = RESUME_HOST;
194 }
195 } else { 272 } else {
196 kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", 273 kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
197 store ? "ST" : "LD", cause, opc, badvaddr); 274 store ? "ST" : "LD", cause, opc, badvaddr);
@@ -219,21 +296,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
219 u32 __user *opc = (u32 __user *) vcpu->arch.pc; 296 u32 __user *opc = (u32 __user *) vcpu->arch.pc;
220 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; 297 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
221 u32 cause = vcpu->arch.host_cp0_cause; 298 u32 cause = vcpu->arch.host_cp0_cause;
222 enum emulation_result er = EMULATE_DONE;
223 int ret = RESUME_GUEST; 299 int ret = RESUME_GUEST;
224 300
225 if (KVM_GUEST_KERNEL_MODE(vcpu) 301 if (KVM_GUEST_KERNEL_MODE(vcpu)
226 && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { 302 && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
227 kvm_debug("Emulate Store to MMIO space\n"); 303 ret = kvm_mips_bad_store(cause, opc, run, vcpu);
228 er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
229 if (er == EMULATE_FAIL) {
230 kvm_err("Emulate Store to MMIO space failed\n");
231 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
232 ret = RESUME_HOST;
233 } else {
234 run->exit_reason = KVM_EXIT_MMIO;
235 ret = RESUME_HOST;
236 }
237 } else { 304 } else {
238 kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", 305 kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
239 cause, opc, badvaddr); 306 cause, opc, badvaddr);
@@ -249,26 +316,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
249 u32 __user *opc = (u32 __user *) vcpu->arch.pc; 316 u32 __user *opc = (u32 __user *) vcpu->arch.pc;
250 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; 317 unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
251 u32 cause = vcpu->arch.host_cp0_cause; 318 u32 cause = vcpu->arch.host_cp0_cause;
252 enum emulation_result er = EMULATE_DONE;
253 int ret = RESUME_GUEST; 319 int ret = RESUME_GUEST;
254 320
255 if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { 321 if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) {
256 kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); 322 ret = kvm_mips_bad_load(cause, opc, run, vcpu);
257 er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
258 if (er == EMULATE_FAIL) {
259 kvm_err("Emulate Load from MMIO space failed\n");
260 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
261 ret = RESUME_HOST;
262 } else {
263 run->exit_reason = KVM_EXIT_MMIO;
264 ret = RESUME_HOST;
265 }
266 } else { 323 } else {
267 kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", 324 kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
268 cause, opc, badvaddr); 325 cause, opc, badvaddr);
269 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 326 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
270 ret = RESUME_HOST; 327 ret = RESUME_HOST;
271 er = EMULATE_FAIL;
272 } 328 }
273 return ret; 329 return ret;
274} 330}
@@ -428,16 +484,75 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
428 return ret; 484 return ret;
429} 485}
430 486
431static int kvm_trap_emul_vm_init(struct kvm *kvm) 487static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
432{ 488{
489 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
490 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
491
492 /*
493 * Allocate GVA -> HPA page tables.
494 * MIPS doesn't use the mm_struct pointer argument.
495 */
496 kern_mm->pgd = pgd_alloc(kern_mm);
497 if (!kern_mm->pgd)
498 return -ENOMEM;
499
500 user_mm->pgd = pgd_alloc(user_mm);
501 if (!user_mm->pgd) {
502 pgd_free(kern_mm, kern_mm->pgd);
503 return -ENOMEM;
504 }
505
433 return 0; 506 return 0;
434} 507}
435 508
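kvm_trap_emul_vcpu_init() now allocates one GVA to HPA page table root per guest address space (kernel and user) and unwinds the kernel root if the user allocation fails. The same unwind shape in plain C, with calloc() standing in for pgd_alloc():

#include <stdlib.h>

struct gva_space { void *kern_pgd, *user_pgd; };

/* Allocate both roots, or neither: free the kernel root if the user one fails. */
static int gva_space_init(struct gva_space *s, size_t pgd_size)
{
	s->kern_pgd = calloc(1, pgd_size);
	if (!s->kern_pgd)
		return -1;

	s->user_pgd = calloc(1, pgd_size);
	if (!s->user_pgd) {
		free(s->kern_pgd);
		s->kern_pgd = NULL;
		return -1;
	}
	return 0;
}

int main(void)
{
	struct gva_space s;

	if (gva_space_init(&s, 4096))
		return 1;
	free(s.kern_pgd);
	free(s.user_pgd);
	return 0;
}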
436static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) 509static void kvm_mips_emul_free_gva_pt(pgd_t *pgd)
437{ 510{
438 vcpu->arch.kscratch_enabled = 0xfc; 511 /* Don't free host kernel page tables copied from init_mm.pgd */
512 const unsigned long end = 0x80000000;
513 unsigned long pgd_va, pud_va, pmd_va;
514 pud_t *pud;
515 pmd_t *pmd;
516 pte_t *pte;
517 int i, j, k;
518
519 for (i = 0; i < USER_PTRS_PER_PGD; i++) {
520 if (pgd_none(pgd[i]))
521 continue;
522
523 pgd_va = (unsigned long)i << PGDIR_SHIFT;
524 if (pgd_va >= end)
525 break;
526 pud = pud_offset(pgd + i, 0);
527 for (j = 0; j < PTRS_PER_PUD; j++) {
528 if (pud_none(pud[j]))
529 continue;
530
531 pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT);
532 if (pud_va >= end)
533 break;
534 pmd = pmd_offset(pud + j, 0);
535 for (k = 0; k < PTRS_PER_PMD; k++) {
536 if (pmd_none(pmd[k]))
537 continue;
538
539 pmd_va = pud_va | (k << PMD_SHIFT);
540 if (pmd_va >= end)
541 break;
542 pte = pte_offset(pmd + k, 0);
543 pte_free_kernel(NULL, pte);
544 }
545 pmd_free(NULL, pmd);
546 }
547 pud_free(NULL, pud);
548 }
549 pgd_free(NULL, pgd);
550}
439 551
440 return 0; 552static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu)
553{
554 kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd);
555 kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd);
441} 556}
442 557
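kvm_mips_emul_free_gva_pt() tears the GVA page tables down level by level (pgd, pud, pmd, pte), stopping at the 0x80000000 boundary so the host kernel entries copied from init_mm.pgd are left alone. A compact user-space sketch of the same bottom-up walk over a two-level table, assuming tiny 4-entry directories purely to keep the demo short:

#include <stdlib.h>

#define DIR_ENTRIES 4

/* Free every leaf table reachable from the root, then the root itself. */
static void free_two_level(void **root)
{
	int i;

	for (i = 0; i < DIR_ENTRIES; i++) {
		if (!root[i])
			continue;           /* "pgd_none()": nothing mapped here */
		free(root[i]);              /* leaf table ("pte_free_kernel()") */
	}
	free(root);                         /* root table ("pgd_free()") */
}

int main(void)
{
	void **root = calloc(DIR_ENTRIES, sizeof(*root));

	root[1] = calloc(16, sizeof(long)); /* one populated slot */
	free_two_level(root);
	return 0;
}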
443static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) 558static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -499,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
499 /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */ 614 /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */
500 kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10)); 615 kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10));
501 616
617 /* Status */
618 kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL);
619
502 /* 620 /*
503 * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5) 621 * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5)
504 */ 622 */
@@ -508,17 +626,76 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
508 kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | 626 kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
509 (vcpu_id & MIPS_EBASE_CPUNUM)); 627 (vcpu_id & MIPS_EBASE_CPUNUM));
510 628
629 /* Put PC at guest reset vector */
630 vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000);
631
511 return 0; 632 return 0;
512} 633}
513 634
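vcpu_setup() now parks the PC on the guest reset vector via KVM_GUEST_CKSEG1ADDR(0x1fc00000). On real MIPS hardware that physical address is reached uncached through KSEG1 at 0xbfc00000; the KVM_GUEST_* macro performs the analogous mapping into the trap-and-emulate guest's own segment layout, whose base differs from hardware KSEG1. The classic hardware arithmetic, for reference:

#include <stdio.h>
#include <stdint.h>

#define CKSEG1          0xa0000000u    /* classic unmapped, uncached segment */
#define CKSEG1ADDR(pa)  ((pa) | CKSEG1)

int main(void)
{
	uint32_t reset_pa = 0x1fc00000u;

	/* Prints 0xbfc00000, the traditional MIPS reset vector. */
	printf("reset vector VA: 0x%08x\n", CKSEG1ADDR(reset_pa));
	return 0;
}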
635static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
636{
637 /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
638 kvm_flush_remote_tlbs(kvm);
639}
640
641static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
642 const struct kvm_memory_slot *slot)
643{
644 kvm_trap_emul_flush_shadow_all(kvm);
645}
646
647static u64 kvm_trap_emul_get_one_regs[] = {
648 KVM_REG_MIPS_CP0_INDEX,
649 KVM_REG_MIPS_CP0_ENTRYLO0,
650 KVM_REG_MIPS_CP0_ENTRYLO1,
651 KVM_REG_MIPS_CP0_CONTEXT,
652 KVM_REG_MIPS_CP0_USERLOCAL,
653 KVM_REG_MIPS_CP0_PAGEMASK,
654 KVM_REG_MIPS_CP0_WIRED,
655 KVM_REG_MIPS_CP0_HWRENA,
656 KVM_REG_MIPS_CP0_BADVADDR,
657 KVM_REG_MIPS_CP0_COUNT,
658 KVM_REG_MIPS_CP0_ENTRYHI,
659 KVM_REG_MIPS_CP0_COMPARE,
660 KVM_REG_MIPS_CP0_STATUS,
661 KVM_REG_MIPS_CP0_INTCTL,
662 KVM_REG_MIPS_CP0_CAUSE,
663 KVM_REG_MIPS_CP0_EPC,
664 KVM_REG_MIPS_CP0_PRID,
665 KVM_REG_MIPS_CP0_EBASE,
666 KVM_REG_MIPS_CP0_CONFIG,
667 KVM_REG_MIPS_CP0_CONFIG1,
668 KVM_REG_MIPS_CP0_CONFIG2,
669 KVM_REG_MIPS_CP0_CONFIG3,
670 KVM_REG_MIPS_CP0_CONFIG4,
671 KVM_REG_MIPS_CP0_CONFIG5,
672 KVM_REG_MIPS_CP0_CONFIG7,
673 KVM_REG_MIPS_CP0_ERROREPC,
674 KVM_REG_MIPS_CP0_KSCRATCH1,
675 KVM_REG_MIPS_CP0_KSCRATCH2,
676 KVM_REG_MIPS_CP0_KSCRATCH3,
677 KVM_REG_MIPS_CP0_KSCRATCH4,
678 KVM_REG_MIPS_CP0_KSCRATCH5,
679 KVM_REG_MIPS_CP0_KSCRATCH6,
680
681 KVM_REG_MIPS_COUNT_CTL,
682 KVM_REG_MIPS_COUNT_RESUME,
683 KVM_REG_MIPS_COUNT_HZ,
684};
685
514static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) 686static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
515{ 687{
516 return 0; 688 return ARRAY_SIZE(kvm_trap_emul_get_one_regs);
517} 689}
518 690
519static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, 691static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
520 u64 __user *indices) 692 u64 __user *indices)
521{ 693{
694 if (copy_to_user(indices, kvm_trap_emul_get_one_regs,
695 sizeof(kvm_trap_emul_get_one_regs)))
696 return -EFAULT;
697 indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs);
698
522 return 0; 699 return 0;
523} 700}
524 701
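kvm_trap_emul_num_regs() and kvm_trap_emul_copy_reg_indices() now drive the ONE_REG enumeration from a single static table: the count is ARRAY_SIZE() of the table and the identifiers are copied out to userspace in one call. A user-space analogue with hypothetical register IDs and memcpy() standing in for copy_to_user():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Hypothetical register IDs; the kernel uses KVM_REG_MIPS_CP0_* constants. */
static const uint64_t reg_ids[] = { 0x1001, 0x1002, 0x1003 };

static size_t num_regs(void)
{
	return ARRAY_SIZE(reg_ids);
}

static int copy_reg_indices(uint64_t *out)
{
	memcpy(out, reg_ids, sizeof(reg_ids));   /* copy_to_user() in the kernel */
	return 0;
}

int main(void)
{
	uint64_t buf[ARRAY_SIZE(reg_ids)];

	copy_reg_indices(buf);
	printf("%zu registers, first id 0x%llx\n",
	       num_regs(), (unsigned long long)buf[0]);
	return 0;
}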
@@ -526,7 +703,81 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
526 const struct kvm_one_reg *reg, 703 const struct kvm_one_reg *reg,
527 s64 *v) 704 s64 *v)
528{ 705{
706 struct mips_coproc *cop0 = vcpu->arch.cop0;
707
529 switch (reg->id) { 708 switch (reg->id) {
709 case KVM_REG_MIPS_CP0_INDEX:
710 *v = (long)kvm_read_c0_guest_index(cop0);
711 break;
712 case KVM_REG_MIPS_CP0_ENTRYLO0:
713 *v = kvm_read_c0_guest_entrylo0(cop0);
714 break;
715 case KVM_REG_MIPS_CP0_ENTRYLO1:
716 *v = kvm_read_c0_guest_entrylo1(cop0);
717 break;
718 case KVM_REG_MIPS_CP0_CONTEXT:
719 *v = (long)kvm_read_c0_guest_context(cop0);
720 break;
721 case KVM_REG_MIPS_CP0_USERLOCAL:
722 *v = (long)kvm_read_c0_guest_userlocal(cop0);
723 break;
724 case KVM_REG_MIPS_CP0_PAGEMASK:
725 *v = (long)kvm_read_c0_guest_pagemask(cop0);
726 break;
727 case KVM_REG_MIPS_CP0_WIRED:
728 *v = (long)kvm_read_c0_guest_wired(cop0);
729 break;
730 case KVM_REG_MIPS_CP0_HWRENA:
731 *v = (long)kvm_read_c0_guest_hwrena(cop0);
732 break;
733 case KVM_REG_MIPS_CP0_BADVADDR:
734 *v = (long)kvm_read_c0_guest_badvaddr(cop0);
735 break;
736 case KVM_REG_MIPS_CP0_ENTRYHI:
737 *v = (long)kvm_read_c0_guest_entryhi(cop0);
738 break;
739 case KVM_REG_MIPS_CP0_COMPARE:
740 *v = (long)kvm_read_c0_guest_compare(cop0);
741 break;
742 case KVM_REG_MIPS_CP0_STATUS:
743 *v = (long)kvm_read_c0_guest_status(cop0);
744 break;
745 case KVM_REG_MIPS_CP0_INTCTL:
746 *v = (long)kvm_read_c0_guest_intctl(cop0);
747 break;
748 case KVM_REG_MIPS_CP0_CAUSE:
749 *v = (long)kvm_read_c0_guest_cause(cop0);
750 break;
751 case KVM_REG_MIPS_CP0_EPC:
752 *v = (long)kvm_read_c0_guest_epc(cop0);
753 break;
754 case KVM_REG_MIPS_CP0_PRID:
755 *v = (long)kvm_read_c0_guest_prid(cop0);
756 break;
757 case KVM_REG_MIPS_CP0_EBASE:
758 *v = (long)kvm_read_c0_guest_ebase(cop0);
759 break;
760 case KVM_REG_MIPS_CP0_CONFIG:
761 *v = (long)kvm_read_c0_guest_config(cop0);
762 break;
763 case KVM_REG_MIPS_CP0_CONFIG1:
764 *v = (long)kvm_read_c0_guest_config1(cop0);
765 break;
766 case KVM_REG_MIPS_CP0_CONFIG2:
767 *v = (long)kvm_read_c0_guest_config2(cop0);
768 break;
769 case KVM_REG_MIPS_CP0_CONFIG3:
770 *v = (long)kvm_read_c0_guest_config3(cop0);
771 break;
772 case KVM_REG_MIPS_CP0_CONFIG4:
773 *v = (long)kvm_read_c0_guest_config4(cop0);
774 break;
775 case KVM_REG_MIPS_CP0_CONFIG5:
776 *v = (long)kvm_read_c0_guest_config5(cop0);
777 break;
778 case KVM_REG_MIPS_CP0_CONFIG7:
779 *v = (long)kvm_read_c0_guest_config7(cop0);
780 break;
530 case KVM_REG_MIPS_CP0_COUNT: 781 case KVM_REG_MIPS_CP0_COUNT:
531 *v = kvm_mips_read_count(vcpu); 782 *v = kvm_mips_read_count(vcpu);
532 break; 783 break;
@@ -539,6 +790,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
539 case KVM_REG_MIPS_COUNT_HZ: 790 case KVM_REG_MIPS_COUNT_HZ:
540 *v = vcpu->arch.count_hz; 791 *v = vcpu->arch.count_hz;
541 break; 792 break;
793 case KVM_REG_MIPS_CP0_ERROREPC:
794 *v = (long)kvm_read_c0_guest_errorepc(cop0);
795 break;
796 case KVM_REG_MIPS_CP0_KSCRATCH1:
797 *v = (long)kvm_read_c0_guest_kscratch1(cop0);
798 break;
799 case KVM_REG_MIPS_CP0_KSCRATCH2:
800 *v = (long)kvm_read_c0_guest_kscratch2(cop0);
801 break;
802 case KVM_REG_MIPS_CP0_KSCRATCH3:
803 *v = (long)kvm_read_c0_guest_kscratch3(cop0);
804 break;
805 case KVM_REG_MIPS_CP0_KSCRATCH4:
806 *v = (long)kvm_read_c0_guest_kscratch4(cop0);
807 break;
808 case KVM_REG_MIPS_CP0_KSCRATCH5:
809 *v = (long)kvm_read_c0_guest_kscratch5(cop0);
810 break;
811 case KVM_REG_MIPS_CP0_KSCRATCH6:
812 *v = (long)kvm_read_c0_guest_kscratch6(cop0);
813 break;
542 default: 814 default:
543 return -EINVAL; 815 return -EINVAL;
544 } 816 }
@@ -554,6 +826,56 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
554 unsigned int cur, change; 826 unsigned int cur, change;
555 827
556 switch (reg->id) { 828 switch (reg->id) {
829 case KVM_REG_MIPS_CP0_INDEX:
830 kvm_write_c0_guest_index(cop0, v);
831 break;
832 case KVM_REG_MIPS_CP0_ENTRYLO0:
833 kvm_write_c0_guest_entrylo0(cop0, v);
834 break;
835 case KVM_REG_MIPS_CP0_ENTRYLO1:
836 kvm_write_c0_guest_entrylo1(cop0, v);
837 break;
838 case KVM_REG_MIPS_CP0_CONTEXT:
839 kvm_write_c0_guest_context(cop0, v);
840 break;
841 case KVM_REG_MIPS_CP0_USERLOCAL:
842 kvm_write_c0_guest_userlocal(cop0, v);
843 break;
844 case KVM_REG_MIPS_CP0_PAGEMASK:
845 kvm_write_c0_guest_pagemask(cop0, v);
846 break;
847 case KVM_REG_MIPS_CP0_WIRED:
848 kvm_write_c0_guest_wired(cop0, v);
849 break;
850 case KVM_REG_MIPS_CP0_HWRENA:
851 kvm_write_c0_guest_hwrena(cop0, v);
852 break;
853 case KVM_REG_MIPS_CP0_BADVADDR:
854 kvm_write_c0_guest_badvaddr(cop0, v);
855 break;
856 case KVM_REG_MIPS_CP0_ENTRYHI:
857 kvm_write_c0_guest_entryhi(cop0, v);
858 break;
859 case KVM_REG_MIPS_CP0_STATUS:
860 kvm_write_c0_guest_status(cop0, v);
861 break;
862 case KVM_REG_MIPS_CP0_INTCTL:
863 /* No VInt, so no VS, read-only for now */
864 break;
865 case KVM_REG_MIPS_CP0_EPC:
866 kvm_write_c0_guest_epc(cop0, v);
867 break;
868 case KVM_REG_MIPS_CP0_PRID:
869 kvm_write_c0_guest_prid(cop0, v);
870 break;
871 case KVM_REG_MIPS_CP0_EBASE:
872 /*
873 * Allow core number to be written, but the exception base must
874 * remain in guest KSeg0.
875 */
876 kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM,
877 v);
878 break;
557 case KVM_REG_MIPS_CP0_COUNT: 879 case KVM_REG_MIPS_CP0_COUNT:
558 kvm_mips_write_count(vcpu, v); 880 kvm_mips_write_count(vcpu, v);
559 break; 881 break;
@@ -618,6 +940,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
618 kvm_write_c0_guest_config5(cop0, v); 940 kvm_write_c0_guest_config5(cop0, v);
619 } 941 }
620 break; 942 break;
943 case KVM_REG_MIPS_CP0_CONFIG7:
944 /* writes ignored */
945 break;
621 case KVM_REG_MIPS_COUNT_CTL: 946 case KVM_REG_MIPS_COUNT_CTL:
622 ret = kvm_mips_set_count_ctl(vcpu, v); 947 ret = kvm_mips_set_count_ctl(vcpu, v);
623 break; 948 break;
@@ -627,24 +952,269 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
627 case KVM_REG_MIPS_COUNT_HZ: 952 case KVM_REG_MIPS_COUNT_HZ:
628 ret = kvm_mips_set_count_hz(vcpu, v); 953 ret = kvm_mips_set_count_hz(vcpu, v);
629 break; 954 break;
955 case KVM_REG_MIPS_CP0_ERROREPC:
956 kvm_write_c0_guest_errorepc(cop0, v);
957 break;
958 case KVM_REG_MIPS_CP0_KSCRATCH1:
959 kvm_write_c0_guest_kscratch1(cop0, v);
960 break;
961 case KVM_REG_MIPS_CP0_KSCRATCH2:
962 kvm_write_c0_guest_kscratch2(cop0, v);
963 break;
964 case KVM_REG_MIPS_CP0_KSCRATCH3:
965 kvm_write_c0_guest_kscratch3(cop0, v);
966 break;
967 case KVM_REG_MIPS_CP0_KSCRATCH4:
968 kvm_write_c0_guest_kscratch4(cop0, v);
969 break;
970 case KVM_REG_MIPS_CP0_KSCRATCH5:
971 kvm_write_c0_guest_kscratch5(cop0, v);
972 break;
973 case KVM_REG_MIPS_CP0_KSCRATCH6:
974 kvm_write_c0_guest_kscratch6(cop0, v);
975 break;
630 default: 976 default:
631 return -EINVAL; 977 return -EINVAL;
632 } 978 }
633 return ret; 979 return ret;
634} 980}
635 981
636static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu) 982static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
637{ 983{
638 kvm_lose_fpu(vcpu); 984 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
985 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
986 struct mm_struct *mm;
987
988 /*
989 * Were we in guest context? If so, restore the appropriate ASID based
990 * on the mode of the Guest (Kernel/User).
991 */
992 if (current->flags & PF_VCPU) {
993 mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
994 if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
995 asid_version_mask(cpu))
996 get_new_mmu_context(mm, cpu);
997 write_c0_entryhi(cpu_asid(cpu, mm));
998 TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
999 kvm_mips_suspend_mm(cpu);
1000 ehb();
1001 }
639 1002
640 return 0; 1003 return 0;
641} 1004}
642 1005
643static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu) 1006static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
644{ 1007{
1008 kvm_lose_fpu(vcpu);
1009
1010 if (current->flags & PF_VCPU) {
1011 /* Restore normal Linux process memory map */
1012 if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
1013 asid_version_mask(cpu)))
1014 get_new_mmu_context(current->mm, cpu);
1015 write_c0_entryhi(cpu_asid(cpu, current->mm));
1016 TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
1017 kvm_mips_resume_mm(cpu);
1018 ehb();
1019 }
1020
645 return 0; 1021 return 0;
646} 1022}
647 1023
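vcpu_load() and vcpu_put() both hinge on the same staleness test: an mm's cached ASID is reusable only while its generation (version) bits match the per-CPU asid_cache, so (cpu_context ^ asid_cache) & asid_version_mask is nonzero exactly when a fresh ASID must be taken from get_new_mmu_context(). A self-contained model of that check, assuming 8 ASID bits purely for the demo (the real width is CPU dependent):

#include <stdint.h>
#include <stdio.h>

#define ASID_BITS          8
#define ASID_MASK          ((1ull << ASID_BITS) - 1)
#define ASID_VERSION_MASK  (~ASID_MASK)
#define ASID_FIRST_VERSION (1ull << ASID_BITS)

static uint64_t asid_cache = ASID_FIRST_VERSION; /* per-CPU: version | next ASID */

static int asid_is_stale(uint64_t mm_context)
{
	return ((mm_context ^ asid_cache) & ASID_VERSION_MASK) != 0;
}

/* Simplified get_new_mmu_context(): hand out the next ASID; when the low bits
 * wrap, the carry advances the version field and every older context goes
 * stale (the kernel flushes the TLB at that point). */
static uint64_t new_context(void)
{
	return ++asid_cache;
}

int main(void)
{
	uint64_t mm_ctx = new_context();

	printf("stale now?   %d\n", asid_is_stale(mm_ctx));  /* 0 */
	asid_cache += ASID_FIRST_VERSION;                    /* simulate a generation bump */
	printf("after bump?  %d\n", asid_is_stale(mm_ctx));  /* 1 */
	return 0;
}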
1024static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu,
1025 bool reload_asid)
1026{
1027 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
1028 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
1029 struct mm_struct *mm;
1030 int i;
1031
1032 if (likely(!vcpu->requests))
1033 return;
1034
1035 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
1036 /*
1037 * Both kernel & user GVA mappings must be invalidated. The
1038 * caller is just about to check whether the ASID is stale
1039 * anyway so no need to reload it here.
1040 */
1041 kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN);
1042 kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER);
1043 for_each_possible_cpu(i) {
1044 cpu_context(i, kern_mm) = 0;
1045 cpu_context(i, user_mm) = 0;
1046 }
1047
1048 /* Generate new ASID for current mode */
1049 if (reload_asid) {
1050 mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
1051 get_new_mmu_context(mm, cpu);
1052 htw_stop();
1053 write_c0_entryhi(cpu_asid(cpu, mm));
1054 TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
1055 htw_start();
1056 }
1057 }
1058}
1059
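kvm_trap_emul_check_requests() only acts when the KVM_REQ_TLB_FLUSH bit is atomically observed and cleared, so a flush queued by another CPU is performed exactly once by whichever path (vcpu_reenter or gva_lockless_begin) notices it first. A rough stand-alone model of that test-and-clear step using C11 atomics (the kernel's kvm_check_request() additionally provides the required barriers):

#include <stdatomic.h>
#include <stdio.h>

#define REQ_TLB_FLUSH (1u << 0)

static atomic_uint requests;

/* Roughly kvm_make_request(): publish a request bit for the vCPU to notice. */
static void make_request(unsigned int req)
{
	atomic_fetch_or(&requests, req);
}

/* Roughly kvm_check_request(): consume the bit at most once. */
static int check_request(unsigned int req)
{
	if (!(atomic_load(&requests) & req))
		return 0;
	atomic_fetch_and(&requests, ~req);
	return 1;
}

int main(void)
{
	make_request(REQ_TLB_FLUSH);
	printf("%d %d\n", check_request(REQ_TLB_FLUSH),   /* 1: flush once */
			  check_request(REQ_TLB_FLUSH));  /* 0: already handled */
	return 0;
}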
1060/**
1061 * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space.
1062 * @vcpu: VCPU pointer.
1063 *
1064 * Call before a GVA space access outside of guest mode, to ensure that
1065 * asynchronous TLB flush requests are handled or delayed until completion of
1066 * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()).
1067 *
1068 * Should be called with IRQs already enabled.
1069 */
1070void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu)
1071{
1072 /* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */
1073 WARN_ON_ONCE(irqs_disabled());
1074
1075 /*
1076 * The caller is about to access the GVA space, so we set the mode to
1077 * force TLB flush requests to send an IPI, and also disable IRQs to
1078 * delay IPI handling until kvm_trap_emul_gva_lockless_end().
1079 */
1080 local_irq_disable();
1081
1082 /*
1083 * Make sure the read of VCPU requests is not reordered ahead of the
1084 * write to vcpu->mode, or we could miss a TLB flush request while
1085 * the requester sees the VCPU as outside of guest mode and not needing
1086 * an IPI.
1087 */
1088 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1089
1090 /*
1091 * If a TLB flush has been requested (potentially while
1092 * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it
1093 * before accessing the GVA space, and be sure to reload the ASID if
1094 * necessary as it'll be immediately used.
1095 *
1096 * TLB flush requests after this check will trigger an IPI due to the
1097 * mode change above, which will be delayed due to IRQs disabled.
1098 */
1099 kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true);
1100}
1101
1102/**
1103 * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space.
1104 * @vcpu: VCPU pointer.
1105 *
1106 * Called after a GVA space access outside of guest mode. Should have a matching
1107 * call to kvm_trap_emul_gva_lockless_begin().
1108 */
1109void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu)
1110{
1111 /*
1112 * Make sure the write to vcpu->mode is not reordered in front of GVA
1113 * accesses, or a TLB flush requester may not think it necessary to send
1114 * an IPI.
1115 */
1116 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1117
1118 /*
 1119 * Now that the access to GVA space is complete, it's safe for pending
1120 * TLB flush request IPIs to be handled (which indicates completion).
1121 */
1122 local_irq_enable();
1123}
1124
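The begin/end pair relies on two different orderings: begin must make the vcpu->mode write visible before the subsequent read of vcpu->requests (a store followed by a full barrier, smp_store_mb), while end only has to keep the mode write after the preceding GVA accesses (a release store). A rough mapping of those two primitives onto C11 atomics, leaving out the IRQ handling:

#include <stdatomic.h>

enum vcpu_mode { OUTSIDE_GUEST_MODE, READING_SHADOW_PAGE_TABLES };

static _Atomic int mode = OUTSIDE_GUEST_MODE;

/* smp_store_mb(): a store followed by a full memory barrier, so the later
 * load of the request bits cannot be hoisted above the mode change. */
static void lockless_begin(void)
{
	atomic_store_explicit(&mode, READING_SHADOW_PAGE_TABLES,
			      memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	/* ...now it is safe to check for pending TLB flush requests... */
}

/* smp_store_release(): earlier GVA accesses may not be reordered after this
 * store, so a requester that sees OUTSIDE_GUEST_MODE knows the access is done. */
static void lockless_end(void)
{
	atomic_store_explicit(&mode, OUTSIDE_GUEST_MODE, memory_order_release);
}

int main(void)
{
	lockless_begin();
	lockless_end();
	return 0;
}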
1125static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run,
1126 struct kvm_vcpu *vcpu)
1127{
1128 struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
1129 struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
1130 struct mm_struct *mm;
1131 struct mips_coproc *cop0 = vcpu->arch.cop0;
1132 int i, cpu = smp_processor_id();
1133 unsigned int gasid;
1134
1135 /*
1136 * No need to reload ASID, IRQs are disabled already so there's no rush,
1137 * and we'll check if we need to regenerate below anyway before
1138 * re-entering the guest.
1139 */
1140 kvm_trap_emul_check_requests(vcpu, cpu, false);
1141
1142 if (KVM_GUEST_KERNEL_MODE(vcpu)) {
1143 mm = kern_mm;
1144 } else {
1145 mm = user_mm;
1146
1147 /*
1148 * Lazy host ASID regeneration / PT flush for guest user mode.
1149 * If the guest ASID has changed since the last guest usermode
1150 * execution, invalidate the stale TLB entries and flush GVA PT
1151 * entries too.
1152 */
1153 gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
1154 if (gasid != vcpu->arch.last_user_gasid) {
1155 kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER);
1156 for_each_possible_cpu(i)
1157 cpu_context(i, user_mm) = 0;
1158 vcpu->arch.last_user_gasid = gasid;
1159 }
1160 }
1161
1162 /*
1163 * Check if ASID is stale. This may happen due to a TLB flush request or
1164 * a lazy user MM invalidation.
1165 */
1166 if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
1167 asid_version_mask(cpu))
1168 get_new_mmu_context(mm, cpu);
1169}
1170
1171static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
1172{
1173 int cpu = smp_processor_id();
1174 int r;
1175
1176 /* Check if we have any exceptions/interrupts pending */
1177 kvm_mips_deliver_interrupts(vcpu,
1178 kvm_read_c0_guest_cause(vcpu->arch.cop0));
1179
1180 kvm_trap_emul_vcpu_reenter(run, vcpu);
1181
1182 /*
1183 * We use user accessors to access guest memory, but we don't want to
1184 * invoke Linux page faulting.
1185 */
1186 pagefault_disable();
1187
1188 /* Disable hardware page table walking while in guest */
1189 htw_stop();
1190
1191 /*
1192 * While in guest context we're in the guest's address space, not the
1193 * host process address space, so we need to be careful not to confuse
1194 * e.g. cache management IPIs.
1195 */
1196 kvm_mips_suspend_mm(cpu);
1197
1198 r = vcpu->arch.vcpu_run(run, vcpu);
1199
1200 /* We may have migrated while handling guest exits */
1201 cpu = smp_processor_id();
1202
1203 /* Restore normal Linux process memory map */
1204 if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
1205 asid_version_mask(cpu)))
1206 get_new_mmu_context(current->mm, cpu);
1207 write_c0_entryhi(cpu_asid(cpu, current->mm));
1208 TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
1209 kvm_mips_resume_mm(cpu);
1210
1211 htw_start();
1212
1213 pagefault_enable();
1214
1215 return r;
1216}
1217
648static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { 1218static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
649 /* exit handlers */ 1219 /* exit handlers */
650 .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable, 1220 .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable,
@@ -661,9 +1231,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
661 .handle_fpe = kvm_trap_emul_handle_fpe, 1231 .handle_fpe = kvm_trap_emul_handle_fpe,
662 .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, 1232 .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
663 1233
664 .vm_init = kvm_trap_emul_vm_init,
665 .vcpu_init = kvm_trap_emul_vcpu_init, 1234 .vcpu_init = kvm_trap_emul_vcpu_init,
1235 .vcpu_uninit = kvm_trap_emul_vcpu_uninit,
666 .vcpu_setup = kvm_trap_emul_vcpu_setup, 1236 .vcpu_setup = kvm_trap_emul_vcpu_setup,
1237 .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
1238 .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
667 .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, 1239 .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
668 .queue_timer_int = kvm_mips_queue_timer_int_cb, 1240 .queue_timer_int = kvm_mips_queue_timer_int_cb,
669 .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, 1241 .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
@@ -675,8 +1247,10 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
675 .copy_reg_indices = kvm_trap_emul_copy_reg_indices, 1247 .copy_reg_indices = kvm_trap_emul_copy_reg_indices,
676 .get_one_reg = kvm_trap_emul_get_one_reg, 1248 .get_one_reg = kvm_trap_emul_get_one_reg,
677 .set_one_reg = kvm_trap_emul_set_one_reg, 1249 .set_one_reg = kvm_trap_emul_set_one_reg,
678 .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, 1250 .vcpu_load = kvm_trap_emul_vcpu_load,
679 .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs, 1251 .vcpu_put = kvm_trap_emul_vcpu_put,
1252 .vcpu_run = kvm_trap_emul_vcpu_run,
1253 .vcpu_reenter = kvm_trap_emul_vcpu_reenter,
680}; 1254};
681 1255
682int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) 1256int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0db010cc4e65..d9b48f5bb606 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -22,6 +22,10 @@
22 22
23#include <asm/book3s/64/mmu-hash.h> 23#include <asm/book3s/64/mmu-hash.h>
24 24
25/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
26#define PPC_MIN_HPT_ORDER 18
27#define PPC_MAX_HPT_ORDER 46
28
25#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 29#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
26static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) 30static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
27{ 31{
@@ -356,6 +360,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
356 360
357extern void kvmhv_rm_send_ipi(int cpu); 361extern void kvmhv_rm_send_ipi(int cpu);
358 362
363static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt)
364{
365 /* HPTEs are 2**4 bytes long */
366 return 1UL << (hpt->order - 4);
367}
368
369static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
370{
371 /* 128 (2**7) bytes in each HPTEG */
372 return (1UL << (hpt->order - 7)) - 1;
373}
374
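The two helpers above derive the HPT geometry straight from the order: each HPTE is 16 bytes, so a 2^order byte table holds 2^(order-4) entries, and each 128-byte HPTEG holds eight of them, giving a hash mask of 2^(order-7) - 1. A quick check of that arithmetic at and between the documented limits (order 18 is the 256 KiB minimum, order 46 the 64 TiB maximum):

#include <stdio.h>

static unsigned long long hpt_npte(unsigned int order)
{
	return 1ULL << (order - 4);          /* HPTEs are 2**4 bytes long */
}

static unsigned long long hpt_mask(unsigned int order)
{
	return (1ULL << (order - 7)) - 1;    /* 128 (2**7) bytes in each HPTEG */
}

int main(void)
{
	unsigned int orders[] = { 18, 24, 46 };

	for (unsigned int i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned int o = orders[i];

		printf("order %u: %llu bytes, %llu HPTEs, hash mask 0x%llx\n",
		       o, 1ULL << o, hpt_npte(o), hpt_mask(o));
	}
	return 0;
}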
359#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 375#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
360 376
361#endif /* __ASM_KVM_BOOK3S_64_H__ */ 377#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b2dbeac3f450..7bba8f415627 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -241,12 +241,24 @@ struct kvm_arch_memory_slot {
241#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 241#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
242}; 242};
243 243
244struct kvm_hpt_info {
245 /* Host virtual (linear mapping) address of guest HPT */
246 unsigned long virt;
247 /* Array of reverse mapping entries for each guest HPTE */
248 struct revmap_entry *rev;
249 /* Guest HPT size is 2**(order) bytes */
250 u32 order;
251 /* 1 if HPT allocated with CMA, 0 otherwise */
252 int cma;
253};
254
255struct kvm_resize_hpt;
256
244struct kvm_arch { 257struct kvm_arch {
245 unsigned int lpid; 258 unsigned int lpid;
246#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 259#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
247 unsigned int tlb_sets; 260 unsigned int tlb_sets;
248 unsigned long hpt_virt; 261 struct kvm_hpt_info hpt;
249 struct revmap_entry *revmap;
250 atomic64_t mmio_update; 262 atomic64_t mmio_update;
251 unsigned int host_lpid; 263 unsigned int host_lpid;
252 unsigned long host_lpcr; 264 unsigned long host_lpcr;
@@ -256,20 +268,17 @@ struct kvm_arch {
256 unsigned long lpcr; 268 unsigned long lpcr;
257 unsigned long vrma_slb_v; 269 unsigned long vrma_slb_v;
258 int hpte_setup_done; 270 int hpte_setup_done;
259 u32 hpt_order;
260 atomic_t vcpus_running; 271 atomic_t vcpus_running;
261 u32 online_vcores; 272 u32 online_vcores;
262 unsigned long hpt_npte;
263 unsigned long hpt_mask;
264 atomic_t hpte_mod_interest; 273 atomic_t hpte_mod_interest;
265 cpumask_t need_tlb_flush; 274 cpumask_t need_tlb_flush;
266 cpumask_t cpu_in_guest; 275 cpumask_t cpu_in_guest;
267 int hpt_cma_alloc;
268 u8 radix; 276 u8 radix;
269 pgd_t *pgtable; 277 pgd_t *pgtable;
270 u64 process_table; 278 u64 process_table;
271 struct dentry *debugfs_dir; 279 struct dentry *debugfs_dir;
272 struct dentry *htab_dentry; 280 struct dentry *htab_dentry;
281 struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
273#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ 282#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
274#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE 283#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
275 struct mutex hpt_mutex; 284 struct mutex hpt_mutex;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 48c760f89590..dd11c4c8c56a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
155extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); 155extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
156extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); 156extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
157 157
158extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); 158extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
159extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); 159extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
160extern void kvmppc_free_hpt(struct kvm *kvm); 160extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
161extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
161extern long kvmppc_prepare_vrma(struct kvm *kvm, 162extern long kvmppc_prepare_vrma(struct kvm *kvm,
162 struct kvm_userspace_memory_region *mem); 163 struct kvm_userspace_memory_region *mem);
163extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, 164extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -186,8 +187,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
186 unsigned long tce_value, unsigned long npages); 187 unsigned long tce_value, unsigned long npages);
187extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, 188extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
188 unsigned long ioba); 189 unsigned long ioba);
189extern struct page *kvm_alloc_hpt(unsigned long nr_pages); 190extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages);
190extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); 191extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
191extern int kvmppc_core_init_vm(struct kvm *kvm); 192extern int kvmppc_core_init_vm(struct kvm *kvm);
192extern void kvmppc_core_destroy_vm(struct kvm *kvm); 193extern void kvmppc_core_destroy_vm(struct kvm *kvm);
193extern void kvmppc_core_free_memslot(struct kvm *kvm, 194extern void kvmppc_core_free_memslot(struct kvm *kvm,
@@ -214,6 +215,10 @@ extern void kvmppc_bookehv_exit(void);
214extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); 215extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
215 216
216extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); 217extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
218extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
219 struct kvm_ppc_resize_hpt *rhpt);
220extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
221 struct kvm_ppc_resize_hpt *rhpt);
217 222
218int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); 223int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
219 224
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index cc0908b6c2a0..4edbe4bb0e8b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -633,5 +633,7 @@ struct kvm_ppc_rmmu_info {
633#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) 633#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40)
634#define KVM_XICS_MASKED (1ULL << 41) 634#define KVM_XICS_MASKED (1ULL << 41)
635#define KVM_XICS_PENDING (1ULL << 42) 635#define KVM_XICS_PENDING (1ULL << 42)
636#define KVM_XICS_PRESENTED (1ULL << 43)
637#define KVM_XICS_QUEUED (1ULL << 44)
636 638
637#endif /* __LINUX_KVM_POWERPC_H */ 639#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index a2eb6d354a57..1992676c7a94 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -224,7 +224,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
224 ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary); 224 ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
225 225
226 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { 226 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
227 printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp); 227 printk_ratelimited(KERN_ERR
228 "KVM: Can't copy data from 0x%lx!\n", ptegp);
228 goto no_page_found; 229 goto no_page_found;
229 } 230 }
230 231
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b9131aa1aedf..70153578131a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -265,7 +265,8 @@ do_second:
265 goto no_page_found; 265 goto no_page_found;
266 266
267 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { 267 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
268 printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); 268 printk_ratelimited(KERN_ERR
269 "KVM: Can't copy data from 0x%lx!\n", ptegp);
269 goto no_page_found; 270 goto no_page_found;
270 } 271 }
271 272
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9df3d940acec..f3158fb16de3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -40,84 +40,101 @@
40 40
41#include "trace_hv.h" 41#include "trace_hv.h"
42 42
43/* Power architecture requires HPT is at least 256kB */ 43//#define DEBUG_RESIZE_HPT 1
44#define PPC_MIN_HPT_ORDER 18 44
45#ifdef DEBUG_RESIZE_HPT
46#define resize_hpt_debug(resize, ...) \
47 do { \
48 printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \
49 printk(__VA_ARGS__); \
50 } while (0)
51#else
52#define resize_hpt_debug(resize, ...) \
53 do { } while (0)
54#endif
45 55
46static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 56static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
47 long pte_index, unsigned long pteh, 57 long pte_index, unsigned long pteh,
48 unsigned long ptel, unsigned long *pte_idx_ret); 58 unsigned long ptel, unsigned long *pte_idx_ret);
59
60struct kvm_resize_hpt {
61 /* These fields read-only after init */
62 struct kvm *kvm;
63 struct work_struct work;
64 u32 order;
65
66 /* These fields protected by kvm->lock */
67 int error;
68 bool prepare_done;
69
70 /* Private to the work thread, until prepare_done is true,
71 * then protected by kvm->resize_hpt_sem */
72 struct kvm_hpt_info hpt;
73};
74
49static void kvmppc_rmap_reset(struct kvm *kvm); 75static void kvmppc_rmap_reset(struct kvm *kvm);
50 76
51long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 77int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
52{ 78{
53 unsigned long hpt = 0; 79 unsigned long hpt = 0;
54 struct revmap_entry *rev; 80 int cma = 0;
55 struct page *page = NULL; 81 struct page *page = NULL;
56 long order = KVM_DEFAULT_HPT_ORDER; 82 struct revmap_entry *rev;
83 unsigned long npte;
57 84
58 if (htab_orderp) { 85 if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
59 order = *htab_orderp; 86 return -EINVAL;
60 if (order < PPC_MIN_HPT_ORDER)
61 order = PPC_MIN_HPT_ORDER;
62 }
63 87
64 kvm->arch.hpt_cma_alloc = 0; 88 page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
65 page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
66 if (page) { 89 if (page) {
67 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); 90 hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
68 memset((void *)hpt, 0, (1ul << order)); 91 memset((void *)hpt, 0, (1ul << order));
69 kvm->arch.hpt_cma_alloc = 1; 92 cma = 1;
70 } 93 }
71 94
72 /* Lastly try successively smaller sizes from the page allocator */ 95 if (!hpt)
73 /* Only do this if userspace didn't specify a size via ioctl */ 96 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT
74 while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { 97 |__GFP_NOWARN, order - PAGE_SHIFT);
75 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
76 __GFP_NOWARN, order - PAGE_SHIFT);
77 if (!hpt)
78 --order;
79 }
80 98
81 if (!hpt) 99 if (!hpt)
82 return -ENOMEM; 100 return -ENOMEM;
83 101
84 kvm->arch.hpt_virt = hpt;
85 kvm->arch.hpt_order = order;
86 /* HPTEs are 2**4 bytes long */ 102 /* HPTEs are 2**4 bytes long */
87 kvm->arch.hpt_npte = 1ul << (order - 4); 103 npte = 1ul << (order - 4);
88 /* 128 (2**7) bytes in each HPTEG */
89 kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
90
91 atomic64_set(&kvm->arch.mmio_update, 0);
92 104
93 /* Allocate reverse map array */ 105 /* Allocate reverse map array */
94 rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); 106 rev = vmalloc(sizeof(struct revmap_entry) * npte);
95 if (!rev) { 107 if (!rev) {
96 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); 108 pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
97 goto out_freehpt; 109 if (cma)
110 kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
111 else
112 free_pages(hpt, order - PAGE_SHIFT);
113 return -ENOMEM;
98 } 114 }
99 kvm->arch.revmap = rev;
100 kvm->arch.sdr1 = __pa(hpt) | (order - 18);
101 115
102 pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", 116 info->order = order;
103 hpt, order, kvm->arch.lpid); 117 info->virt = hpt;
118 info->cma = cma;
119 info->rev = rev;
104 120
105 if (htab_orderp)
106 *htab_orderp = order;
107 return 0; 121 return 0;
122}
108 123
109 out_freehpt: 124void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
110 if (kvm->arch.hpt_cma_alloc) 125{
111 kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); 126 atomic64_set(&kvm->arch.mmio_update, 0);
112 else 127 kvm->arch.hpt = *info;
113 free_pages(hpt, order - PAGE_SHIFT); 128 kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
114 return -ENOMEM; 129
130 pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
131 info->virt, (long)info->order, kvm->arch.lpid);
115} 132}
116 133
117long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) 134long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
118{ 135{
119 long err = -EBUSY; 136 long err = -EBUSY;
120 long order; 137 struct kvm_hpt_info info;
121 138
122 if (kvm_is_radix(kvm)) 139 if (kvm_is_radix(kvm))
123 return -EINVAL; 140 return -EINVAL;
@@ -132,36 +149,44 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
132 goto out; 149 goto out;
133 } 150 }
134 } 151 }
135 if (kvm->arch.hpt_virt) { 152 if (kvm->arch.hpt.order == order) {
136 order = kvm->arch.hpt_order; 153 /* We already have a suitable HPT */
154
137 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 155 /* Set the entire HPT to 0, i.e. invalid HPTEs */
138 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 156 memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
139 /* 157 /*
140 * Reset all the reverse-mapping chains for all memslots 158 * Reset all the reverse-mapping chains for all memslots
141 */ 159 */
142 kvmppc_rmap_reset(kvm); 160 kvmppc_rmap_reset(kvm);
143 /* Ensure that each vcpu will flush its TLB on next entry. */ 161 /* Ensure that each vcpu will flush its TLB on next entry. */
144 cpumask_setall(&kvm->arch.need_tlb_flush); 162 cpumask_setall(&kvm->arch.need_tlb_flush);
145 *htab_orderp = order;
146 err = 0; 163 err = 0;
147 } else { 164 goto out;
148 err = kvmppc_alloc_hpt(kvm, htab_orderp);
149 order = *htab_orderp;
150 } 165 }
151 out: 166
167 if (kvm->arch.hpt.virt)
168 kvmppc_free_hpt(&kvm->arch.hpt);
169
170 err = kvmppc_allocate_hpt(&info, order);
171 if (err < 0)
172 goto out;
173 kvmppc_set_hpt(kvm, &info);
174
175out:
152 mutex_unlock(&kvm->lock); 176 mutex_unlock(&kvm->lock);
153 return err; 177 return err;
154} 178}
155 179
156void kvmppc_free_hpt(struct kvm *kvm) 180void kvmppc_free_hpt(struct kvm_hpt_info *info)
157{ 181{
158 vfree(kvm->arch.revmap); 182 vfree(info->rev);
159 if (kvm->arch.hpt_cma_alloc) 183 if (info->cma)
160 kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), 184 kvm_free_hpt_cma(virt_to_page(info->virt),
161 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); 185 1 << (info->order - PAGE_SHIFT));
162 else if (kvm->arch.hpt_virt) 186 else if (info->virt)
163 free_pages(kvm->arch.hpt_virt, 187 free_pages(info->virt, info->order - PAGE_SHIFT);
164 kvm->arch.hpt_order - PAGE_SHIFT); 188 info->virt = 0;
189 info->order = 0;
165} 190}
166 191
167/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ 192/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -196,8 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
196 if (npages > 1ul << (40 - porder)) 221 if (npages > 1ul << (40 - porder))
197 npages = 1ul << (40 - porder); 222 npages = 1ul << (40 - porder);
198 /* Can't use more than 1 HPTE per HPTEG */ 223 /* Can't use more than 1 HPTE per HPTEG */
199 if (npages > kvm->arch.hpt_mask + 1) 224 if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
200 npages = kvm->arch.hpt_mask + 1; 225 npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
201 226
202 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | 227 hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
203 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); 228 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -207,7 +232,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
207 for (i = 0; i < npages; ++i) { 232 for (i = 0; i < npages; ++i) {
208 addr = i << porder; 233 addr = i << porder;
209 /* can't use hpt_hash since va > 64 bits */ 234 /* can't use hpt_hash since va > 64 bits */
210 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; 235 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
236 & kvmppc_hpt_mask(&kvm->arch.hpt);
211 /* 237 /*
212 * We assume that the hash table is empty and no 238 * We assume that the hash table is empty and no
213 * vcpus are using it at this stage. Since we create 239 * vcpus are using it at this stage. Since we create
@@ -340,11 +366,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
340 preempt_enable(); 366 preempt_enable();
341 return -ENOENT; 367 return -ENOENT;
342 } 368 }
343 hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); 369 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
344 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; 370 v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
345 if (cpu_has_feature(CPU_FTR_ARCH_300)) 371 if (cpu_has_feature(CPU_FTR_ARCH_300))
346 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); 372 v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
347 gr = kvm->arch.revmap[index].guest_rpte; 373 gr = kvm->arch.hpt.rev[index].guest_rpte;
348 374
349 unlock_hpte(hptep, orig_v); 375 unlock_hpte(hptep, orig_v);
350 preempt_enable(); 376 preempt_enable();
@@ -485,8 +511,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
485 } 511 }
486 } 512 }
487 index = vcpu->arch.pgfault_index; 513 index = vcpu->arch.pgfault_index;
488 hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); 514 hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
489 rev = &kvm->arch.revmap[index]; 515 rev = &kvm->arch.hpt.rev[index];
490 preempt_disable(); 516 preempt_disable();
491 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) 517 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
492 cpu_relax(); 518 cpu_relax();
@@ -745,13 +771,53 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
745 return kvm_handle_hva_range(kvm, hva, hva + 1, handler); 771 return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
746} 772}
747 773
774/* Must be called with both HPTE and rmap locked */
775static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
776 unsigned long *rmapp, unsigned long gfn)
777{
778 __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
779 struct revmap_entry *rev = kvm->arch.hpt.rev;
780 unsigned long j, h;
781 unsigned long ptel, psize, rcbits;
782
783 j = rev[i].forw;
784 if (j == i) {
785 /* chain is now empty */
786 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
787 } else {
788 /* remove i from chain */
789 h = rev[i].back;
790 rev[h].forw = j;
791 rev[j].back = h;
792 rev[i].forw = rev[i].back = i;
793 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
794 }
795
796 /* Now check and modify the HPTE */
797 ptel = rev[i].guest_rpte;
798 psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
799 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
800 hpte_rpn(ptel, psize) == gfn) {
801 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
802 kvmppc_invalidate_hpte(kvm, hptep, i);
803 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
804 /* Harvest R and C */
805 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
806 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
807 if (rcbits & HPTE_R_C)
808 kvmppc_update_rmap_change(rmapp, psize);
809 if (rcbits & ~rev[i].guest_rpte) {
810 rev[i].guest_rpte = ptel | rcbits;
811 note_hpte_modification(kvm, &rev[i]);
812 }
813 }
814}
815
748static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 816static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
749 unsigned long gfn) 817 unsigned long gfn)
750{ 818{
751 struct revmap_entry *rev = kvm->arch.revmap; 819 unsigned long i;
752 unsigned long h, i, j;
753 __be64 *hptep; 820 __be64 *hptep;
754 unsigned long ptel, psize, rcbits;
755 unsigned long *rmapp; 821 unsigned long *rmapp;
756 822
757 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; 823 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
@@ -768,7 +834,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
768 * rmap chain lock. 834 * rmap chain lock.
769 */ 835 */
770 i = *rmapp & KVMPPC_RMAP_INDEX; 836 i = *rmapp & KVMPPC_RMAP_INDEX;
771 hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); 837 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
772 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { 838 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
773 /* unlock rmap before spinning on the HPTE lock */ 839 /* unlock rmap before spinning on the HPTE lock */
774 unlock_rmap(rmapp); 840 unlock_rmap(rmapp);
@@ -776,37 +842,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
776 cpu_relax(); 842 cpu_relax();
777 continue; 843 continue;
778 } 844 }
779 j = rev[i].forw;
780 if (j == i) {
781 /* chain is now empty */
782 *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
783 } else {
784 /* remove i from chain */
785 h = rev[i].back;
786 rev[h].forw = j;
787 rev[j].back = h;
788 rev[i].forw = rev[i].back = i;
789 *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
790 }
791 845
792 /* Now check and modify the HPTE */ 846 kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
793 ptel = rev[i].guest_rpte;
794 psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
795 if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
796 hpte_rpn(ptel, psize) == gfn) {
797 hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
798 kvmppc_invalidate_hpte(kvm, hptep, i);
799 hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
800 /* Harvest R and C */
801 rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
802 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
803 if (rcbits & HPTE_R_C)
804 kvmppc_update_rmap_change(rmapp, psize);
805 if (rcbits & ~rev[i].guest_rpte) {
806 rev[i].guest_rpte = ptel | rcbits;
807 note_hpte_modification(kvm, &rev[i]);
808 }
809 }
810 unlock_rmap(rmapp); 847 unlock_rmap(rmapp);
811 __unlock_hpte(hptep, be64_to_cpu(hptep[0])); 848 __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
812 } 849 }
@@ -860,7 +897,7 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
860static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 897static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
861 unsigned long gfn) 898 unsigned long gfn)
862{ 899{
863 struct revmap_entry *rev = kvm->arch.revmap; 900 struct revmap_entry *rev = kvm->arch.hpt.rev;
864 unsigned long head, i, j; 901 unsigned long head, i, j;
865 __be64 *hptep; 902 __be64 *hptep;
866 int ret = 0; 903 int ret = 0;
@@ -880,7 +917,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
880 917
881 i = head = *rmapp & KVMPPC_RMAP_INDEX; 918 i = head = *rmapp & KVMPPC_RMAP_INDEX;
882 do { 919 do {
883 hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); 920 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
884 j = rev[i].forw; 921 j = rev[i].forw;
885 922
886 /* If this HPTE isn't referenced, ignore it */ 923 /* If this HPTE isn't referenced, ignore it */
@@ -923,7 +960,7 @@ int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
923static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, 960static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
924 unsigned long gfn) 961 unsigned long gfn)
925{ 962{
926 struct revmap_entry *rev = kvm->arch.revmap; 963 struct revmap_entry *rev = kvm->arch.hpt.rev;
927 unsigned long head, i, j; 964 unsigned long head, i, j;
928 unsigned long *hp; 965 unsigned long *hp;
929 int ret = 1; 966 int ret = 1;
@@ -940,7 +977,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
940 if (*rmapp & KVMPPC_RMAP_PRESENT) { 977 if (*rmapp & KVMPPC_RMAP_PRESENT) {
941 i = head = *rmapp & KVMPPC_RMAP_INDEX; 978 i = head = *rmapp & KVMPPC_RMAP_INDEX;
942 do { 979 do {
943 hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); 980 hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
944 j = rev[i].forw; 981 j = rev[i].forw;
945 if (be64_to_cpu(hp[1]) & HPTE_R_R) 982 if (be64_to_cpu(hp[1]) & HPTE_R_R)
946 goto out; 983 goto out;
@@ -980,7 +1017,7 @@ static int vcpus_running(struct kvm *kvm)
980 */ 1017 */
981static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) 1018static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
982{ 1019{
983 struct revmap_entry *rev = kvm->arch.revmap; 1020 struct revmap_entry *rev = kvm->arch.hpt.rev;
984 unsigned long head, i, j; 1021 unsigned long head, i, j;
985 unsigned long n; 1022 unsigned long n;
986 unsigned long v, r; 1023 unsigned long v, r;
@@ -1005,7 +1042,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1005 i = head = *rmapp & KVMPPC_RMAP_INDEX; 1042 i = head = *rmapp & KVMPPC_RMAP_INDEX;
1006 do { 1043 do {
1007 unsigned long hptep1; 1044 unsigned long hptep1;
1008 hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); 1045 hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
1009 j = rev[i].forw; 1046 j = rev[i].forw;
1010 1047
1011 /* 1048 /*
@@ -1172,6 +1209,363 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1172} 1209}
1173 1210
1174/* 1211/*
1212 * HPT resizing
1213 */
1214static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
1215{
1216 int rc;
1217
1218 rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
1219 if (rc < 0)
1220 return rc;
1221
1222 resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
1223 resize->hpt.virt);
1224
1225 return 0;
1226}
1227
1228static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1229 unsigned long idx)
1230{
1231 struct kvm *kvm = resize->kvm;
1232 struct kvm_hpt_info *old = &kvm->arch.hpt;
1233 struct kvm_hpt_info *new = &resize->hpt;
1234 unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
1235 unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
1236 __be64 *hptep, *new_hptep;
1237 unsigned long vpte, rpte, guest_rpte;
1238 int ret;
1239 struct revmap_entry *rev;
1240 unsigned long apsize, psize, avpn, pteg, hash;
1241 unsigned long new_idx, new_pteg, replace_vpte;
1242
1243 hptep = (__be64 *)(old->virt + (idx << 4));
1244
1245 /* Guest is stopped, so new HPTEs can't be added or faulted
1246 * in, only unmapped or altered by host actions. So, it's
1247 * safe to check this before we take the HPTE lock */
1248 vpte = be64_to_cpu(hptep[0]);
1249 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1250 return 0; /* nothing to do */
1251
1252 while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
1253 cpu_relax();
1254
1255 vpte = be64_to_cpu(hptep[0]);
1256
1257 ret = 0;
1258 if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1259 /* Nothing to do */
1260 goto out;
1261
1262 /* Unmap */
1263 rev = &old->rev[idx];
1264 guest_rpte = rev->guest_rpte;
1265
1266 ret = -EIO;
1267 apsize = hpte_page_size(vpte, guest_rpte);
1268 if (!apsize)
1269 goto out;
1270
1271 if (vpte & HPTE_V_VALID) {
1272 unsigned long gfn = hpte_rpn(guest_rpte, apsize);
1273 int srcu_idx = srcu_read_lock(&kvm->srcu);
1274 struct kvm_memory_slot *memslot =
1275 __gfn_to_memslot(kvm_memslots(kvm), gfn);
1276
1277 if (memslot) {
1278 unsigned long *rmapp;
1279 rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1280
1281 lock_rmap(rmapp);
1282 kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
1283 unlock_rmap(rmapp);
1284 }
1285
1286 srcu_read_unlock(&kvm->srcu, srcu_idx);
1287 }
1288
1289 /* Reload PTE after unmap */
1290 vpte = be64_to_cpu(hptep[0]);
1291
1292 BUG_ON(vpte & HPTE_V_VALID);
1293 BUG_ON(!(vpte & HPTE_V_ABSENT));
1294
1295 ret = 0;
1296 if (!(vpte & HPTE_V_BOLTED))
1297 goto out;
1298
1299 rpte = be64_to_cpu(hptep[1]);
1300 psize = hpte_base_page_size(vpte, rpte);
1301 avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
1302 pteg = idx / HPTES_PER_GROUP;
1303 if (vpte & HPTE_V_SECONDARY)
1304 pteg = ~pteg;
1305
1306 if (!(vpte & HPTE_V_1TB_SEG)) {
1307 unsigned long offset, vsid;
1308
1309 /* We only have 28 - 23 bits of offset in avpn */
1310 offset = (avpn & 0x1f) << 23;
1311 vsid = avpn >> 5;
1312 /* We can find more bits from the pteg value */
1313 if (psize < (1ULL << 23))
1314 offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
1315
1316 hash = vsid ^ (offset / psize);
1317 } else {
1318 unsigned long offset, vsid;
1319
1320 /* We only have 40 - 23 bits of seg_off in avpn */
1321 offset = (avpn & 0x1ffff) << 23;
1322 vsid = avpn >> 17;
1323 if (psize < (1ULL << 23))
1324 offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
1325
1326 hash = vsid ^ (vsid << 25) ^ (offset / psize);
1327 }
1328
1329 new_pteg = hash & new_hash_mask;
1330 if (vpte & HPTE_V_SECONDARY) {
1331 BUG_ON(~pteg != (hash & old_hash_mask));
1332 new_pteg = ~new_pteg;
1333 } else {
1334 BUG_ON(pteg != (hash & old_hash_mask));
1335 }
1336
1337 new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
1338 new_hptep = (__be64 *)(new->virt + (new_idx << 4));
1339
1340 replace_vpte = be64_to_cpu(new_hptep[0]);
1341
1342 if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1343 BUG_ON(new->order >= old->order);
1344
1345 if (replace_vpte & HPTE_V_BOLTED) {
1346 if (vpte & HPTE_V_BOLTED)
1347 /* Bolted collision, nothing we can do */
1348 ret = -ENOSPC;
1349 /* Discard the new HPTE */
1350 goto out;
1351 }
1352
1353 /* Discard the previous HPTE */
1354 }
1355
1356 new_hptep[1] = cpu_to_be64(rpte);
1357 new->rev[new_idx].guest_rpte = guest_rpte;
1358 /* No need for a barrier, since new HPT isn't active */
1359 new_hptep[0] = cpu_to_be64(vpte);
1360 unlock_hpte(new_hptep, vpte);
1361
1362out:
1363 unlock_hpte(hptep, vpte);
1364 return ret;
1365}
1366
1367static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
1368{
1369 struct kvm *kvm = resize->kvm;
1370 unsigned long i;
1371 int rc;
1372
1373 /*
1374 * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
1375 * that POWER9 uses, and could well hit a BUG_ON on POWER9.
1376 */
1377 if (cpu_has_feature(CPU_FTR_ARCH_300))
1378 return -EIO;
1379 for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
1380 rc = resize_hpt_rehash_hpte(resize, i);
1381 if (rc != 0)
1382 return rc;
1383 }
1384
1385 return 0;
1386}
1387
1388static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
1389{
1390 struct kvm *kvm = resize->kvm;
1391 struct kvm_hpt_info hpt_tmp;
1392
1393 /* Exchange the pending tables in the resize structure with
1394 * the active tables */
1395
1396 resize_hpt_debug(resize, "resize_hpt_pivot()\n");
1397
1398 spin_lock(&kvm->mmu_lock);
1399 asm volatile("ptesync" : : : "memory");
1400
1401 hpt_tmp = kvm->arch.hpt;
1402 kvmppc_set_hpt(kvm, &resize->hpt);
1403 resize->hpt = hpt_tmp;
1404
1405 spin_unlock(&kvm->mmu_lock);
1406
1407 synchronize_srcu_expedited(&kvm->srcu);
1408
1409 resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
1410}
1411
1412static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
1413{
1414 BUG_ON(kvm->arch.resize_hpt != resize);
1415
1416 if (!resize)
1417 return;
1418
1419 if (resize->hpt.virt)
1420 kvmppc_free_hpt(&resize->hpt);
1421
1422 kvm->arch.resize_hpt = NULL;
1423 kfree(resize);
1424}
1425
1426static void resize_hpt_prepare_work(struct work_struct *work)
1427{
1428 struct kvm_resize_hpt *resize = container_of(work,
1429 struct kvm_resize_hpt,
1430 work);
1431 struct kvm *kvm = resize->kvm;
1432 int err;
1433
1434 resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
1435 resize->order);
1436
1437 err = resize_hpt_allocate(resize);
1438
1439 mutex_lock(&kvm->lock);
1440
1441 resize->error = err;
1442 resize->prepare_done = true;
1443
1444 mutex_unlock(&kvm->lock);
1445}
1446
1447long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
1448 struct kvm_ppc_resize_hpt *rhpt)
1449{
1450 unsigned long flags = rhpt->flags;
1451 unsigned long shift = rhpt->shift;
1452 struct kvm_resize_hpt *resize;
1453 int ret;
1454
1455 if (flags != 0)
1456 return -EINVAL;
1457
1458 if (shift && ((shift < 18) || (shift > 46)))
1459 return -EINVAL;
1460
1461 mutex_lock(&kvm->lock);
1462
1463 resize = kvm->arch.resize_hpt;
1464
1465 if (resize) {
1466 if (resize->order == shift) {
1467 /* Suitable resize in progress */
1468 if (resize->prepare_done) {
1469 ret = resize->error;
1470 if (ret != 0)
1471 resize_hpt_release(kvm, resize);
1472 } else {
1473 ret = 100; /* estimated time in ms */
1474 }
1475
1476 goto out;
1477 }
1478
1479 /* not suitable, cancel it */
1480 resize_hpt_release(kvm, resize);
1481 }
1482
1483 ret = 0;
1484 if (!shift)
1485 goto out; /* nothing to do */
1486
1487 /* start new resize */
1488
1489 resize = kzalloc(sizeof(*resize), GFP_KERNEL);
1490 resize->order = shift;
1491 resize->kvm = kvm;
1492 INIT_WORK(&resize->work, resize_hpt_prepare_work);
1493 kvm->arch.resize_hpt = resize;
1494
1495 schedule_work(&resize->work);
1496
1497 ret = 100; /* estimated time in ms */
1498
1499out:
1500 mutex_unlock(&kvm->lock);
1501 return ret;
1502}
1503
1504static void resize_hpt_boot_vcpu(void *opaque)
1505{
1506 /* Nothing to do, just force a KVM exit */
1507}
1508
1509long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1510 struct kvm_ppc_resize_hpt *rhpt)
1511{
1512 unsigned long flags = rhpt->flags;
1513 unsigned long shift = rhpt->shift;
1514 struct kvm_resize_hpt *resize;
1515 long ret;
1516
1517 if (flags != 0)
1518 return -EINVAL;
1519
1520 if (shift && ((shift < 18) || (shift > 46)))
1521 return -EINVAL;
1522
1523 mutex_lock(&kvm->lock);
1524
1525 resize = kvm->arch.resize_hpt;
1526
1527 /* This shouldn't be possible */
1528 ret = -EIO;
1529 if (WARN_ON(!kvm->arch.hpte_setup_done))
1530 goto out_no_hpt;
1531
1532 /* Stop VCPUs from running while we mess with the HPT */
1533 kvm->arch.hpte_setup_done = 0;
1534 smp_mb();
1535
1536 /* Boot all CPUs out of the guest so they re-read
1537 * hpte_setup_done */
1538 on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
1539
1540 ret = -ENXIO;
1541 if (!resize || (resize->order != shift))
1542 goto out;
1543
1544 ret = -EBUSY;
1545 if (!resize->prepare_done)
1546 goto out;
1547
1548 ret = resize->error;
1549 if (ret != 0)
1550 goto out;
1551
1552 ret = resize_hpt_rehash(resize);
1553 if (ret != 0)
1554 goto out;
1555
1556 resize_hpt_pivot(resize);
1557
1558out:
1559 /* Let VCPUs run again */
1560 kvm->arch.hpte_setup_done = 1;
1561 smp_mb();
1562out_no_hpt:
1563 resize_hpt_release(kvm, resize);
1564 mutex_unlock(&kvm->lock);
1565 return ret;
1566}
1567
1568/*
1175 * Functions for reading and writing the hash table via reads and 1569 * Functions for reading and writing the hash table via reads and
1176 * writes on a file descriptor. 1570 * writes on a file descriptor.
1177 * 1571 *
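Conceptually, resize_hpt_rehash_hpte() above recovers the hash from the AVPN and the old group index, then re-truncates that hash with the new table's mask to pick the destination group, keeping the slot position within the group. A minimal standalone sketch of just that index arithmetic for a primary-group, 256MB-segment entry follows (8 HPTEs of 16 bytes per group; the helper name is invented, and the secondary-hash and 1TB-segment cases are left out):

#include <stdio.h>

#define HPTES_PER_GROUP 8

/* Illustration only: given the hash recovered from an entry, map its
 * index in the old HPT to the corresponding index in a new HPT of
 * 2^new_order bytes. A table of 2^order bytes has 2^(order - 7)
 * groups of 8 HPTEs, which is where the mask below comes from. */
static unsigned long rehash_index(unsigned long idx, unsigned long hash,
				  int new_order)
{
	unsigned long new_hash_mask = (1UL << (new_order - 7)) - 1;
	unsigned long new_pteg = hash & new_hash_mask;

	/* Keep the slot within the group, as resize_hpt_rehash_hpte() does. */
	return new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
}

int main(void)
{
	/* e.g. entry 0x1234 whose recovered hash is 0xabcdef, moving
	 * into a 2^27-byte (128 MiB) table */
	printf("new index: 0x%lx\n", rehash_index(0x1234, 0xabcdefUL, 27));
	return 0;
}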
@@ -1311,8 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1311 flags = ctx->flags; 1705 flags = ctx->flags;
1312 1706
1313 i = ctx->index; 1707 i = ctx->index;
1314 hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1708 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1315 revp = kvm->arch.revmap + i; 1709 revp = kvm->arch.hpt.rev + i;
1316 lbuf = (unsigned long __user *)buf; 1710 lbuf = (unsigned long __user *)buf;
1317 1711
1318 nb = 0; 1712 nb = 0;
@@ -1327,7 +1721,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1327 1721
1328 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1722 /* Skip uninteresting entries, i.e. clean on not-first pass */
1329 if (!first_pass) { 1723 if (!first_pass) {
1330 while (i < kvm->arch.hpt_npte && 1724 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1331 !hpte_dirty(revp, hptp)) { 1725 !hpte_dirty(revp, hptp)) {
1332 ++i; 1726 ++i;
1333 hptp += 2; 1727 hptp += 2;
@@ -1337,7 +1731,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1337 hdr.index = i; 1731 hdr.index = i;
1338 1732
1339 /* Grab a series of valid entries */ 1733 /* Grab a series of valid entries */
1340 while (i < kvm->arch.hpt_npte && 1734 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1341 hdr.n_valid < 0xffff && 1735 hdr.n_valid < 0xffff &&
1342 nb + HPTE_SIZE < count && 1736 nb + HPTE_SIZE < count &&
1343 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1737 record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
@@ -1353,7 +1747,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1353 ++revp; 1747 ++revp;
1354 } 1748 }
1355 /* Now skip invalid entries while we can */ 1749 /* Now skip invalid entries while we can */
1356 while (i < kvm->arch.hpt_npte && 1750 while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1357 hdr.n_invalid < 0xffff && 1751 hdr.n_invalid < 0xffff &&
1358 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1752 record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1359 /* found an invalid entry */ 1753 /* found an invalid entry */
@@ -1374,7 +1768,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1374 } 1768 }
1375 1769
1376 /* Check if we've wrapped around the hash table */ 1770 /* Check if we've wrapped around the hash table */
1377 if (i >= kvm->arch.hpt_npte) { 1771 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
1378 i = 0; 1772 i = 0;
1379 ctx->first_pass = 0; 1773 ctx->first_pass = 0;
1380 break; 1774 break;
@@ -1433,11 +1827,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1433 1827
1434 err = -EINVAL; 1828 err = -EINVAL;
1435 i = hdr.index; 1829 i = hdr.index;
1436 if (i >= kvm->arch.hpt_npte || 1830 if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
1437 i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) 1831 i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
1438 break; 1832 break;
1439 1833
1440 hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1834 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1441 lbuf = (unsigned long __user *)buf; 1835 lbuf = (unsigned long __user *)buf;
1442 for (j = 0; j < hdr.n_valid; ++j) { 1836 for (j = 0; j < hdr.n_valid; ++j) {
1443 __be64 hpte_v; 1837 __be64 hpte_v;
@@ -1624,8 +2018,9 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
1624 2018
1625 kvm = p->kvm; 2019 kvm = p->kvm;
1626 i = p->hpt_index; 2020 i = p->hpt_index;
1627 hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 2021 hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1628 for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { 2022 for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
2023 ++i, hptp += 2) {
1629 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) 2024 if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
1630 continue; 2025 continue;
1631 2026
@@ -1635,7 +2030,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
1635 cpu_relax(); 2030 cpu_relax();
1636 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; 2031 v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
1637 hr = be64_to_cpu(hptp[1]); 2032 hr = be64_to_cpu(hptp[1]);
1638 gr = kvm->arch.revmap[i].guest_rpte; 2033 gr = kvm->arch.hpt.rev[i].guest_rpte;
1639 unlock_hpte(hptp, v); 2034 unlock_hpte(hptp, v);
1640 preempt_enable(); 2035 preempt_enable();
1641 2036
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c379ff5a4438..491c5d8120f7 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -171,6 +171,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
171 goto fail; 171 goto fail;
172 } 172 }
173 173
174 ret = -ENOMEM;
174 stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), 175 stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
175 GFP_KERNEL); 176 GFP_KERNEL);
176 if (!stt) 177 if (!stt)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4a79679342e..1e107ece4e37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
182 ++vcpu->stat.halt_wakeup; 182 ++vcpu->stat.halt_wakeup;
183 } 183 }
184 184
185 if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) 185 cpu = READ_ONCE(vcpu->arch.thread_cpu);
186 if (cpu >= 0 && kvmppc_ipi_thread(cpu))
186 return; 187 return;
187 188
188 /* CPU points to the first thread of the core */ 189 /* CPU points to the first thread of the core */
@@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
773 } 774 }
774 tvcpu->arch.prodded = 1; 775 tvcpu->arch.prodded = 1;
775 smp_mb(); 776 smp_mb();
776 if (vcpu->arch.ceded) { 777 if (tvcpu->arch.ceded)
777 if (swait_active(&vcpu->wq)) { 778 kvmppc_fast_vcpu_kick_hv(tvcpu);
778 swake_up(&vcpu->wq);
779 vcpu->stat.halt_wakeup++;
780 }
781 }
782 break; 779 break;
783 case H_CONFER: 780 case H_CONFER:
784 target = kvmppc_get_gpr(vcpu, 4); 781 target = kvmppc_get_gpr(vcpu, 4);
@@ -2665,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
2665 int i; 2662 int i;
2666 2663
2667 for_each_runnable_thread(i, vcpu, vc) { 2664 for_each_runnable_thread(i, vcpu, vc) {
2668 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) 2665 if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
2666 vcpu->arch.prodded)
2669 return 1; 2667 return 1;
2670 } 2668 }
2671 2669
@@ -2851,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2851 break; 2849 break;
2852 n_ceded = 0; 2850 n_ceded = 0;
2853 for_each_runnable_thread(i, v, vc) { 2851 for_each_runnable_thread(i, v, vc) {
2854 if (!v->arch.pending_exceptions) 2852 if (!v->arch.pending_exceptions && !v->arch.prodded)
2855 n_ceded += v->arch.ceded; 2853 n_ceded += v->arch.ceded;
2856 else 2854 else
2857 v->arch.ceded = 0; 2855 v->arch.ceded = 0;
@@ -3199,12 +3197,23 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
3199 goto out; /* another vcpu beat us to it */ 3197 goto out; /* another vcpu beat us to it */
3200 3198
3201 /* Allocate hashed page table (if not done already) and reset it */ 3199 /* Allocate hashed page table (if not done already) and reset it */
3202 if (!kvm->arch.hpt_virt) { 3200 if (!kvm->arch.hpt.virt) {
3203 err = kvmppc_alloc_hpt(kvm, NULL); 3201 int order = KVM_DEFAULT_HPT_ORDER;
3204 if (err) { 3202 struct kvm_hpt_info info;
3203
3204 err = kvmppc_allocate_hpt(&info, order);
3205 /* If we get here, it means userspace didn't specify a
3206 * size explicitly. So, try successively smaller
3207 * sizes if the default failed. */
3208 while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
3209 err = kvmppc_allocate_hpt(&info, order);
3210
3211 if (err < 0) {
3205 pr_err("KVM: Couldn't alloc HPT\n"); 3212 pr_err("KVM: Couldn't alloc HPT\n");
3206 goto out; 3213 goto out;
3207 } 3214 }
3215
3216 kvmppc_set_hpt(kvm, &info);
3208 } 3217 }
3209 3218
3210 /* Look up the memslot for guest physical address 0 */ 3219 /* Look up the memslot for guest physical address 0 */
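The HPT "order" used throughout this series is simply log2 of the table size in bytes, so the fallback loop above walks down through progressively smaller tables until an allocation succeeds. A small illustrative helper showing what a given order implies, assuming the usual layout of 16-byte HPTEs grouped 8 to a PTEG (this is not the kernel's kvmppc_hpt_npte()/kvmppc_hpt_mask(), just the same arithmetic):

#include <stdio.h>

/* Illustration only: size, entry count and hash mask for an HPT order. */
static void describe_hpt_order(int order)
{
	unsigned long bytes = 1UL << order;
	unsigned long npte  = bytes >> 4;	/* 16 bytes per HPTE */
	unsigned long mask  = (npte >> 3) - 1;	/* 8 HPTEs per group */

	printf("order %2d: %8lu KiB, %9lu HPTEs, hash mask 0x%lx\n",
	       order, bytes >> 10, npte, mask);
}

int main(void)
{
	for (int order = 18; order <= 28; order++)	/* a few plausible sizes */
		describe_hpt_order(order);
	return 0;
}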
@@ -3413,6 +3422,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3413 3422
3414 kvm->arch.lpcr = lpcr; 3423 kvm->arch.lpcr = lpcr;
3415 3424
3425 /* Initialization for future HPT resizes */
3426 kvm->arch.resize_hpt = NULL;
3427
3416 /* 3428 /*
3417 * Work out how many sets the TLB has, for the use of 3429 * Work out how many sets the TLB has, for the use of
3418 * the TLB invalidation loop in book3s_hv_rmhandlers.S. 3430 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
@@ -3469,7 +3481,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
3469 if (kvm_is_radix(kvm)) 3481 if (kvm_is_radix(kvm))
3470 kvmppc_free_radix(kvm); 3482 kvmppc_free_radix(kvm);
3471 else 3483 else
3472 kvmppc_free_hpt(kvm); 3484 kvmppc_free_hpt(&kvm->arch.hpt);
3473 3485
3474 kvmppc_free_pimap(kvm); 3486 kvmppc_free_pimap(kvm);
3475} 3487}
@@ -3695,12 +3707,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
3695 r = -EFAULT; 3707 r = -EFAULT;
3696 if (get_user(htab_order, (u32 __user *)argp)) 3708 if (get_user(htab_order, (u32 __user *)argp))
3697 break; 3709 break;
3698 r = kvmppc_alloc_reset_hpt(kvm, &htab_order); 3710 r = kvmppc_alloc_reset_hpt(kvm, htab_order);
3699 if (r) 3711 if (r)
3700 break; 3712 break;
3701 r = -EFAULT;
3702 if (put_user(htab_order, (u32 __user *)argp))
3703 break;
3704 r = 0; 3713 r = 0;
3705 break; 3714 break;
3706 } 3715 }
@@ -3715,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
3715 break; 3724 break;
3716 } 3725 }
3717 3726
3727 case KVM_PPC_RESIZE_HPT_PREPARE: {
3728 struct kvm_ppc_resize_hpt rhpt;
3729
3730 r = -EFAULT;
3731 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
3732 break;
3733
3734 r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
3735 break;
3736 }
3737
3738 case KVM_PPC_RESIZE_HPT_COMMIT: {
3739 struct kvm_ppc_resize_hpt rhpt;
3740
3741 r = -EFAULT;
3742 if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
3743 break;
3744
3745 r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
3746 break;
3747 }
3748
3718 default: 3749 default:
3719 r = -ENOTTY; 3750 r = -ENOTTY;
3720 } 3751 }
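Together with the handlers added earlier in this series, these two cases give userspace a prepare/poll/commit protocol: KVM_PPC_RESIZE_HPT_PREPARE starts (or polls) allocation of the pending HPT and returns a positive estimate in milliseconds while work is outstanding, zero once the new table is ready, or a negative error; KVM_PPC_RESIZE_HPT_COMMIT then rehashes into the new table and pivots the guest over. A rough userspace sketch, assuming a VM file descriptor and the uapi definitions from this series, with error handling kept minimal:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustration only: resize a guest's HPT to 1 << shift bytes
 * (shift must be 0, to cancel, or in the 18..46 range). */
static int resize_hpt(int vm_fd, __u32 shift)
{
	struct kvm_ppc_resize_hpt rhpt;
	int ret;

	memset(&rhpt, 0, sizeof(rhpt));
	rhpt.shift = shift;		/* flags must currently be zero */

	/* Poll PREPARE until the pending HPT has been allocated. */
	do {
		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
		if (ret > 0)
			usleep(ret * 1000);	/* estimate in ms */
	} while (ret > 0);

	if (ret < 0) {
		perror("KVM_PPC_RESIZE_HPT_PREPARE");
		return ret;
	}

	/* Rehash into the new table and switch the guest to it. */
	ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
	if (ret < 0)
		perror("KVM_PPC_RESIZE_HPT_COMMIT");
	return ret;
}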
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 2f69fbc19bb0..c42a7e63b39e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -52,19 +52,19 @@ static int __init early_parse_kvm_cma_resv(char *p)
52} 52}
53early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); 53early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
54 54
55struct page *kvm_alloc_hpt(unsigned long nr_pages) 55struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
56{ 56{
57 VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); 57 VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
58 58
59 return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); 59 return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
60} 60}
61EXPORT_SYMBOL_GPL(kvm_alloc_hpt); 61EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
62 62
63void kvm_release_hpt(struct page *page, unsigned long nr_pages) 63void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
64{ 64{
65 cma_release(kvm_cma, page, nr_pages); 65 cma_release(kvm_cma, page, nr_pages);
66} 66}
67EXPORT_SYMBOL_GPL(kvm_release_hpt); 67EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
68 68
69/** 69/**
70 * kvm_cma_reserve() - reserve area for kvm hash pagetable 70 * kvm_cma_reserve() - reserve area for kvm hash pagetable
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index b095afcd4309..6fca970373ee 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -86,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
86 86
87 if (*rmap & KVMPPC_RMAP_PRESENT) { 87 if (*rmap & KVMPPC_RMAP_PRESENT) {
88 i = *rmap & KVMPPC_RMAP_INDEX; 88 i = *rmap & KVMPPC_RMAP_INDEX;
89 head = &kvm->arch.revmap[i]; 89 head = &kvm->arch.hpt.rev[i];
90 if (realmode) 90 if (realmode)
91 head = real_vmalloc_addr(head); 91 head = real_vmalloc_addr(head);
92 tail = &kvm->arch.revmap[head->back]; 92 tail = &kvm->arch.hpt.rev[head->back];
93 if (realmode) 93 if (realmode)
94 tail = real_vmalloc_addr(tail); 94 tail = real_vmalloc_addr(tail);
95 rev->forw = i; 95 rev->forw = i;
@@ -154,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
154 lock_rmap(rmap); 154 lock_rmap(rmap);
155 155
156 head = *rmap & KVMPPC_RMAP_INDEX; 156 head = *rmap & KVMPPC_RMAP_INDEX;
157 next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); 157 next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]);
158 prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); 158 prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]);
159 next->back = rev->back; 159 next->back = rev->back;
160 prev->forw = rev->forw; 160 prev->forw = rev->forw;
161 if (head == pte_index) { 161 if (head == pte_index) {
@@ -292,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
292 292
293 /* Find and lock the HPTEG slot to use */ 293 /* Find and lock the HPTEG slot to use */
294 do_insert: 294 do_insert:
295 if (pte_index >= kvm->arch.hpt_npte) 295 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
296 return H_PARAMETER; 296 return H_PARAMETER;
297 if (likely((flags & H_EXACT) == 0)) { 297 if (likely((flags & H_EXACT) == 0)) {
298 pte_index &= ~7UL; 298 pte_index &= ~7UL;
299 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 299 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
300 for (i = 0; i < 8; ++i) { 300 for (i = 0; i < 8; ++i) {
301 if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && 301 if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
302 try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | 302 try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
@@ -327,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
327 } 327 }
328 pte_index += i; 328 pte_index += i;
329 } else { 329 } else {
330 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 330 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
331 if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | 331 if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
332 HPTE_V_ABSENT)) { 332 HPTE_V_ABSENT)) {
333 /* Lock the slot and check again */ 333 /* Lock the slot and check again */
@@ -344,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
344 } 344 }
345 345
346 /* Save away the guest's idea of the second HPTE dword */ 346 /* Save away the guest's idea of the second HPTE dword */
347 rev = &kvm->arch.revmap[pte_index]; 347 rev = &kvm->arch.hpt.rev[pte_index];
348 if (realmode) 348 if (realmode)
349 rev = real_vmalloc_addr(rev); 349 rev = real_vmalloc_addr(rev);
350 if (rev) { 350 if (rev) {
@@ -469,9 +469,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
469 469
470 if (kvm_is_radix(kvm)) 470 if (kvm_is_radix(kvm))
471 return H_FUNCTION; 471 return H_FUNCTION;
472 if (pte_index >= kvm->arch.hpt_npte) 472 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
473 return H_PARAMETER; 473 return H_PARAMETER;
474 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 474 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
475 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 475 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
476 cpu_relax(); 476 cpu_relax();
477 pte = orig_pte = be64_to_cpu(hpte[0]); 477 pte = orig_pte = be64_to_cpu(hpte[0]);
@@ -487,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
487 return H_NOT_FOUND; 487 return H_NOT_FOUND;
488 } 488 }
489 489
490 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 490 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
491 v = pte & ~HPTE_V_HVLOCK; 491 v = pte & ~HPTE_V_HVLOCK;
492 if (v & HPTE_V_VALID) { 492 if (v & HPTE_V_VALID) {
493 hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); 493 hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
@@ -557,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
557 break; 557 break;
558 } 558 }
559 if (req != 1 || flags == 3 || 559 if (req != 1 || flags == 3 ||
560 pte_index >= kvm->arch.hpt_npte) { 560 pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
561 /* parameter error */ 561 /* parameter error */
562 args[j] = ((0xa0 | flags) << 56) + pte_index; 562 args[j] = ((0xa0 | flags) << 56) + pte_index;
563 ret = H_PARAMETER; 563 ret = H_PARAMETER;
564 break; 564 break;
565 } 565 }
566 hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); 566 hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4));
567 /* to avoid deadlock, don't spin except for first */ 567 /* to avoid deadlock, don't spin except for first */
568 if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { 568 if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
569 if (n) 569 if (n)
@@ -600,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
600 } 600 }
601 601
602 args[j] = ((0x80 | flags) << 56) + pte_index; 602 args[j] = ((0x80 | flags) << 56) + pte_index;
603 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 603 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
604 note_hpte_modification(kvm, rev); 604 note_hpte_modification(kvm, rev);
605 605
606 if (!(hp0 & HPTE_V_VALID)) { 606 if (!(hp0 & HPTE_V_VALID)) {
@@ -657,10 +657,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
657 657
658 if (kvm_is_radix(kvm)) 658 if (kvm_is_radix(kvm))
659 return H_FUNCTION; 659 return H_FUNCTION;
660 if (pte_index >= kvm->arch.hpt_npte) 660 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
661 return H_PARAMETER; 661 return H_PARAMETER;
662 662
663 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 663 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
664 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 664 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
665 cpu_relax(); 665 cpu_relax();
666 v = pte_v = be64_to_cpu(hpte[0]); 666 v = pte_v = be64_to_cpu(hpte[0]);
@@ -680,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
680 /* Update guest view of 2nd HPTE dword */ 680 /* Update guest view of 2nd HPTE dword */
681 mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | 681 mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
682 HPTE_R_KEY_HI | HPTE_R_KEY_LO; 682 HPTE_R_KEY_HI | HPTE_R_KEY_LO;
683 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 683 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
684 if (rev) { 684 if (rev) {
685 r = (rev->guest_rpte & ~mask) | bits; 685 r = (rev->guest_rpte & ~mask) | bits;
686 rev->guest_rpte = r; 686 rev->guest_rpte = r;
@@ -728,15 +728,15 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
728 728
729 if (kvm_is_radix(kvm)) 729 if (kvm_is_radix(kvm))
730 return H_FUNCTION; 730 return H_FUNCTION;
731 if (pte_index >= kvm->arch.hpt_npte) 731 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
732 return H_PARAMETER; 732 return H_PARAMETER;
733 if (flags & H_READ_4) { 733 if (flags & H_READ_4) {
734 pte_index &= ~3; 734 pte_index &= ~3;
735 n = 4; 735 n = 4;
736 } 736 }
737 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 737 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
738 for (i = 0; i < n; ++i, ++pte_index) { 738 for (i = 0; i < n; ++i, ++pte_index) {
739 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 739 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
740 v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; 740 v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
741 r = be64_to_cpu(hpte[1]); 741 r = be64_to_cpu(hpte[1]);
742 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 742 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
@@ -769,11 +769,11 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
769 769
770 if (kvm_is_radix(kvm)) 770 if (kvm_is_radix(kvm))
771 return H_FUNCTION; 771 return H_FUNCTION;
772 if (pte_index >= kvm->arch.hpt_npte) 772 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
773 return H_PARAMETER; 773 return H_PARAMETER;
774 774
775 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 775 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
776 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 776 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
777 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 777 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
778 cpu_relax(); 778 cpu_relax();
779 v = be64_to_cpu(hpte[0]); 779 v = be64_to_cpu(hpte[0]);
@@ -817,11 +817,11 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
817 817
818 if (kvm_is_radix(kvm)) 818 if (kvm_is_radix(kvm))
819 return H_FUNCTION; 819 return H_FUNCTION;
820 if (pte_index >= kvm->arch.hpt_npte) 820 if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
821 return H_PARAMETER; 821 return H_PARAMETER;
822 822
823 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 823 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
824 hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); 824 hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
825 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 825 while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
826 cpu_relax(); 826 cpu_relax();
827 v = be64_to_cpu(hpte[0]); 827 v = be64_to_cpu(hpte[0]);
@@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
970 somask = (1UL << 28) - 1; 970 somask = (1UL << 28) - 1;
971 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; 971 vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
972 } 972 }
973 hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; 973 hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt);
974 avpn = slb_v & ~(somask >> 16); /* also includes B */ 974 avpn = slb_v & ~(somask >> 16); /* also includes B */
975 avpn |= (eaddr & somask) >> 16; 975 avpn |= (eaddr & somask) >> 16;
976 976
@@ -981,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
981 val |= avpn; 981 val |= avpn;
982 982
983 for (;;) { 983 for (;;) {
984 hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); 984 hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7));
985 985
986 for (i = 0; i < 16; i += 2) { 986 for (i = 0; i < 16; i += 2) {
987 /* Read the PTE racily */ 987 /* Read the PTE racily */
@@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
1017 if (val & HPTE_V_SECONDARY) 1017 if (val & HPTE_V_SECONDARY)
1018 break; 1018 break;
1019 val |= HPTE_V_SECONDARY; 1019 val |= HPTE_V_SECONDARY;
1020 hash = hash ^ kvm->arch.hpt_mask; 1020 hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt);
1021 } 1021 }
1022 return -1; 1022 return -1;
1023} 1023}
@@ -1066,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
1066 return status; /* there really was no HPTE */ 1066 return status; /* there really was no HPTE */
1067 return 0; /* for prot fault, HPTE disappeared */ 1067 return 0; /* for prot fault, HPTE disappeared */
1068 } 1068 }
1069 hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); 1069 hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
1070 v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; 1070 v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
1071 r = be64_to_cpu(hpte[1]); 1071 r = be64_to_cpu(hpte[1]);
1072 if (cpu_has_feature(CPU_FTR_ARCH_300)) { 1072 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1073 v = hpte_new_to_old_v(v, r); 1073 v = hpte_new_to_old_v(v, r);
1074 r = hpte_new_to_old_r(r); 1074 r = hpte_new_to_old_r(r);
1075 } 1075 }
1076 rev = real_vmalloc_addr(&kvm->arch.revmap[index]); 1076 rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]);
1077 gr = rev->guest_rpte; 1077 gr = rev->guest_rpte;
1078 1078
1079 unlock_hpte(hpte, orig_v); 1079 unlock_hpte(hpte, orig_v);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 29f43ed6d5eb..e78542d99cd6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -35,7 +35,7 @@ int kvm_irq_bypass = 1;
35EXPORT_SYMBOL(kvm_irq_bypass); 35EXPORT_SYMBOL(kvm_irq_bypass);
36 36
37static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 37static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
38 u32 new_irq); 38 u32 new_irq, bool check_resend);
39static int xics_opal_set_server(unsigned int hw_irq, int server_cpu); 39static int xics_opal_set_server(unsigned int hw_irq, int server_cpu);
40 40
41/* -- ICS routines -- */ 41/* -- ICS routines -- */
@@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
44{ 44{
45 int i; 45 int i;
46 46
47 arch_spin_lock(&ics->lock);
48
49 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 47 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
50 struct ics_irq_state *state = &ics->irq_state[i]; 48 struct ics_irq_state *state = &ics->irq_state[i];
51 49 if (state->resend)
52 if (!state->resend) 50 icp_rm_deliver_irq(xics, icp, state->number, true);
53 continue;
54
55 arch_spin_unlock(&ics->lock);
56 icp_rm_deliver_irq(xics, icp, state->number);
57 arch_spin_lock(&ics->lock);
58 } 51 }
59 52
60 arch_spin_unlock(&ics->lock);
61} 53}
62 54
63/* -- ICP routines -- */ 55/* -- ICP routines -- */
@@ -288,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
288} 280}
289 281
290static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 282static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
291 u32 new_irq) 283 u32 new_irq, bool check_resend)
292{ 284{
293 struct ics_irq_state *state; 285 struct ics_irq_state *state;
294 struct kvmppc_ics *ics; 286 struct kvmppc_ics *ics;
@@ -333,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
333 } 325 }
334 } 326 }
335 327
328 if (check_resend)
329 if (!state->resend)
330 goto out;
331
336 /* Clear the resend bit of that interrupt */ 332 /* Clear the resend bit of that interrupt */
337 state->resend = 0; 333 state->resend = 0;
338 334
@@ -378,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
378 */ 374 */
379 if (reject && reject != XICS_IPI) { 375 if (reject && reject != XICS_IPI) {
380 arch_spin_unlock(&ics->lock); 376 arch_spin_unlock(&ics->lock);
377 icp->n_reject++;
381 new_irq = reject; 378 new_irq = reject;
379 check_resend = 0;
382 goto again; 380 goto again;
383 } 381 }
384 } else { 382 } else {
@@ -386,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
386 * We failed to deliver the interrupt we need to set the 384 * We failed to deliver the interrupt we need to set the
387 * resend map bit and mark the ICS state as needing a resend 385 * resend map bit and mark the ICS state as needing a resend
388 */ 386 */
389 set_bit(ics->icsid, icp->resend_map);
390 state->resend = 1; 387 state->resend = 1;
391 388
392 /* 389 /*
390 * Make sure when checking resend, we don't miss the resend
391 * if resend_map bit is seen and cleared.
392 */
393 smp_wmb();
394 set_bit(ics->icsid, icp->resend_map);
395
396 /*
393 * If the need_resend flag got cleared in the ICP some time 397 * If the need_resend flag got cleared in the ICP some time
394 * between icp_rm_try_to_deliver() atomic update and now, then 398 * between icp_rm_try_to_deliver() atomic update and now, then
395 * we know it might have missed the resend_map bit. So we 399 * we know it might have missed the resend_map bit. So we
@@ -397,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
397 */ 401 */
398 smp_mb(); 402 smp_mb();
399 if (!icp->state.need_resend) { 403 if (!icp->state.need_resend) {
404 state->resend = 0;
400 arch_spin_unlock(&ics->lock); 405 arch_spin_unlock(&ics->lock);
406 check_resend = 0;
401 goto again; 407 goto again;
402 } 408 }
403 } 409 }
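The reordering above closes a lost-resend window: the per-source resend flag must become visible before the summary bit in resend_map, otherwise a scanner that clears the map bit can still read resend == 0 and the interrupt is never redelivered. Below is a userspace analogue of that ordering using C11 atomics (names invented); the kernel itself relies on smp_wmb() on the setter side, paired with the ordering implied by the atomic test-and-clear and the re-check under the ICS lock on the scanner side:

#include <stdatomic.h>
#include <stdbool.h>

/* Illustration only: publish the per-source flag before the summary bit. */
struct fake_irq_state {
	atomic_bool resend;
};

static atomic_ulong resend_map;

static void mark_for_resend(struct fake_irq_state *st, unsigned int icsid)
{
	atomic_store_explicit(&st->resend, true, memory_order_relaxed);
	/* release pairs with the acquire in check_resend() below, so the
	 * flag is visible to whoever observes the summary bit */
	atomic_fetch_or_explicit(&resend_map, 1UL << icsid,
				 memory_order_release);
}

static bool check_resend(struct fake_irq_state *st, unsigned int icsid)
{
	unsigned long old = atomic_fetch_and_explicit(&resend_map,
						      ~(1UL << icsid),
						      memory_order_acquire);
	if (!(old & (1UL << icsid)))
		return false;	/* summary bit not set, nothing to do */

	/* Having observed the bit, this load cannot see a stale 'false'. */
	return atomic_load_explicit(&st->resend, memory_order_relaxed);
}

int main(void)
{
	static struct fake_irq_state st;	/* zero-initialized */

	mark_for_resend(&st, 3);
	return check_resend(&st, 3) ? 0 : 1;
}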
@@ -592,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
592 /* Handle reject in real mode */ 598 /* Handle reject in real mode */
593 if (reject && reject != XICS_IPI) { 599 if (reject && reject != XICS_IPI) {
594 this_icp->n_reject++; 600 this_icp->n_reject++;
595 icp_rm_deliver_irq(xics, icp, reject); 601 icp_rm_deliver_irq(xics, icp, reject, false);
596 } 602 }
597 603
598 /* Handle resends in real mode */ 604 /* Handle resends in real mode */
@@ -660,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
660 */ 666 */
661 if (reject && reject != XICS_IPI) { 667 if (reject && reject != XICS_IPI) {
662 icp->n_reject++; 668 icp->n_reject++;
663 icp_rm_deliver_irq(xics, icp, reject); 669 icp_rm_deliver_irq(xics, icp, reject, false);
664 } 670 }
665 bail: 671 bail:
666 return check_too_hard(xics, icp); 672 return check_too_hard(xics, icp);
667} 673}
668 674
669int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) 675static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
670{ 676{
671 struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 677 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
672 struct kvmppc_icp *icp = vcpu->arch.icp; 678 struct kvmppc_icp *icp = vcpu->arch.icp;
673 struct kvmppc_ics *ics; 679 struct kvmppc_ics *ics;
674 struct ics_irq_state *state; 680 struct ics_irq_state *state;
675 u32 irq = xirr & 0x00ffffff;
676 u16 src; 681 u16 src;
677 682 u32 pq_old, pq_new;
678 if (!xics || !xics->real_mode)
679 return H_TOO_HARD;
680 683
681 /* 684 /*
682 * ICP State: EOI 685 * ICS EOI handling: For LSI, if P bit is still set, we need to
686 * resend it.
683 * 687 *
684 * Note: If EOI is incorrectly used by SW to lower the CPPR 688 * For MSI, we move Q bit into P (and clear Q). If it is set,
685 * value (ie more favored), we do not check for rejection of 689 * resend it.
686 * a pending interrupt, this is a SW error and PAPR sepcifies
687 * that we don't have to deal with it.
688 *
689 * The sending of an EOI to the ICS is handled after the
690 * CPPR update
691 *
692 * ICP State: Down_CPPR which we handle
693 * in a separate function as it's shared with H_CPPR.
694 */ 690 */
695 icp_rm_down_cppr(xics, icp, xirr >> 24);
696 691
697 /* IPIs have no EOI */
698 if (irq == XICS_IPI)
699 goto bail;
700 /*
701 * EOI handling: If the interrupt is still asserted, we need to
702 * resend it. We can take a lockless "peek" at the ICS state here.
703 *
704 * "Message" interrupts will never have "asserted" set
705 */
706 ics = kvmppc_xics_find_ics(xics, irq, &src); 692 ics = kvmppc_xics_find_ics(xics, irq, &src);
707 if (!ics) 693 if (!ics)
708 goto bail; 694 goto bail;
695
709 state = &ics->irq_state[src]; 696 state = &ics->irq_state[src];
710 697
711 /* Still asserted, resend it */ 698 if (state->lsi)
712 if (state->asserted) { 699 pq_new = state->pq_state;
713 icp->n_reject++; 700 else
714 icp_rm_deliver_irq(xics, icp, irq); 701 do {
715 } 702 pq_old = state->pq_state;
703 pq_new = pq_old >> 1;
704 } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
705
706 if (pq_new & PQ_PRESENTED)
707 icp_rm_deliver_irq(xics, NULL, irq, false);
716 708
717 if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { 709 if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
718 icp->rm_action |= XICS_RM_NOTIFY_EOI; 710 icp->rm_action |= XICS_RM_NOTIFY_EOI;
@@ -733,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
733 state->intr_cpu = -1; 725 state->intr_cpu = -1;
734 } 726 }
735 } 727 }
728
736 bail: 729 bail:
737 return check_too_hard(xics, icp); 730 return check_too_hard(xics, icp);
738} 731}
739 732
733int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
734{
735 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
736 struct kvmppc_icp *icp = vcpu->arch.icp;
737 u32 irq = xirr & 0x00ffffff;
738
739 if (!xics || !xics->real_mode)
740 return H_TOO_HARD;
741
742 /*
743 * ICP State: EOI
744 *
745 * Note: If EOI is incorrectly used by SW to lower the CPPR
746 * value (ie more favored), we do not check for rejection of
747 * a pending interrupt, this is a SW error and PAPR specifies
748 * that we don't have to deal with it.
749 *
750 * The sending of an EOI to the ICS is handled after the
751 * CPPR update
752 *
753 * ICP State: Down_CPPR which we handle
754 * in a separate function as it's shared with H_CPPR.
755 */
756 icp_rm_down_cppr(xics, icp, xirr >> 24);
757
758 /* IPIs have no EOI */
759 if (irq == XICS_IPI)
760 return check_too_hard(xics, icp);
761
762 return ics_rm_eoi(vcpu, irq);
763}
764
740unsigned long eoi_rc; 765unsigned long eoi_rc;
741 766
742static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) 767static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
@@ -823,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
823{ 848{
824 struct kvmppc_xics *xics; 849 struct kvmppc_xics *xics;
825 struct kvmppc_icp *icp; 850 struct kvmppc_icp *icp;
851 struct kvmppc_ics *ics;
852 struct ics_irq_state *state;
826 u32 irq; 853 u32 irq;
854 u16 src;
855 u32 pq_old, pq_new;
827 856
828 irq = irq_map->v_hwirq; 857 irq = irq_map->v_hwirq;
829 xics = vcpu->kvm->arch.xics; 858 xics = vcpu->kvm->arch.xics;
830 icp = vcpu->arch.icp; 859 icp = vcpu->arch.icp;
831 860
832 kvmppc_rm_handle_irq_desc(irq_map->desc); 861 kvmppc_rm_handle_irq_desc(irq_map->desc);
833 icp_rm_deliver_irq(xics, icp, irq); 862
863 ics = kvmppc_xics_find_ics(xics, irq, &src);
864 if (!ics)
865 return 2;
866
867 state = &ics->irq_state[src];
868
869 /* only MSIs register bypass producers, so it must be MSI here */
870 do {
871 pq_old = state->pq_state;
872 pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
873 } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
874
875 /* Test P=1, Q=0, this is the only case where we present */
876 if (pq_new == PQ_PRESENTED)
877 icp_rm_deliver_irq(xics, icp, irq, false);
834 878
835 /* EOI the interrupt */ 879 /* EOI the interrupt */
836 icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, 880 icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
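The pq_state field introduced above replaces the old asserted flag with a two-bit P (presented) / Q (queued) state per interrupt, with P in bit 0 and Q in bit 1 as the shift directions imply: delivering an MSI moves P into Q and sets P, EOI moves Q back into P, and the interrupt is (re)presented to the ICP only when the result is exactly P=1,Q=0. LSIs only ever use the P bit. A tiny standalone model of the MSI transitions, with the locking and cmpxchg loops stripped out (illustration only):

#include <stdio.h>

#define PQ_PRESENTED	0x1	/* P: presented to the ICP */
#define PQ_QUEUED	0x2	/* Q: another occurrence arrived meanwhile */

/* MSI arrival: shift P into Q, set P; present only if we end at P=1,Q=0. */
static unsigned int msi_deliver(unsigned int pq, int *present)
{
	unsigned int pq_new = ((pq << 1) & 3) | PQ_PRESENTED;

	*present = (pq_new == PQ_PRESENTED);
	return pq_new;
}

/* EOI: shift Q into P (clearing Q); redeliver if P is now set. */
static unsigned int msi_eoi(unsigned int pq, int *redeliver)
{
	unsigned int pq_new = pq >> 1;

	*redeliver = !!(pq_new & PQ_PRESENTED);
	return pq_new;
}

int main(void)
{
	int present, redeliver;
	unsigned int pq = 0;

	pq = msi_deliver(pq, &present);		/* 0 -> P: presented */
	pq = msi_deliver(pq, &present);		/* P -> P,Q: coalesced, not re-presented */
	pq = msi_eoi(pq, &redeliver);		/* Q -> P: the queued one is redelivered */
	pq = msi_eoi(pq, &redeliver);		/* back to idle */
	printf("final pq_state: %u\n", pq);
	return 0;
}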
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 1482961ceb4d..d4dfc0ca2a44 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
902 } 902 }
903} 903}
904 904
905static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu,
906 unsigned int exit_nr)
907{
908 enum emulation_result er;
909 ulong flags;
910 u32 last_inst;
911 int emul, r;
912
913 /*
914 * shadow_srr1 only contains valid flags if we came here via a program
915 * exception. The other exceptions (emulation assist, FP unavailable,
916 * etc.) do not provide flags in SRR1, so use an illegal-instruction
917 * exception when injecting a program interrupt into the guest.
918 */
919 if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
920 flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
921 else
922 flags = SRR1_PROGILL;
923
924 emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
925 if (emul != EMULATE_DONE)
926 return RESUME_GUEST;
927
928 if (kvmppc_get_msr(vcpu) & MSR_PR) {
929#ifdef EXIT_DEBUG
930 pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
931 kvmppc_get_pc(vcpu), last_inst);
932#endif
933 if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
934 kvmppc_core_queue_program(vcpu, flags);
935 return RESUME_GUEST;
936 }
937 }
938
939 vcpu->stat.emulated_inst_exits++;
940 er = kvmppc_emulate_instruction(run, vcpu);
941 switch (er) {
942 case EMULATE_DONE:
943 r = RESUME_GUEST_NV;
944 break;
945 case EMULATE_AGAIN:
946 r = RESUME_GUEST;
947 break;
948 case EMULATE_FAIL:
949 pr_crit("%s: emulation at %lx failed (%08x)\n",
950 __func__, kvmppc_get_pc(vcpu), last_inst);
951 kvmppc_core_queue_program(vcpu, flags);
952 r = RESUME_GUEST;
953 break;
954 case EMULATE_DO_MMIO:
955 run->exit_reason = KVM_EXIT_MMIO;
956 r = RESUME_HOST_NV;
957 break;
958 case EMULATE_EXIT_USER:
959 r = RESUME_HOST_NV;
960 break;
961 default:
962 BUG();
963 }
964
965 return r;
966}
967
905int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, 968int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
906 unsigned int exit_nr) 969 unsigned int exit_nr)
907{ 970{
@@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
1044 break; 1107 break;
1045 case BOOK3S_INTERRUPT_PROGRAM: 1108 case BOOK3S_INTERRUPT_PROGRAM:
1046 case BOOK3S_INTERRUPT_H_EMUL_ASSIST: 1109 case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
1047 { 1110 r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
1048 enum emulation_result er;
1049 ulong flags;
1050 u32 last_inst;
1051 int emul;
1052
1053program_interrupt:
1054 /*
1055 * shadow_srr1 only contains valid flags if we came here via
1056 * a program exception. The other exceptions (emulation assist,
1057 * FP unavailable, etc.) do not provide flags in SRR1, so use
1058 * an illegal-instruction exception when injecting a program
1059 * interrupt into the guest.
1060 */
1061 if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
1062 flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
1063 else
1064 flags = SRR1_PROGILL;
1065
1066 emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
1067 if (emul != EMULATE_DONE) {
1068 r = RESUME_GUEST;
1069 break;
1070 }
1071
1072 if (kvmppc_get_msr(vcpu) & MSR_PR) {
1073#ifdef EXIT_DEBUG
1074 pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
1075 kvmppc_get_pc(vcpu), last_inst);
1076#endif
1077 if ((last_inst & 0xff0007ff) !=
1078 (INS_DCBZ & 0xfffffff7)) {
1079 kvmppc_core_queue_program(vcpu, flags);
1080 r = RESUME_GUEST;
1081 break;
1082 }
1083 }
1084
1085 vcpu->stat.emulated_inst_exits++;
1086 er = kvmppc_emulate_instruction(run, vcpu);
1087 switch (er) {
1088 case EMULATE_DONE:
1089 r = RESUME_GUEST_NV;
1090 break;
1091 case EMULATE_AGAIN:
1092 r = RESUME_GUEST;
1093 break;
1094 case EMULATE_FAIL:
1095 printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
1096 __func__, kvmppc_get_pc(vcpu), last_inst);
1097 kvmppc_core_queue_program(vcpu, flags);
1098 r = RESUME_GUEST;
1099 break;
1100 case EMULATE_DO_MMIO:
1101 run->exit_reason = KVM_EXIT_MMIO;
1102 r = RESUME_HOST_NV;
1103 break;
1104 case EMULATE_EXIT_USER:
1105 r = RESUME_HOST_NV;
1106 break;
1107 default:
1108 BUG();
1109 }
1110 break; 1111 break;
1111 }
1112 case BOOK3S_INTERRUPT_SYSCALL: 1112 case BOOK3S_INTERRUPT_SYSCALL:
1113 { 1113 {
1114 u32 last_sc; 1114 u32 last_sc;
@@ -1185,7 +1185,7 @@ program_interrupt:
1185 emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, 1185 emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
1186 &last_inst); 1186 &last_inst);
1187 if (emul == EMULATE_DONE) 1187 if (emul == EMULATE_DONE)
1188 goto program_interrupt; 1188 r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
1189 else 1189 else
1190 r = RESUME_GUEST; 1190 r = RESUME_GUEST;
1191 1191
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 20dff102a06f..e48803e2918d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -63,7 +63,7 @@
63/* -- ICS routines -- */ 63/* -- ICS routines -- */
64 64
65static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 65static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
66 u32 new_irq); 66 u32 new_irq, bool check_resend);
67 67
68/* 68/*
69 * Return value ideally indicates how the interrupt was handled, but no 69 * Return value ideally indicates how the interrupt was handled, but no
@@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
75 struct ics_irq_state *state; 75 struct ics_irq_state *state;
76 struct kvmppc_ics *ics; 76 struct kvmppc_ics *ics;
77 u16 src; 77 u16 src;
78 u32 pq_old, pq_new;
78 79
79 XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); 80 XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
80 81
@@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
87 if (!state->exists) 88 if (!state->exists)
88 return -EINVAL; 89 return -EINVAL;
89 90
91 if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
92 level = 1;
93 else if (level == KVM_INTERRUPT_UNSET)
94 level = 0;
90 /* 95 /*
91 * We set state->asserted locklessly. This should be fine as 96 * Take other values the same as 1, consistent with original code.
92 * we are the only setter, thus concurrent access is undefined 97 * maybe WARN here?
93 * to begin with.
94 */ 98 */
95 if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) 99
96 state->asserted = 1; 100 if (!state->lsi && level == 0) /* noop for MSI */
97 else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
98 state->asserted = 0;
99 return 0; 101 return 0;
100 } 102
103 do {
104 pq_old = state->pq_state;
105 if (state->lsi) {
106 if (level) {
107 if (pq_old & PQ_PRESENTED)
108 /* Setting already set LSI ... */
109 return 0;
110
111 pq_new = PQ_PRESENTED;
112 } else
113 pq_new = 0;
114 } else
115 pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
116 } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
117
118 /* Test P=1, Q=0, this is the only case where we present */
119 if (pq_new == PQ_PRESENTED)
120 icp_deliver_irq(xics, NULL, irq, false);
101 121
102 /* Record which CPU this arrived on for passed-through interrupts */ 122 /* Record which CPU this arrived on for passed-through interrupts */
103 if (state->host_irq) 123 if (state->host_irq)
104 state->intr_cpu = raw_smp_processor_id(); 124 state->intr_cpu = raw_smp_processor_id();
105 125
106 /* Attempt delivery */
107 icp_deliver_irq(xics, NULL, irq);
108
109 return 0; 126 return 0;
110} 127}
111 128
@@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
114{ 131{
115 int i; 132 int i;
116 133
117 unsigned long flags;
118
119 local_irq_save(flags);
120 arch_spin_lock(&ics->lock);
121
122 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 134 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
123 struct ics_irq_state *state = &ics->irq_state[i]; 135 struct ics_irq_state *state = &ics->irq_state[i];
124 136 if (state->resend) {
125 if (!state->resend) 137 XICS_DBG("resend %#x prio %#x\n", state->number,
126 continue; 138 state->priority);
127 139 icp_deliver_irq(xics, icp, state->number, true);
128 XICS_DBG("resend %#x prio %#x\n", state->number, 140 }
129 state->priority);
130
131 arch_spin_unlock(&ics->lock);
132 local_irq_restore(flags);
133 icp_deliver_irq(xics, icp, state->number);
134 local_irq_save(flags);
135 arch_spin_lock(&ics->lock);
136 } 141 }
137
138 arch_spin_unlock(&ics->lock);
139 local_irq_restore(flags);
140} 142}
141 143
142static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, 144static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
155 deliver = false; 157 deliver = false;
156 if ((state->masked_pending || state->resend) && priority != MASKED) { 158 if ((state->masked_pending || state->resend) && priority != MASKED) {
157 state->masked_pending = 0; 159 state->masked_pending = 0;
160 state->resend = 0;
158 deliver = true; 161 deliver = true;
159 } 162 }
160 163
@@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
189 state->masked_pending, state->resend); 192 state->masked_pending, state->resend);
190 193
191 if (write_xive(xics, ics, state, server, priority, priority)) 194 if (write_xive(xics, ics, state, server, priority, priority))
192 icp_deliver_irq(xics, icp, irq); 195 icp_deliver_irq(xics, icp, irq, false);
193 196
194 return 0; 197 return 0;
195} 198}
@@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
242 245
243 if (write_xive(xics, ics, state, state->server, state->saved_priority, 246 if (write_xive(xics, ics, state, state->server, state->saved_priority,
244 state->saved_priority)) 247 state->saved_priority))
245 icp_deliver_irq(xics, icp, irq); 248 icp_deliver_irq(xics, icp, irq, false);
246 249
247 return 0; 250 return 0;
248} 251}
@@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
376} 379}
377 380
378static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, 381static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
379 u32 new_irq) 382 u32 new_irq, bool check_resend)
380{ 383{
381 struct ics_irq_state *state; 384 struct ics_irq_state *state;
382 struct kvmppc_ics *ics; 385 struct kvmppc_ics *ics;
@@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
422 } 425 }
423 } 426 }
424 427
428 if (check_resend)
429 if (!state->resend)
430 goto out;
431
425 /* Clear the resend bit of that interrupt */ 432 /* Clear the resend bit of that interrupt */
426 state->resend = 0; 433 state->resend = 0;
427 434
@@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
470 arch_spin_unlock(&ics->lock); 477 arch_spin_unlock(&ics->lock);
471 local_irq_restore(flags); 478 local_irq_restore(flags);
472 new_irq = reject; 479 new_irq = reject;
480 check_resend = 0;
473 goto again; 481 goto again;
474 } 482 }
475 } else { 483 } else {
@@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
477 * We failed to deliver the interrupt we need to set the 485 * We failed to deliver the interrupt we need to set the
478 * resend map bit and mark the ICS state as needing a resend 486 * resend map bit and mark the ICS state as needing a resend
479 */ 487 */
480 set_bit(ics->icsid, icp->resend_map);
481 state->resend = 1; 488 state->resend = 1;
482 489
483 /* 490 /*
491 * Make sure when checking resend, we don't miss the resend
492 * if resend_map bit is seen and cleared.
493 */
494 smp_wmb();
495 set_bit(ics->icsid, icp->resend_map);
496
497 /*
484 * If the need_resend flag got cleared in the ICP some time 498 * If the need_resend flag got cleared in the ICP some time
485 * between icp_try_to_deliver() atomic update and now, then 499 * between icp_try_to_deliver() atomic update and now, then
486 * we know it might have missed the resend_map bit. So we 500 * we know it might have missed the resend_map bit. So we
@@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
488 */ 502 */
489 smp_mb(); 503 smp_mb();
490 if (!icp->state.need_resend) { 504 if (!icp->state.need_resend) {
505 state->resend = 0;
491 arch_spin_unlock(&ics->lock); 506 arch_spin_unlock(&ics->lock);
492 local_irq_restore(flags); 507 local_irq_restore(flags);
508 check_resend = 0;
493 goto again; 509 goto again;
494 } 510 }
495 } 511 }
@@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
681 697
682 /* Handle reject */ 698 /* Handle reject */
683 if (reject && reject != XICS_IPI) 699 if (reject && reject != XICS_IPI)
684 icp_deliver_irq(xics, icp, reject); 700 icp_deliver_irq(xics, icp, reject, false);
685 701
686 /* Handle resend */ 702 /* Handle resend */
687 if (resend) 703 if (resend)
@@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
761 * attempt (see comments in icp_deliver_irq). 777 * attempt (see comments in icp_deliver_irq).
762 */ 778 */
763 if (reject && reject != XICS_IPI) 779 if (reject && reject != XICS_IPI)
764 icp_deliver_irq(xics, icp, reject); 780 icp_deliver_irq(xics, icp, reject, false);
765} 781}
766 782
767static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) 783static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
768{ 784{
769 struct kvmppc_xics *xics = vcpu->kvm->arch.xics; 785 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
770 struct kvmppc_icp *icp = vcpu->arch.icp; 786 struct kvmppc_icp *icp = vcpu->arch.icp;
771 struct kvmppc_ics *ics; 787 struct kvmppc_ics *ics;
772 struct ics_irq_state *state; 788 struct ics_irq_state *state;
773 u32 irq = xirr & 0x00ffffff;
774 u16 src; 789 u16 src;
790 u32 pq_old, pq_new;
791
792 /*
793 * ICS EOI handling: For LSI, if P bit is still set, we need to
794 * resend it.
795 *
796 * For MSI, we move Q bit into P (and clear Q). If it is set,
797 * resend it.
798 */
799
800 ics = kvmppc_xics_find_ics(xics, irq, &src);
801 if (!ics) {
	802		XICS_DBG("ics_eoi: IRQ 0x%06x not found !\n", irq);
803 return H_PARAMETER;
804 }
805 state = &ics->irq_state[src];
806
807 if (state->lsi)
808 pq_new = state->pq_state;
809 else
810 do {
811 pq_old = state->pq_state;
812 pq_new = pq_old >> 1;
813 } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
814
815 if (pq_new & PQ_PRESENTED)
816 icp_deliver_irq(xics, icp, irq, false);
817
818 kvm_notify_acked_irq(vcpu->kvm, 0, irq);
819
820 return H_SUCCESS;
821}
822
823static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
824{
825 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
826 struct kvmppc_icp *icp = vcpu->arch.icp;
827 u32 irq = xirr & 0x00ffffff;
775 828
776 XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); 829 XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
777 830
@@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
794 /* IPIs have no EOI */ 847 /* IPIs have no EOI */
795 if (irq == XICS_IPI) 848 if (irq == XICS_IPI)
796 return H_SUCCESS; 849 return H_SUCCESS;
797 /*
798 * EOI handling: If the interrupt is still asserted, we need to
799 * resend it. We can take a lockless "peek" at the ICS state here.
800 *
801 * "Message" interrupts will never have "asserted" set
802 */
803 ics = kvmppc_xics_find_ics(xics, irq, &src);
804 if (!ics) {
805 XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
806 return H_PARAMETER;
807 }
808 state = &ics->irq_state[src];
809 850
810 /* Still asserted, resend it */ 851 return ics_eoi(vcpu, irq);
811 if (state->asserted)
812 icp_deliver_irq(xics, icp, irq);
813
814 kvm_notify_acked_irq(vcpu->kvm, 0, irq);
815
816 return H_SUCCESS;
817} 852}
818 853
819int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) 854int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
@@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
832 icp->n_rm_check_resend++; 867 icp->n_rm_check_resend++;
833 icp_check_resend(xics, icp->rm_resend_icp); 868 icp_check_resend(xics, icp->rm_resend_icp);
834 } 869 }
835 if (icp->rm_action & XICS_RM_REJECT) {
836 icp->n_rm_reject++;
837 icp_deliver_irq(xics, icp, icp->rm_reject);
838 }
839 if (icp->rm_action & XICS_RM_NOTIFY_EOI) { 870 if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
840 icp->n_rm_notify_eoi++; 871 icp->n_rm_notify_eoi++;
841 kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); 872 kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
@@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
920 int icsid, i; 951 int icsid, i;
921 unsigned long flags; 952 unsigned long flags;
922 unsigned long t_rm_kick_vcpu, t_rm_check_resend; 953 unsigned long t_rm_kick_vcpu, t_rm_check_resend;
923 unsigned long t_rm_reject, t_rm_notify_eoi; 954 unsigned long t_rm_notify_eoi;
924 unsigned long t_reject, t_check_resend; 955 unsigned long t_reject, t_check_resend;
925 956
926 if (!kvm) 957 if (!kvm)
@@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private)
929 t_rm_kick_vcpu = 0; 960 t_rm_kick_vcpu = 0;
930 t_rm_notify_eoi = 0; 961 t_rm_notify_eoi = 0;
931 t_rm_check_resend = 0; 962 t_rm_check_resend = 0;
932 t_rm_reject = 0;
933 t_check_resend = 0; 963 t_check_resend = 0;
934 t_reject = 0; 964 t_reject = 0;
935 965
@@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private)
952 t_rm_kick_vcpu += icp->n_rm_kick_vcpu; 982 t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
953 t_rm_notify_eoi += icp->n_rm_notify_eoi; 983 t_rm_notify_eoi += icp->n_rm_notify_eoi;
954 t_rm_check_resend += icp->n_rm_check_resend; 984 t_rm_check_resend += icp->n_rm_check_resend;
955 t_rm_reject += icp->n_rm_reject;
956 t_check_resend += icp->n_check_resend; 985 t_check_resend += icp->n_check_resend;
957 t_reject += icp->n_reject; 986 t_reject += icp->n_reject;
958 } 987 }
959 988
960 seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", 989 seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
961 t_rm_kick_vcpu, t_rm_check_resend, 990 t_rm_kick_vcpu, t_rm_check_resend,
962 t_rm_reject, t_rm_notify_eoi); 991 t_rm_notify_eoi);
963 seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", 992 seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
964 t_check_resend, t_reject); 993 t_check_resend, t_reject);
965 for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { 994 for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
@@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private)
977 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 1006 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
978 struct ics_irq_state *irq = &ics->irq_state[i]; 1007 struct ics_irq_state *irq = &ics->irq_state[i];
979 1008
980 seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", 1009 seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
981 irq->number, irq->server, irq->priority, 1010 irq->number, irq->server, irq->priority,
982 irq->saved_priority, irq->asserted, 1011 irq->saved_priority, irq->pq_state,
983 irq->resend, irq->masked_pending); 1012 irq->resend, irq->masked_pending);
984 1013
985 } 1014 }
@@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
1198 val |= prio << KVM_XICS_PRIORITY_SHIFT; 1227 val |= prio << KVM_XICS_PRIORITY_SHIFT;
1199 if (irqp->lsi) { 1228 if (irqp->lsi) {
1200 val |= KVM_XICS_LEVEL_SENSITIVE; 1229 val |= KVM_XICS_LEVEL_SENSITIVE;
1201 if (irqp->asserted) 1230 if (irqp->pq_state & PQ_PRESENTED)
1202 val |= KVM_XICS_PENDING; 1231 val |= KVM_XICS_PENDING;
1203 } else if (irqp->masked_pending || irqp->resend) 1232 } else if (irqp->masked_pending || irqp->resend)
1204 val |= KVM_XICS_PENDING; 1233 val |= KVM_XICS_PENDING;
1234
1235 if (irqp->pq_state & PQ_PRESENTED)
1236 val |= KVM_XICS_PRESENTED;
1237
1238 if (irqp->pq_state & PQ_QUEUED)
1239 val |= KVM_XICS_QUEUED;
1240
1205 ret = 0; 1241 ret = 0;
1206 } 1242 }
1207 arch_spin_unlock(&ics->lock); 1243 arch_spin_unlock(&ics->lock);
@@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
1253 irqp->resend = 0; 1289 irqp->resend = 0;
1254 irqp->masked_pending = 0; 1290 irqp->masked_pending = 0;
1255 irqp->lsi = 0; 1291 irqp->lsi = 0;
1256 irqp->asserted = 0; 1292 irqp->pq_state = 0;
1257 if (val & KVM_XICS_LEVEL_SENSITIVE) { 1293 if (val & KVM_XICS_LEVEL_SENSITIVE)
1258 irqp->lsi = 1; 1294 irqp->lsi = 1;
1259 		if (val & KVM_XICS_PENDING) 1295 	/* If PENDING is set, also set P, in case an older kernel did not save P */
1260 irqp->asserted = 1; 1296 if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
1261 } 1297 irqp->pq_state |= PQ_PRESENTED;
1298 if (val & KVM_XICS_QUEUED)
1299 irqp->pq_state |= PQ_QUEUED;
1262 irqp->exists = 1; 1300 irqp->exists = 1;
1263 arch_spin_unlock(&ics->lock); 1301 arch_spin_unlock(&ics->lock);
1264 local_irq_restore(flags); 1302 local_irq_restore(flags);
1265 1303
1266 if (val & KVM_XICS_PENDING) 1304 if (val & KVM_XICS_PENDING)
1267 icp_deliver_irq(xics, NULL, irqp->number); 1305 icp_deliver_irq(xics, NULL, irqp->number, false);
1268 1306
1269 return 0; 1307 return 0;
1270} 1308}
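
xics_get_source() and xics_set_source() now round-trip the raw P/Q bits through the KVM_XICS_PRESENTED and KVM_XICS_QUEUED flags, while KVM_XICS_PENDING keeps its old meaning so a state image saved by an older kernel (which only knew PENDING) still restores with P set. A rough sketch of the encode/decode pair; the XICS_* bit positions below are placeholders, the real KVM_XICS_* values live in the powerpc uapi header and are not part of this hunk:

#include <stdint.h>

#define PQ_PRESENTED	1u
#define PQ_QUEUED	2u

/* Placeholder bit positions; the real flags come from asm/kvm.h. */
#define XICS_PENDING	(1ull << 42)
#define XICS_PRESENTED	(1ull << 43)
#define XICS_QUEUED	(1ull << 44)

static uint64_t encode_pq(uint32_t pq_state)
{
	uint64_t val = 0;

	if (pq_state & PQ_PRESENTED)
		val |= XICS_PRESENTED;
	if (pq_state & PQ_QUEUED)
		val |= XICS_QUEUED;
	return val;
}

static uint32_t decode_pq(uint64_t val)
{
	uint32_t pq_state = 0;

	/* PENDING alone (old kernel) is treated as "P set". */
	if (val & (XICS_PRESENTED | XICS_PENDING))
		pq_state |= PQ_PRESENTED;
	if (val & XICS_QUEUED)
		pq_state |= PQ_QUEUED;
	return pq_state;
}

int main(void)
{
	return decode_pq(encode_pq(PQ_PRESENTED | PQ_QUEUED)) ==
	       (PQ_PRESENTED | PQ_QUEUED) ? 0 : 1;
}
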
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 2a50320b55ca..ec5474cf70c6 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -31,16 +31,19 @@
31/* Priority value to use for disabling an interrupt */ 31/* Priority value to use for disabling an interrupt */
32#define MASKED 0xff 32#define MASKED 0xff
33 33
34#define PQ_PRESENTED 1
35#define PQ_QUEUED 2
36
34/* State for one irq source */ 37/* State for one irq source */
35struct ics_irq_state { 38struct ics_irq_state {
36 u32 number; 39 u32 number;
37 u32 server; 40 u32 server;
41 u32 pq_state;
38 u8 priority; 42 u8 priority;
39 u8 saved_priority; 43 u8 saved_priority;
40 u8 resend; 44 u8 resend;
41 u8 masked_pending; 45 u8 masked_pending;
42 u8 lsi; /* level-sensitive interrupt */ 46 u8 lsi; /* level-sensitive interrupt */
43 u8 asserted; /* Only for LSI */
44 u8 exists; 47 u8 exists;
45 int intr_cpu; 48 int intr_cpu;
46 u32 host_irq; 49 u32 host_irq;
@@ -73,7 +76,6 @@ struct kvmppc_icp {
73 */ 76 */
74#define XICS_RM_KICK_VCPU 0x1 77#define XICS_RM_KICK_VCPU 0x1
75#define XICS_RM_CHECK_RESEND 0x2 78#define XICS_RM_CHECK_RESEND 0x2
76#define XICS_RM_REJECT 0x4
77#define XICS_RM_NOTIFY_EOI 0x8 79#define XICS_RM_NOTIFY_EOI 0x8
78 u32 rm_action; 80 u32 rm_action;
79 struct kvm_vcpu *rm_kick_target; 81 struct kvm_vcpu *rm_kick_target;
@@ -84,7 +86,6 @@ struct kvmppc_icp {
84 /* Counters for each reason we exited real mode */ 86 /* Counters for each reason we exited real mode */
85 unsigned long n_rm_kick_vcpu; 87 unsigned long n_rm_kick_vcpu;
86 unsigned long n_rm_check_resend; 88 unsigned long n_rm_check_resend;
87 unsigned long n_rm_reject;
88 unsigned long n_rm_notify_eoi; 89 unsigned long n_rm_notify_eoi;
89 /* Counters for handling ICP processing in real mode */ 90 /* Counters for handling ICP processing in real mode */
90 unsigned long n_check_resend; 91 unsigned long n_check_resend;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 40a5b2d75ed1..2b38d824e9e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
511 case KVM_CAP_ONE_REG: 511 case KVM_CAP_ONE_REG:
512 case KVM_CAP_IOEVENTFD: 512 case KVM_CAP_IOEVENTFD:
513 case KVM_CAP_DEVICE_CTRL: 513 case KVM_CAP_DEVICE_CTRL:
514 case KVM_CAP_IMMEDIATE_EXIT:
514 r = 1; 515 r = 1;
515 break; 516 break;
516 case KVM_CAP_PPC_PAIRED_SINGLES: 517 case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -612,6 +613,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
612 case KVM_CAP_SPAPR_MULTITCE: 613 case KVM_CAP_SPAPR_MULTITCE:
613 r = 1; 614 r = 1;
614 break; 615 break;
616 case KVM_CAP_SPAPR_RESIZE_HPT:
617 /* Disable this on POWER9 until code handles new HPTE format */
618 r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
619 break;
615#endif 620#endif
616 case KVM_CAP_PPC_HTM: 621 case KVM_CAP_PPC_HTM:
617 r = cpu_has_feature(CPU_FTR_TM_COMP) && 622 r = cpu_has_feature(CPU_FTR_TM_COMP) &&
@@ -1114,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
1114#endif 1119#endif
1115 } 1120 }
1116 1121
1117 r = kvmppc_vcpu_run(run, vcpu); 1122 if (run->immediate_exit)
1123 r = -EINTR;
1124 else
1125 r = kvmppc_vcpu_run(run, vcpu);
1118 1126
1119 if (vcpu->sigset_active) 1127 if (vcpu->sigset_active)
1120 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1128 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
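
KVM_CAP_IMMEDIATE_EXIT (advertised here for PPC, and for s390 and the generic code elsewhere in this pull) gives a VMM a race-free way to stop a vCPU without POSIX signal gymnastics: if kvm_run->immediate_exit is non-zero, KVM_RUN returns -EINTR before entering the guest. A hedged user-space sketch of the pattern; the fd/mmap setup of the vCPU is assumed to follow the usual KVM API flow and is not shown:

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * 'run' is the mmap()ed struct kvm_run of the vCPU behind 'vcpu_fd'.
 * Setting immediate_exit before KVM_RUN guarantees the ioctl bails out
 * with -EINTR instead of entering the guest, which closes the classic
 * "signal arrived just before KVM_RUN" race.
 */
int kvm_run_interruptible(int vcpu_fd, struct kvm_run *run, int stop_requested)
{
	run->immediate_exit = stop_requested ? 1 : 0;

	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
		if (errno == EINTR)
			return 0;	/* stopped before (re)entering the guest */
		return -errno;
	}
	/* run->exit_reason now says why the guest exited. */
	return 1;
}
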
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4aa8a7e2a1da..4492c9363178 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
373 ipte_unlock_simple(vcpu); 373 ipte_unlock_simple(vcpu);
374} 374}
375 375
376static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, 376static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
377 enum gacc_mode mode) 377 enum gacc_mode mode)
378{ 378{
379 union alet alet; 379 union alet alet;
@@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
465struct trans_exc_code_bits { 465struct trans_exc_code_bits {
466 unsigned long addr : 52; /* Translation-exception Address */ 466 unsigned long addr : 52; /* Translation-exception Address */
467 unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ 467 unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */
468 unsigned long : 6; 468 unsigned long : 2;
469 unsigned long b56 : 1;
470 unsigned long : 3;
469 unsigned long b60 : 1; 471 unsigned long b60 : 1;
470 unsigned long b61 : 1; 472 unsigned long b61 : 1;
471 unsigned long as : 2; /* ASCE Identifier */ 473 unsigned long as : 2; /* ASCE Identifier */
@@ -485,7 +487,7 @@ enum prot_type {
485}; 487};
486 488
487static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, 489static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
488 ar_t ar, enum gacc_mode mode, enum prot_type prot) 490 u8 ar, enum gacc_mode mode, enum prot_type prot)
489{ 491{
490 struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; 492 struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
491 struct trans_exc_code_bits *tec; 493 struct trans_exc_code_bits *tec;
@@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
497 switch (code) { 499 switch (code) {
498 case PGM_PROTECTION: 500 case PGM_PROTECTION:
499 switch (prot) { 501 switch (prot) {
502 case PROT_TYPE_LA:
503 tec->b56 = 1;
504 break;
505 case PROT_TYPE_KEYC:
506 tec->b60 = 1;
507 break;
500 case PROT_TYPE_ALC: 508 case PROT_TYPE_ALC:
501 tec->b60 = 1; 509 tec->b60 = 1;
502 /* FALL THROUGH */ 510 /* FALL THROUGH */
503 case PROT_TYPE_DAT: 511 case PROT_TYPE_DAT:
504 tec->b61 = 1; 512 tec->b61 = 1;
505 break; 513 break;
506 default: /* LA and KEYC set b61 to 0, other params undefined */
507 return code;
508 } 514 }
509 /* FALL THROUGH */ 515 /* FALL THROUGH */
510 case PGM_ASCE_TYPE: 516 case PGM_ASCE_TYPE:
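
For protection exceptions the translation-exception code now distinguishes low-address protection (bit 56), key-controlled protection (bit 60), access-list-controlled protection (bits 60 and 61) and DAT protection (bit 61) instead of giving up on the LA/KEYC cases. The mapping from the switch above, restated as a small helper; the enum names are shortened stand-ins for the real prot_type constants, and the bit layout mirrors the trans_exc_code_bits change earlier in this file:

/* Which TEC bits get set for each protection type (illustrative only). */
struct tec_bits { unsigned b56:1, b60:1, b61:1; };

enum prot_sketch { PROT_LA, PROT_KEYC, PROT_ALC, PROT_DAT };

struct tec_bits prot_to_tec(enum prot_sketch prot)
{
	struct tec_bits tec = { 0, 0, 0 };

	switch (prot) {
	case PROT_LA:		/* low-address protection */
		tec.b56 = 1;
		break;
	case PROT_KEYC:		/* key-controlled protection */
		tec.b60 = 1;
		break;
	case PROT_ALC:		/* access-list controlled: both b60 and b61 */
		tec.b60 = 1;
		/* fall through */
	case PROT_DAT:		/* DAT protection */
		tec.b61 = 1;
		break;
	}
	return tec;
}
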
@@ -539,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
539} 545}
540 546
541static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, 547static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
542 unsigned long ga, ar_t ar, enum gacc_mode mode) 548 unsigned long ga, u8 ar, enum gacc_mode mode)
543{ 549{
544 int rc; 550 int rc;
545 struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); 551 struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
@@ -771,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
771 return 1; 777 return 1;
772} 778}
773 779
774static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, 780static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
775 unsigned long *pages, unsigned long nr_pages, 781 unsigned long *pages, unsigned long nr_pages,
776 const union asce asce, enum gacc_mode mode) 782 const union asce asce, enum gacc_mode mode)
777{ 783{
@@ -803,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
803 return 0; 809 return 0;
804} 810}
805 811
806int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, 812int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
807 unsigned long len, enum gacc_mode mode) 813 unsigned long len, enum gacc_mode mode)
808{ 814{
809 psw_t *psw = &vcpu->arch.sie_block->gpsw; 815 psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -877,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
877 * Note: The IPTE lock is not taken during this function, so the caller 883 * Note: The IPTE lock is not taken during this function, so the caller
878 * has to take care of this. 884 * has to take care of this.
879 */ 885 */
880int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, 886int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
881 unsigned long *gpa, enum gacc_mode mode) 887 unsigned long *gpa, enum gacc_mode mode)
882{ 888{
883 psw_t *psw = &vcpu->arch.sie_block->gpsw; 889 psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -910,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
910/** 916/**
911 * check_gva_range - test a range of guest virtual addresses for accessibility 917 * check_gva_range - test a range of guest virtual addresses for accessibility
912 */ 918 */
913int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, 919int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
914 unsigned long length, enum gacc_mode mode) 920 unsigned long length, enum gacc_mode mode)
915{ 921{
916 unsigned long gpa; 922 unsigned long gpa;
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 8756569ad938..7ce47fd36f28 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -162,11 +162,11 @@ enum gacc_mode {
162}; 162};
163 163
164int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, 164int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
165 ar_t ar, unsigned long *gpa, enum gacc_mode mode); 165 u8 ar, unsigned long *gpa, enum gacc_mode mode);
166int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, 166int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
167 unsigned long length, enum gacc_mode mode); 167 unsigned long length, enum gacc_mode mode);
168 168
169int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, 169int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
170 unsigned long len, enum gacc_mode mode); 170 unsigned long len, enum gacc_mode mode);
171 171
172int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, 172int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
@@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
218 * if data has been changed in guest space in case of an exception. 218 * if data has been changed in guest space in case of an exception.
219 */ 219 */
220static inline __must_check 220static inline __must_check
221int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, 221int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
222 unsigned long len) 222 unsigned long len)
223{ 223{
224 return access_guest(vcpu, ga, ar, data, len, GACC_STORE); 224 return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
@@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
238 * data will be copied from guest space to kernel space. 238 * data will be copied from guest space to kernel space.
239 */ 239 */
240static inline __must_check 240static inline __must_check
241int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, 241int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
242 unsigned long len) 242 unsigned long len)
243{ 243{
244 return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); 244 return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
@@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
247/** 247/**
248 * read_guest_instr - copy instruction data from guest space to kernel space 248 * read_guest_instr - copy instruction data from guest space to kernel space
249 * @vcpu: virtual cpu 249 * @vcpu: virtual cpu
250 * @ga: guest address
250 * @data: destination address in kernel space 251 * @data: destination address in kernel space
251 * @len: number of bytes to copy 252 * @len: number of bytes to copy
252 * 253 *
253 * Copy @len bytes from the current psw address (guest space) to @data (kernel 254 * Copy @len bytes from the given address (guest space) to @data (kernel
254 * space). 255 * space).
255 * 256 *
256 * The behaviour of read_guest_instr is identical to read_guest, except that 257 * The behaviour of read_guest_instr is identical to read_guest, except that
@@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
258 * address-space mode. 259 * address-space mode.
259 */ 260 */
260static inline __must_check 261static inline __must_check
261int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len) 262int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
263 unsigned long len)
262{ 264{
263 return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len, 265 return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH);
264 GACC_IFETCH);
265} 266}
266 267
267/** 268/**
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index d7c6a7f53ced..23d9a4e12da1 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
388#define per_write_wp_event(code) \ 388#define per_write_wp_event(code) \
389 (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) 389 (code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
390 390
391static int debug_exit_required(struct kvm_vcpu *vcpu) 391static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc,
392 unsigned long peraddr)
392{ 393{
393 u8 perc = vcpu->arch.sie_block->perc;
394 struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; 394 struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
395 struct kvm_hw_wp_info_arch *wp_info = NULL; 395 struct kvm_hw_wp_info_arch *wp_info = NULL;
396 struct kvm_hw_bp_info_arch *bp_info = NULL; 396 struct kvm_hw_bp_info_arch *bp_info = NULL;
397 unsigned long addr = vcpu->arch.sie_block->gpsw.addr; 397 unsigned long addr = vcpu->arch.sie_block->gpsw.addr;
398 unsigned long peraddr = vcpu->arch.sie_block->peraddr;
399 398
400 if (guestdbg_hw_bp_enabled(vcpu)) { 399 if (guestdbg_hw_bp_enabled(vcpu)) {
401 if (per_write_wp_event(perc) && 400 if (per_write_wp_event(perc) &&
@@ -437,36 +436,118 @@ exit_required:
437 return 1; 436 return 1;
438} 437}
439 438
439static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr)
440{
441 u8 exec_ilen = 0;
442 u16 opcode[3];
443 int rc;
444
445 if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) {
446 /* PER address references the fetched or the execute instr */
447 *addr = vcpu->arch.sie_block->peraddr;
448 /*
449 * Manually detect if we have an EXECUTE instruction. As
450 * instructions are always 2 byte aligned we can read the
451 * first two bytes unconditionally
452 */
453 rc = read_guest_instr(vcpu, *addr, &opcode, 2);
454 if (rc)
455 return rc;
456 if (opcode[0] >> 8 == 0x44)
457 exec_ilen = 4;
458 if ((opcode[0] & 0xff0f) == 0xc600)
459 exec_ilen = 6;
460 } else {
461 /* instr was suppressed, calculate the responsible instr */
462 *addr = __rewind_psw(vcpu->arch.sie_block->gpsw,
463 kvm_s390_get_ilen(vcpu));
464 if (vcpu->arch.sie_block->icptstatus & 0x01) {
465 exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4;
466 if (!exec_ilen)
467 exec_ilen = 4;
468 }
469 }
470
471 if (exec_ilen) {
472 /* read the complete EXECUTE instr to detect the fetched addr */
473 rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen);
474 if (rc)
475 return rc;
476 if (exec_ilen == 6) {
477 /* EXECUTE RELATIVE LONG - RIL-b format */
478 s32 rl = *((s32 *) (opcode + 1));
479
480 /* rl is a _signed_ 32 bit value specifying halfwords */
481 *addr += (u64)(s64) rl * 2;
482 } else {
483 /* EXECUTE - RX-a format */
484 u32 base = (opcode[1] & 0xf000) >> 12;
485 u32 disp = opcode[1] & 0x0fff;
486 u32 index = opcode[0] & 0x000f;
487
488 *addr = base ? vcpu->run->s.regs.gprs[base] : 0;
489 *addr += index ? vcpu->run->s.regs.gprs[index] : 0;
490 *addr += disp;
491 }
492 *addr = kvm_s390_logical_to_effective(vcpu, *addr);
493 }
494 return 0;
495}
496
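
per_fetched_addr() has to work out which instruction was really fetched, which may be the target of an EXECUTE (opcode 0x44, RX-a format) or of an EXECUTE RELATIVE LONG (the 0xc6..0 RIL-b form detected above). The address arithmetic from the hunk, lifted into a stand-alone helper; gprs[] stands in for the guest general registers, and the RIL-b immediate is composed explicitly from the two big-endian halfwords that the kernel reads with a single 32-bit load:

#include <stdint.h>

/* Target address of EXECUTE (ilen 4) or EXECUTE RELATIVE LONG (ilen 6). */
uint64_t exec_target(const uint16_t opcode[3], uint64_t exec_addr,
		     const uint64_t gprs[16], unsigned int ilen)
{
	if (ilen == 6) {
		/* RIL-b: signed 32-bit offset in halfword units */
		int32_t rl = (int32_t)(((uint32_t)opcode[1] << 16) | opcode[2]);

		return exec_addr + (uint64_t)(int64_t)rl * 2;
	}

	/* RX-a: base register + index register + 12-bit displacement */
	{
		uint32_t base  = (opcode[1] & 0xf000) >> 12;
		uint32_t disp  =  opcode[1] & 0x0fff;
		uint32_t index =  opcode[0] & 0x000f;
		uint64_t addr  = base ? gprs[base] : 0;

		addr += index ? gprs[index] : 0;
		addr += disp;
		return addr;
	}
}
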
440#define guest_per_enabled(vcpu) \ 497#define guest_per_enabled(vcpu) \
441 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) 498 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
442 499
443int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) 500int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
444{ 501{
502 const u64 cr10 = vcpu->arch.sie_block->gcr[10];
503 const u64 cr11 = vcpu->arch.sie_block->gcr[11];
445 const u8 ilen = kvm_s390_get_ilen(vcpu); 504 const u8 ilen = kvm_s390_get_ilen(vcpu);
446 struct kvm_s390_pgm_info pgm_info = { 505 struct kvm_s390_pgm_info pgm_info = {
447 .code = PGM_PER, 506 .code = PGM_PER,
448 .per_code = PER_CODE_IFETCH, 507 .per_code = PER_CODE_IFETCH,
449 .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), 508 .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
450 }; 509 };
510 unsigned long fetched_addr;
511 int rc;
451 512
452 /* 513 /*
453 * The PSW points to the next instruction, therefore the intercepted 514 * The PSW points to the next instruction, therefore the intercepted
454 * instruction generated a PER i-fetch event. PER address therefore 515 * instruction generated a PER i-fetch event. PER address therefore
455 * points at the previous PSW address (could be an EXECUTE function). 516 * points at the previous PSW address (could be an EXECUTE function).
456 */ 517 */
457 return kvm_s390_inject_prog_irq(vcpu, &pgm_info); 518 if (!guestdbg_enabled(vcpu))
519 return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
520
521 if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address))
522 vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
523
524 if (!guest_per_enabled(vcpu) ||
525 !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH))
526 return 0;
527
528 rc = per_fetched_addr(vcpu, &fetched_addr);
529 if (rc < 0)
530 return rc;
531 if (rc)
532 /* instruction-fetching exceptions */
533 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
534
535 if (in_addr_range(fetched_addr, cr10, cr11))
536 return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
537 return 0;
458} 538}
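
With guest debugging active, the reworked i-fetch intercept above does two things: it requests a user-space debug exit when the host debugger cares about the event, and it only injects the PER interrupt into the guest when the guest has i-fetch PER enabled in CR9 and the fetched address lies in the CR10..CR11 range. A sketch of the range test this relies on, assuming the usual inclusive, wrap-around semantics of the PER address range (the real helper lives elsewhere in guestdbg.c):

#include <stdint.h>

/*
 * CR10 is the range start, CR11 the end; if start > end the range wraps
 * around the top of the address space.  Assumed semantics, for illustration.
 */
int in_addr_range_sketch(uint64_t addr, uint64_t start, uint64_t end)
{
	if (start <= end)
		return addr >= start && addr <= end;
	return addr >= start || addr <= end;	/* wrapping interval */
}
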
459 539
460static void filter_guest_per_event(struct kvm_vcpu *vcpu) 540static int filter_guest_per_event(struct kvm_vcpu *vcpu)
461{ 541{
462 const u8 perc = vcpu->arch.sie_block->perc; 542 const u8 perc = vcpu->arch.sie_block->perc;
463 u64 peraddr = vcpu->arch.sie_block->peraddr;
464 u64 addr = vcpu->arch.sie_block->gpsw.addr; 543 u64 addr = vcpu->arch.sie_block->gpsw.addr;
465 u64 cr9 = vcpu->arch.sie_block->gcr[9]; 544 u64 cr9 = vcpu->arch.sie_block->gcr[9];
466 u64 cr10 = vcpu->arch.sie_block->gcr[10]; 545 u64 cr10 = vcpu->arch.sie_block->gcr[10];
467 u64 cr11 = vcpu->arch.sie_block->gcr[11]; 546 u64 cr11 = vcpu->arch.sie_block->gcr[11];
468 /* filter all events, demanded by the guest */ 547 /* filter all events, demanded by the guest */
469 u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; 548 u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
549 unsigned long fetched_addr;
550 int rc;
470 551
471 if (!guest_per_enabled(vcpu)) 552 if (!guest_per_enabled(vcpu))
472 guest_perc = 0; 553 guest_perc = 0;
@@ -478,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
478 guest_perc &= ~PER_CODE_BRANCH; 559 guest_perc &= ~PER_CODE_BRANCH;
479 560
480 /* filter "instruction-fetching" events */ 561 /* filter "instruction-fetching" events */
481 if (guest_perc & PER_CODE_IFETCH && 562 if (guest_perc & PER_CODE_IFETCH) {
482 !in_addr_range(peraddr, cr10, cr11)) 563 rc = per_fetched_addr(vcpu, &fetched_addr);
483 guest_perc &= ~PER_CODE_IFETCH; 564 if (rc < 0)
565 return rc;
566 /*
567 * Don't inject an irq on exceptions. This would make handling
568 * on icpt code 8 very complex (as PSW was already rewound).
569 */
570 if (rc || !in_addr_range(fetched_addr, cr10, cr11))
571 guest_perc &= ~PER_CODE_IFETCH;
572 }
484 573
485 /* All other PER events will be given to the guest */ 574 /* All other PER events will be given to the guest */
486 /* TODO: Check altered address/address space */ 575 /* TODO: Check altered address/address space */
@@ -489,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
489 578
490 if (!guest_perc) 579 if (!guest_perc)
491 vcpu->arch.sie_block->iprcc &= ~PGM_PER; 580 vcpu->arch.sie_block->iprcc &= ~PGM_PER;
581 return 0;
492} 582}
493 583
494#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) 584#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH)
@@ -496,14 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
496#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) 586#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1)
497#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) 587#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff)
498 588
499void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) 589int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
500{ 590{
501 int new_as; 591 int rc, new_as;
502 592
503 if (debug_exit_required(vcpu)) 593 if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc,
594 vcpu->arch.sie_block->peraddr))
504 vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; 595 vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
505 596
506 filter_guest_per_event(vcpu); 597 rc = filter_guest_per_event(vcpu);
598 if (rc)
599 return rc;
507 600
508 /* 601 /*
509 * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger 602 * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger
@@ -532,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
532 (pssec(vcpu) || old_ssec(vcpu))) 625 (pssec(vcpu) || old_ssec(vcpu)))
533 vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; 626 vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH;
534 } 627 }
628 return 0;
535} 629}
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 7a27eebab28a..59920f96ebc0 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu)
238 vcpu->stat.exit_program_interruption++; 238 vcpu->stat.exit_program_interruption++;
239 239
240 if (guestdbg_enabled(vcpu) && per_event(vcpu)) { 240 if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
241 kvm_s390_handle_per_event(vcpu); 241 rc = kvm_s390_handle_per_event(vcpu);
242 if (rc)
243 return rc;
242 /* the interrupt might have been filtered out completely */ 244 /* the interrupt might have been filtered out completely */
243 if (vcpu->arch.sie_block->iprcc == 0) 245 if (vcpu->arch.sie_block->iprcc == 0)
244 return 0; 246 return 0;
@@ -359,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
359 361
360static int handle_operexc(struct kvm_vcpu *vcpu) 362static int handle_operexc(struct kvm_vcpu *vcpu)
361{ 363{
364 psw_t oldpsw, newpsw;
365 int rc;
366
362 vcpu->stat.exit_operation_exception++; 367 vcpu->stat.exit_operation_exception++;
363 trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, 368 trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
364 vcpu->arch.sie_block->ipb); 369 vcpu->arch.sie_block->ipb);
@@ -369,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
369 374
370 if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) 375 if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
371 return -EOPNOTSUPP; 376 return -EOPNOTSUPP;
377 rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
378 if (rc)
379 return rc;
380 /*
381 * Avoid endless loops of operation exceptions, if the pgm new
382 * PSW will cause a new operation exception.
383 * The heuristic checks if the pgm new psw is within 6 bytes before
384 * the faulting psw address (with same DAT, AS settings) and the
385 * new psw is not a wait psw and the fault was not triggered by
386 * problem state.
387 */
388 oldpsw = vcpu->arch.sie_block->gpsw;
389 if (oldpsw.addr - newpsw.addr <= 6 &&
390 !(newpsw.mask & PSW_MASK_WAIT) &&
391 !(oldpsw.mask & PSW_MASK_PSTATE) &&
392 (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) &&
393 (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT))
394 return -EOPNOTSUPP;
372 395
373 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); 396 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
374} 397}
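
The new check in handle_operexc() keeps a guest from looping forever when its program-check new PSW points straight back at (or within one instruction of) the PSW that just took the operation exception. The heuristic as a stand-alone predicate; the MASK_* bits are placeholders for the real PSW_MASK_* constants, and "just before" means within 6 bytes, the maximum s390 instruction length:

#include <stdint.h>
#include <stdbool.h>

/* Placeholder PSW mask bits; the real values come from the PSW definitions. */
#define MASK_WAIT	(1ull << 0)
#define MASK_PSTATE	(1ull << 1)
#define MASK_ASC	(3ull << 2)
#define MASK_DAT	(1ull << 4)

struct psw_sketch { uint64_t mask, addr; };

/* True if injecting PGM_OPERATION would most likely fault again at once. */
bool operexc_would_loop(struct psw_sketch oldpsw, struct psw_sketch newpsw)
{
	return oldpsw.addr - newpsw.addr <= 6 &&
	       !(newpsw.mask & MASK_WAIT) &&
	       !(oldpsw.mask & MASK_PSTATE) &&
	       (newpsw.mask & MASK_ASC) == (oldpsw.mask & MASK_ASC) &&
	       (newpsw.mask & MASK_DAT) == (oldpsw.mask & MASK_DAT);
}
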
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b604854df02c..f5694838234d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -218,7 +218,7 @@ static void allow_cpu_feat(unsigned long nr)
218static inline int plo_test_bit(unsigned char nr) 218static inline int plo_test_bit(unsigned char nr)
219{ 219{
220 register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; 220 register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
221 int cc = 3; /* subfunction not available */ 221 int cc;
222 222
223 asm volatile( 223 asm volatile(
224 /* Parameter registers are ignored for "test bit" */ 224 /* Parameter registers are ignored for "test bit" */
@@ -371,6 +371,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
371 case KVM_CAP_S390_IRQCHIP: 371 case KVM_CAP_S390_IRQCHIP:
372 case KVM_CAP_VM_ATTRIBUTES: 372 case KVM_CAP_VM_ATTRIBUTES:
373 case KVM_CAP_MP_STATE: 373 case KVM_CAP_MP_STATE:
374 case KVM_CAP_IMMEDIATE_EXIT:
374 case KVM_CAP_S390_INJECT_IRQ: 375 case KVM_CAP_S390_INJECT_IRQ:
375 case KVM_CAP_S390_USER_SIGP: 376 case KVM_CAP_S390_USER_SIGP:
376 case KVM_CAP_S390_USER_STSI: 377 case KVM_CAP_S390_USER_STSI:
@@ -443,6 +444,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
443 struct kvm_memory_slot *memslot; 444 struct kvm_memory_slot *memslot;
444 int is_dirty = 0; 445 int is_dirty = 0;
445 446
447 if (kvm_is_ucontrol(kvm))
448 return -EINVAL;
449
446 mutex_lock(&kvm->slots_lock); 450 mutex_lock(&kvm->slots_lock);
447 451
448 r = -EINVAL; 452 r = -EINVAL;
@@ -506,6 +510,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
506 } else if (MACHINE_HAS_VX) { 510 } else if (MACHINE_HAS_VX) {
507 set_kvm_facility(kvm->arch.model.fac_mask, 129); 511 set_kvm_facility(kvm->arch.model.fac_mask, 129);
508 set_kvm_facility(kvm->arch.model.fac_list, 129); 512 set_kvm_facility(kvm->arch.model.fac_list, 129);
513 if (test_facility(134)) {
514 set_kvm_facility(kvm->arch.model.fac_mask, 134);
515 set_kvm_facility(kvm->arch.model.fac_list, 134);
516 }
517 if (test_facility(135)) {
518 set_kvm_facility(kvm->arch.model.fac_mask, 135);
519 set_kvm_facility(kvm->arch.model.fac_list, 135);
520 }
509 r = 0; 521 r = 0;
510 } else 522 } else
511 r = -EINVAL; 523 r = -EINVAL;
@@ -822,6 +834,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
822 } 834 }
823 memcpy(kvm->arch.model.fac_list, proc->fac_list, 835 memcpy(kvm->arch.model.fac_list, proc->fac_list,
824 S390_ARCH_FAC_LIST_SIZE_BYTE); 836 S390_ARCH_FAC_LIST_SIZE_BYTE);
837 VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
838 kvm->arch.model.ibc,
839 kvm->arch.model.cpuid);
840 VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
841 kvm->arch.model.fac_list[0],
842 kvm->arch.model.fac_list[1],
843 kvm->arch.model.fac_list[2]);
825 } else 844 } else
826 ret = -EFAULT; 845 ret = -EFAULT;
827 kfree(proc); 846 kfree(proc);
@@ -895,6 +914,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
895 proc->ibc = kvm->arch.model.ibc; 914 proc->ibc = kvm->arch.model.ibc;
896 memcpy(&proc->fac_list, kvm->arch.model.fac_list, 915 memcpy(&proc->fac_list, kvm->arch.model.fac_list,
897 S390_ARCH_FAC_LIST_SIZE_BYTE); 916 S390_ARCH_FAC_LIST_SIZE_BYTE);
917 VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
918 kvm->arch.model.ibc,
919 kvm->arch.model.cpuid);
920 VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
921 kvm->arch.model.fac_list[0],
922 kvm->arch.model.fac_list[1],
923 kvm->arch.model.fac_list[2]);
898 if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) 924 if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
899 ret = -EFAULT; 925 ret = -EFAULT;
900 kfree(proc); 926 kfree(proc);
@@ -918,6 +944,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
918 S390_ARCH_FAC_LIST_SIZE_BYTE); 944 S390_ARCH_FAC_LIST_SIZE_BYTE);
919 memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, 945 memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
920 sizeof(S390_lowcore.stfle_fac_list)); 946 sizeof(S390_lowcore.stfle_fac_list));
947 VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx",
948 kvm->arch.model.ibc,
949 kvm->arch.model.cpuid);
950 VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx",
951 mach->fac_mask[0],
952 mach->fac_mask[1],
953 mach->fac_mask[2]);
954 VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx",
955 mach->fac_list[0],
956 mach->fac_list[1],
957 mach->fac_list[2]);
921 if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) 958 if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
922 ret = -EFAULT; 959 ret = -EFAULT;
923 kfree(mach); 960 kfree(mach);
@@ -1939,6 +1976,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1939 1976
1940 if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) 1977 if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
1941 vcpu->arch.sie_block->ecb2 |= 0x08; 1978 vcpu->arch.sie_block->ecb2 |= 0x08;
1979 if (test_kvm_facility(vcpu->kvm, 130))
1980 vcpu->arch.sie_block->ecb2 |= 0x20;
1942 vcpu->arch.sie_block->eca = 0x1002000U; 1981 vcpu->arch.sie_block->eca = 0x1002000U;
1943 if (sclp.has_cei) 1982 if (sclp.has_cei)
1944 vcpu->arch.sie_block->eca |= 0x80000000U; 1983 vcpu->arch.sie_block->eca |= 0x80000000U;
@@ -2579,7 +2618,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
2579 * to look up the current opcode to get the length of the instruction 2618 * to look up the current opcode to get the length of the instruction
2580 * to be able to forward the PSW. 2619 * to be able to forward the PSW.
2581 */ 2620 */
2582 rc = read_guest_instr(vcpu, &opcode, 1); 2621 rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1);
2583 ilen = insn_length(opcode); 2622 ilen = insn_length(opcode);
2584 if (rc < 0) { 2623 if (rc < 0) {
2585 return rc; 2624 return rc;
@@ -2761,6 +2800,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2761 int rc; 2800 int rc;
2762 sigset_t sigsaved; 2801 sigset_t sigsaved;
2763 2802
2803 if (kvm_run->immediate_exit)
2804 return -EINTR;
2805
2764 if (guestdbg_exit_pending(vcpu)) { 2806 if (guestdbg_exit_pending(vcpu)) {
2765 kvm_s390_prepare_debug_exit(vcpu); 2807 kvm_s390_prepare_debug_exit(vcpu);
2766 return 0; 2808 return 0;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 3a4e97f1a9e6..af9fa91a0c91 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
86 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 86 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
87} 87}
88 88
89typedef u8 __bitwise ar_t; 89static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
90
91static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
92{ 90{
93 u32 base2 = vcpu->arch.sie_block->ipb >> 28; 91 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
94 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 92 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
101 99
102static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, 100static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
103 u64 *address1, u64 *address2, 101 u64 *address1, u64 *address2,
104 ar_t *ar_b1, ar_t *ar_b2) 102 u8 *ar_b1, u8 *ar_b2)
105{ 103{
106 u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 104 u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
107 u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; 105 u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
@@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2
125 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; 123 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
126} 124}
127 125
128static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) 126static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar)
129{ 127{
130 u32 base2 = vcpu->arch.sie_block->ipb >> 28; 128 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
131 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + 129 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
@@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
140 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; 138 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
141} 139}
142 140
143static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar) 141static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar)
144{ 142{
145 u32 base2 = vcpu->arch.sie_block->ipb >> 28; 143 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
146 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); 144 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -379,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
379void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); 377void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
380void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); 378void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
381int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); 379int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
382void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); 380int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
383 381
384/* support for Basic/Extended SCA handling */ 382/* support for Basic/Extended SCA handling */
385static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) 383static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 794503516bd4..fb4b494cde9b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
54static int handle_set_clock(struct kvm_vcpu *vcpu) 54static int handle_set_clock(struct kvm_vcpu *vcpu)
55{ 55{
56 int rc; 56 int rc;
57 ar_t ar; 57 u8 ar;
58 u64 op2, val; 58 u64 op2, val;
59 59
60 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 60 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
79 u64 operand2; 79 u64 operand2;
80 u32 address; 80 u32 address;
81 int rc; 81 int rc;
82 ar_t ar; 82 u8 ar;
83 83
84 vcpu->stat.instruction_spx++; 84 vcpu->stat.instruction_spx++;
85 85
@@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
117 u64 operand2; 117 u64 operand2;
118 u32 address; 118 u32 address;
119 int rc; 119 int rc;
120 ar_t ar; 120 u8 ar;
121 121
122 vcpu->stat.instruction_stpx++; 122 vcpu->stat.instruction_stpx++;
123 123
@@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
147 u16 vcpu_id = vcpu->vcpu_id; 147 u16 vcpu_id = vcpu->vcpu_id;
148 u64 ga; 148 u64 ga;
149 int rc; 149 int rc;
150 ar_t ar; 150 u8 ar;
151 151
152 vcpu->stat.instruction_stap++; 152 vcpu->stat.instruction_stap++;
153 153
@@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
380 u32 tpi_data[3]; 380 u32 tpi_data[3];
381 int rc; 381 int rc;
382 u64 addr; 382 u64 addr;
383 ar_t ar; 383 u8 ar;
384 384
385 addr = kvm_s390_get_base_disp_s(vcpu, &ar); 385 addr = kvm_s390_get_base_disp_s(vcpu, &ar);
386 if (addr & 3) 386 if (addr & 3)
@@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
548 psw_compat_t new_psw; 548 psw_compat_t new_psw;
549 u64 addr; 549 u64 addr;
550 int rc; 550 int rc;
551 ar_t ar; 551 u8 ar;
552 552
553 if (gpsw->mask & PSW_MASK_PSTATE) 553 if (gpsw->mask & PSW_MASK_PSTATE)
554 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 554 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
575 psw_t new_psw; 575 psw_t new_psw;
576 u64 addr; 576 u64 addr;
577 int rc; 577 int rc;
578 ar_t ar; 578 u8 ar;
579 579
580 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 580 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
581 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); 581 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
597 u64 stidp_data = vcpu->kvm->arch.model.cpuid; 597 u64 stidp_data = vcpu->kvm->arch.model.cpuid;
598 u64 operand2; 598 u64 operand2;
599 int rc; 599 int rc;
600 ar_t ar; 600 u8 ar;
601 601
602 vcpu->stat.instruction_stidp++; 602 vcpu->stat.instruction_stidp++;
603 603
@@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
644 ASCEBC(mem->vm[0].cpi, 16); 644 ASCEBC(mem->vm[0].cpi, 16);
645} 645}
646 646
647static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar, 647static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar,
648 u8 fc, u8 sel1, u16 sel2) 648 u8 fc, u8 sel1, u16 sel2)
649{ 649{
650 vcpu->run->exit_reason = KVM_EXIT_S390_STSI; 650 vcpu->run->exit_reason = KVM_EXIT_S390_STSI;
@@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
663 unsigned long mem = 0; 663 unsigned long mem = 0;
664 u64 operand2; 664 u64 operand2;
665 int rc = 0; 665 int rc = 0;
666 ar_t ar; 666 u8 ar;
667 667
668 vcpu->stat.instruction_stsi++; 668 vcpu->stat.instruction_stsi++;
669 VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); 669 VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2);
@@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
970 int reg, rc, nr_regs; 970 int reg, rc, nr_regs;
971 u32 ctl_array[16]; 971 u32 ctl_array[16];
972 u64 ga; 972 u64 ga;
973 ar_t ar; 973 u8 ar;
974 974
975 vcpu->stat.instruction_lctl++; 975 vcpu->stat.instruction_lctl++;
976 976
@@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
1009 int reg, rc, nr_regs; 1009 int reg, rc, nr_regs;
1010 u32 ctl_array[16]; 1010 u32 ctl_array[16];
1011 u64 ga; 1011 u64 ga;
1012 ar_t ar; 1012 u8 ar;
1013 1013
1014 vcpu->stat.instruction_stctl++; 1014 vcpu->stat.instruction_stctl++;
1015 1015
@@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
1043 int reg, rc, nr_regs; 1043 int reg, rc, nr_regs;
1044 u64 ctl_array[16]; 1044 u64 ctl_array[16];
1045 u64 ga; 1045 u64 ga;
1046 ar_t ar; 1046 u8 ar;
1047 1047
1048 vcpu->stat.instruction_lctlg++; 1048 vcpu->stat.instruction_lctlg++;
1049 1049
@@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
1081 int reg, rc, nr_regs; 1081 int reg, rc, nr_regs;
1082 u64 ctl_array[16]; 1082 u64 ctl_array[16];
1083 u64 ga; 1083 u64 ga;
1084 ar_t ar; 1084 u8 ar;
1085 1085
1086 vcpu->stat.instruction_stctg++; 1086 vcpu->stat.instruction_stctg++;
1087 1087
@@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
1132 unsigned long hva, gpa; 1132 unsigned long hva, gpa;
1133 int ret = 0, cc = 0; 1133 int ret = 0, cc = 0;
1134 bool writable; 1134 bool writable;
1135 ar_t ar; 1135 u8 ar;
1136 1136
1137 vcpu->stat.instruction_tprot++; 1137 vcpu->stat.instruction_tprot++;
1138 1138
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index a9a9d974d9a4..38556e395915 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
324 /* Run-time-Instrumentation */ 324 /* Run-time-Instrumentation */
325 if (test_kvm_facility(vcpu->kvm, 64)) 325 if (test_kvm_facility(vcpu->kvm, 64))
326 scb_s->ecb3 |= scb_o->ecb3 & 0x01U; 326 scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
327 /* Instruction Execution Prevention */
328 if (test_kvm_facility(vcpu->kvm, 130))
329 scb_s->ecb2 |= scb_o->ecb2 & 0x20U;
327 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) 330 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
328 scb_s->eca |= scb_o->eca & 0x00000001U; 331 scb_s->eca |= scb_o->eca & 0x00000001U;
329 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) 332 if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index beb90f3993e6..b48dc5f1900b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -744,7 +744,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
744 744
745 pgste_set_unlock(ptep, new); 745 pgste_set_unlock(ptep, new);
746 pte_unmap_unlock(ptep, ptl); 746 pte_unmap_unlock(ptep, ptl);
747 return 0; 747 return cc;
748} 748}
749EXPORT_SYMBOL(reset_guest_reference_bit); 749EXPORT_SYMBOL(reset_guest_reference_bit);
750 750
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 8cc53b1e6d03..0cf802de52a1 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -80,6 +80,8 @@ static struct facility_def facility_defs[] = {
80 76, /* msa extension 3 */ 80 76, /* msa extension 3 */
81 77, /* msa extension 4 */ 81 77, /* msa extension 4 */
82 78, /* enhanced-DAT 2 */ 82 78, /* enhanced-DAT 2 */
83 130, /* instruction-execution-protection */
84 131, /* enhanced-SOP 2 and side-effect */
83 -1 /* END */ 85 -1 /* END */
84 } 86 }
85 }, 87 },
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 12080d87da3b..cb8f9149f6c8 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -177,16 +177,8 @@ static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
177 struct desc_struct *d = get_cpu_gdt_table(cpu); 177 struct desc_struct *d = get_cpu_gdt_table(cpu);
178 tss_desc tss; 178 tss_desc tss;
179 179
180 /*
181 * sizeof(unsigned long) coming from an extra "long" at the end
182 * of the iobitmap. See tss_struct definition in processor.h
183 *
184 * -1? seg base+limit should be pointing to the address of the
185 * last valid byte
186 */
187 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, 180 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
188 IO_BITMAP_OFFSET + IO_BITMAP_BYTES + 181 __KERNEL_TSS_LIMIT);
189 sizeof(unsigned long) - 1);
190 write_gdt_entry(d, entry, &tss, DESC_TSS); 182 write_gdt_entry(d, entry, &tss, DESC_TSS);
191} 183}
192 184
@@ -213,6 +205,54 @@ static inline void native_load_tr_desc(void)
213 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); 205 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
214} 206}
215 207
208static inline void force_reload_TR(void)
209{
210 struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
211 tss_desc tss;
212
213 memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
214
215 /*
216 * LTR requires an available TSS, and the TSS is currently
217 * busy. Make it be available so that LTR will work.
218 */
219 tss.type = DESC_TSS;
220 write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
221
222 load_TR_desc();
223}
224
225DECLARE_PER_CPU(bool, need_tr_refresh);
226
227static inline void refresh_TR(void)
228{
229 DEBUG_LOCKS_WARN_ON(preemptible());
230
231 if (unlikely(this_cpu_read(need_tr_refresh))) {
232 force_reload_TR();
233 this_cpu_write(need_tr_refresh, false);
234 }
235}
236
237/*
238 * If you do something evil that corrupts the cached TSS limit (I'm looking
239 * at you, VMX exits), call this function.
240 *
241 * The optimization here is that the TSS limit only matters for Linux if the
242 * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
243 * everything works except that the IO bitmap will be ignored and all CPL 3 IO
244 * instructions will #GP, which is exactly what we want for normal tasks.
245 */
246static inline void invalidate_tss_limit(void)
247{
248 DEBUG_LOCKS_WARN_ON(preemptible());
249
250 if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
251 force_reload_TR();
252 else
253 this_cpu_write(need_tr_refresh, true);
254}
255
216static inline void native_load_gdt(const struct desc_ptr *dtr) 256static inline void native_load_gdt(const struct desc_ptr *dtr)
217{ 257{
218 asm volatile("lgdt %0"::"m" (*dtr)); 258 asm volatile("lgdt %0"::"m" (*dtr));
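
The open-coded TSS limit in __set_tss_desc() is replaced by __KERNEL_TSS_LIMIT (defined in the processor.h hunk later in this diff), and a VMX exit, which resets the cached TR limit to its architectural minimum, now only forces an immediate LTR when the current task really uses an I/O bitmap; otherwise the reload is deferred to refresh_TR(). A quick check of the limit arithmetic, with assumed values for the I/O bitmap constants:

#include <stdio.h>

/* Assumed values in the spirit of the usual x86 definitions. */
#define IO_BITMAP_BITS		65536
#define IO_BITMAP_BYTES		(IO_BITMAP_BITS / 8)	/* 8192 */
#define IO_BITMAP_OFFSET	0x68	/* offsetof(tss, io_bitmap); placeholder */

#define KERNEL_TSS_LIMIT \
	(IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)

int main(void)
{
	/*
	 * Hardware TSS + I/O bitmap + the trailing all-ones long, minus one
	 * because a segment limit names the last valid byte.
	 */
	printf("kernel TSS limit = %#zx\n", KERNEL_TSS_LIMIT);
	return 0;
}
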
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e9cd7befcb76..3e8c287090e4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
441int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); 441int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
442void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); 442void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
443void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); 443void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
444bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
444 445
445#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 446#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc1a7e9..74ef58c8ff53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
55#define KVM_REQ_TRIPLE_FAULT 10 55#define KVM_REQ_TRIPLE_FAULT 10
56#define KVM_REQ_MMU_SYNC 11 56#define KVM_REQ_MMU_SYNC 11
57#define KVM_REQ_CLOCK_UPDATE 12 57#define KVM_REQ_CLOCK_UPDATE 12
58#define KVM_REQ_DEACTIVATE_FPU 13
59#define KVM_REQ_EVENT 14 58#define KVM_REQ_EVENT 14
60#define KVM_REQ_APF_HALT 15 59#define KVM_REQ_APF_HALT 15
61#define KVM_REQ_STEAL_UPDATE 16 60#define KVM_REQ_STEAL_UPDATE 16
@@ -115,7 +114,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
115 114
116#define KVM_PERMILLE_MMU_PAGES 20 115#define KVM_PERMILLE_MMU_PAGES 20
117#define KVM_MIN_ALLOC_MMU_PAGES 64 116#define KVM_MIN_ALLOC_MMU_PAGES 64
118#define KVM_MMU_HASH_SHIFT 10 117#define KVM_MMU_HASH_SHIFT 12
119#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) 118#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
120#define KVM_MIN_FREE_MMU_PAGES 5 119#define KVM_MIN_FREE_MMU_PAGES 5
121#define KVM_REFILL_PAGES 25 120#define KVM_REFILL_PAGES 25
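
Raising KVM_MMU_HASH_SHIFT from 10 to 12 quadruples the number of buckets in the shadow-page hash table; together with the max_mmu_page_hash_collisions counter added to kvm_vm_stat below, this is the visible side of the "improved hashing" work. The size change in one line:

#include <stdio.h>

int main(void)
{
	/* old: 1 << 10 buckets, new: 1 << 12 buckets */
	printf("mmu hash buckets: %d -> %d\n", 1 << 10, 1 << 12);	/* 1024 -> 4096 */
	return 0;
}
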
@@ -208,6 +207,13 @@ enum {
208 PFERR_WRITE_MASK | \ 207 PFERR_WRITE_MASK | \
209 PFERR_PRESENT_MASK) 208 PFERR_PRESENT_MASK)
210 209
210/*
211 * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
212 * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
213 * with the SVE bit in EPT PTEs.
214 */
215#define SPTE_SPECIAL_MASK (1ULL << 62)
216
211/* apic attention bits */ 217/* apic attention bits */
212#define KVM_APIC_CHECK_VAPIC 0 218#define KVM_APIC_CHECK_VAPIC 0
213/* 219/*
@@ -668,6 +674,9 @@ struct kvm_vcpu_arch {
668 674
669 int pending_ioapic_eoi; 675 int pending_ioapic_eoi;
670 int pending_external_vector; 676 int pending_external_vector;
677
678 /* GPA available (AMD only) */
679 bool gpa_available;
671}; 680};
672 681
673struct kvm_lpage_info { 682struct kvm_lpage_info {
@@ -716,6 +725,12 @@ struct kvm_hv {
716 HV_REFERENCE_TSC_PAGE tsc_ref; 725 HV_REFERENCE_TSC_PAGE tsc_ref;
717}; 726};
718 727
728enum kvm_irqchip_mode {
729 KVM_IRQCHIP_NONE,
730 KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
731 KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
732};
733
719struct kvm_arch { 734struct kvm_arch {
720 unsigned int n_used_mmu_pages; 735 unsigned int n_used_mmu_pages;
721 unsigned int n_requested_mmu_pages; 736 unsigned int n_requested_mmu_pages;
@@ -788,7 +803,7 @@ struct kvm_arch {
788 803
789 u64 disabled_quirks; 804 u64 disabled_quirks;
790 805
791 bool irqchip_split; 806 enum kvm_irqchip_mode irqchip_mode;
792 u8 nr_reserved_ioapic_pins; 807 u8 nr_reserved_ioapic_pins;
793 808
794 bool disabled_lapic_found; 809 bool disabled_lapic_found;
@@ -815,6 +830,7 @@ struct kvm_vm_stat {
815 ulong mmu_unsync; 830 ulong mmu_unsync;
816 ulong remote_tlb_flush; 831 ulong remote_tlb_flush;
817 ulong lpages; 832 ulong lpages;
833 ulong max_mmu_page_hash_collisions;
818}; 834};
819 835
820struct kvm_vcpu_stat { 836struct kvm_vcpu_stat {
@@ -844,6 +860,7 @@ struct kvm_vcpu_stat {
844 u64 hypercalls; 860 u64 hypercalls;
845 u64 irq_injections; 861 u64 irq_injections;
846 u64 nmi_injections; 862 u64 nmi_injections;
863 u64 req_event;
847}; 864};
848 865
849struct x86_instruction_info; 866struct x86_instruction_info;
@@ -918,8 +935,6 @@ struct kvm_x86_ops {
918 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 935 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
919 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 936 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
920 u32 (*get_pkru)(struct kvm_vcpu *vcpu); 937 u32 (*get_pkru)(struct kvm_vcpu *vcpu);
921 void (*fpu_activate)(struct kvm_vcpu *vcpu);
922 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
923 938
924 void (*tlb_flush)(struct kvm_vcpu *vcpu); 939 void (*tlb_flush)(struct kvm_vcpu *vcpu);
925 940
@@ -951,7 +966,7 @@ struct kvm_x86_ops {
951 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 966 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
952 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); 967 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
953 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 968 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
954 void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); 969 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
955 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 970 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
956 int (*get_tdp_level)(void); 971 int (*get_tdp_level)(void);
957 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 972 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -1050,7 +1065,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
1050void kvm_mmu_init_vm(struct kvm *kvm); 1065void kvm_mmu_init_vm(struct kvm *kvm);
1051void kvm_mmu_uninit_vm(struct kvm *kvm); 1066void kvm_mmu_uninit_vm(struct kvm *kvm);
1052void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 1067void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
1053 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); 1068 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
1069 u64 acc_track_mask);
1054 1070
1055void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 1071void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
1056void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 1072void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
new file mode 100644
index 000000000000..f260bef63591
--- /dev/null
+++ b/arch/x86/include/asm/kvmclock.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_KVM_CLOCK_H
2#define _ASM_X86_KVM_CLOCK_H
3
4extern struct clocksource kvm_clock;
5
6#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 1eea6ca40694..f75fbfe550f2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -673,7 +673,7 @@ static __always_inline void pv_kick(int cpu)
673 PVOP_VCALL1(pv_lock_ops.kick, cpu); 673 PVOP_VCALL1(pv_lock_ops.kick, cpu);
674} 674}
675 675
676static __always_inline bool pv_vcpu_is_preempted(int cpu) 676static __always_inline bool pv_vcpu_is_preempted(long cpu)
677{ 677{
678 return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); 678 return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
679} 679}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e6cfe7ba2d65..f385eca5407a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -304,7 +304,7 @@ struct x86_hw_tss {
304 u16 reserved5; 304 u16 reserved5;
305 u16 io_bitmap_base; 305 u16 io_bitmap_base;
306 306
307} __attribute__((packed)) ____cacheline_aligned; 307} __attribute__((packed));
308#endif 308#endif
309 309
310/* 310/*
@@ -342,6 +342,16 @@ struct tss_struct {
342 342
343DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); 343DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
344 344
345/*
346 * The sizeof(unsigned long) accounts for the extra "long" at the end
347 * of the iobitmap.
348 *
349 * The -1 is there because the segment base+limit should point to the
350 * address of the last valid byte.
351 */
352#define __KERNEL_TSS_LIMIT \
353 (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
354
345#ifdef CONFIG_X86_32 355#ifdef CONFIG_X86_32
346DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); 356DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
347#endif 357#endif
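
As a quick sanity check of the arithmetic behind __KERNEL_TSS_LIMIT, here is a standalone sketch; the IO_BITMAP_OFFSET value is a made-up stand-in for offsetof(struct tss_struct, io_bitmap), IO_BITMAP_BYTES is 65536/8 on x86, and the trailing -1 reflects the fact that segment limits are inclusive:

#include <stdio.h>
#include <stddef.h>

#define IO_BITMAP_BYTES   (65536 / 8)   /* one bit per I/O port */
#define IO_BITMAP_OFFSET  104           /* stand-in for offsetof(struct tss_struct, io_bitmap) */

int main(void)
{
	/* Mirrors __KERNEL_TSS_LIMIT: offset + bitmap + trailing long, minus one
	 * because a segment limit addresses the last valid byte, not one past it. */
	size_t limit = IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1;

	printf("kernel TSS limit = %zu\n", limit);
	return 0;
}
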
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index c343ab52579f..48a706f641f2 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -34,7 +34,7 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
34} 34}
35 35
36#define vcpu_is_preempted vcpu_is_preempted 36#define vcpu_is_preempted vcpu_is_preempted
37static inline bool vcpu_is_preempted(int cpu) 37static inline bool vcpu_is_preempted(long cpu)
38{ 38{
39 return pv_vcpu_is_preempted(cpu); 39 return pv_vcpu_is_preempted(cpu);
40} 40}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4b924e..cc54b7026567 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -467,8 +467,16 @@ enum vmcs_field {
467#define VMX_EPT_WRITABLE_MASK 0x2ull 467#define VMX_EPT_WRITABLE_MASK 0x2ull
468#define VMX_EPT_EXECUTABLE_MASK 0x4ull 468#define VMX_EPT_EXECUTABLE_MASK 0x4ull
469#define VMX_EPT_IPAT_BIT (1ull << 6) 469#define VMX_EPT_IPAT_BIT (1ull << 6)
470#define VMX_EPT_ACCESS_BIT (1ull << 8) 470#define VMX_EPT_ACCESS_BIT (1ull << 8)
471#define VMX_EPT_DIRTY_BIT (1ull << 9) 471#define VMX_EPT_DIRTY_BIT (1ull << 9)
472#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
473 VMX_EPT_WRITABLE_MASK | \
474 VMX_EPT_EXECUTABLE_MASK)
475#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
476
477/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
478#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \
479 VMX_EPT_EXECUTABLE_MASK)
472 480
473#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 481#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
474 482
@@ -500,6 +508,22 @@ struct vmx_msr_entry {
500#define ENTRY_FAIL_VMCS_LINK_PTR 4 508#define ENTRY_FAIL_VMCS_LINK_PTR 4
501 509
502/* 510/*
511 * Exit Qualifications for EPT Violations
512 */
513#define EPT_VIOLATION_ACC_READ_BIT 0
514#define EPT_VIOLATION_ACC_WRITE_BIT 1
515#define EPT_VIOLATION_ACC_INSTR_BIT 2
516#define EPT_VIOLATION_READABLE_BIT 3
517#define EPT_VIOLATION_WRITABLE_BIT 4
518#define EPT_VIOLATION_EXECUTABLE_BIT 5
519#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
520#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
521#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
522#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
523#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
524#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
525
526/*
503 * VM-instruction error numbers 527 * VM-instruction error numbers
504 */ 528 */
505enum vm_instruction_error_number { 529enum vm_instruction_error_number {
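
A minimal sketch of how the new EPT-violation qualification bits compose; the helper below is hypothetical and only illustrates the intended decoding, with 'qual' standing in for the VM-exit qualification field:

#include <stdbool.h>

/* Hypothetical helper built on the bit definitions added above. */
static inline bool ept_violation_is_write_to_nonpresent(unsigned long qual)
{
	bool write_access = qual & EPT_VIOLATION_ACC_WRITE;
	bool was_present  = qual & (EPT_VIOLATION_READABLE |
				    EPT_VIOLATION_WRITABLE |
				    EPT_VIOLATION_EXECUTABLE);

	/* e.g. a write fault against a translation with no permissions at all */
	return write_access && !was_present;
}
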
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 1421a6585126..cff0bb6556f8 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -50,6 +50,15 @@ struct kvm_steal_time {
50 __u32 pad[11]; 50 __u32 pad[11];
51}; 51};
52 52
53#define KVM_CLOCK_PAIRING_WALLCLOCK 0
54struct kvm_clock_pairing {
55 __s64 sec;
56 __s64 nsec;
57 __u64 tsc;
58 __u32 flags;
59 __u32 pad[9];
60};
61
53#define KVM_STEAL_ALIGNMENT_BITS 5 62#define KVM_STEAL_ALIGNMENT_BITS 5
54#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) 63#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
55#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) 64#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
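
A sketch of how a guest-side consumer might read struct kvm_clock_pairing once the host has filled it in; the hypercall plumbing is omitted and the helper name is invented here, only the field layout comes from the patch:

/* Hypothetical consumer: the host samples its wall clock and the guest TSC
 * at the same instant, letting the guest correlate the two timebases. */
static void report_clock_pairing(const struct kvm_clock_pairing *p)
{
	if (p->flags != KVM_CLOCK_PAIRING_WALLCLOCK)
		return;

	pr_info("host wall clock %lld.%09lld at guest TSC %llu\n",
		(long long)p->sec, (long long)p->nsec,
		(unsigned long long)p->tsc);
}
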
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 210927ee2e74..99332f550c48 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,6 +13,10 @@ static char syscalls_ia32[] = {
13#include <asm/syscalls_32.h> 13#include <asm/syscalls_32.h>
14}; 14};
15 15
16#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
17#include <asm/kvm_para.h>
18#endif
19
16int main(void) 20int main(void)
17{ 21{
18#ifdef CONFIG_PARAVIRT 22#ifdef CONFIG_PARAVIRT
@@ -22,6 +26,11 @@ int main(void)
22 BLANK(); 26 BLANK();
23#endif 27#endif
24 28
29#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
30 OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
31 BLANK();
32#endif
33
25#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) 34#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
26 ENTRY(bx); 35 ENTRY(bx);
27 ENTRY(cx); 36 ENTRY(cx);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 589b3193f102..b01bc8517450 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -16,6 +16,7 @@
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <asm/syscalls.h> 18#include <asm/syscalls.h>
19#include <asm/desc.h>
19 20
20/* 21/*
21 * this changes the io permissions bitmap in the current task. 22 * this changes the io permissions bitmap in the current task.
@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
45 memset(bitmap, 0xff, IO_BITMAP_BYTES); 46 memset(bitmap, 0xff, IO_BITMAP_BYTES);
46 t->io_bitmap_ptr = bitmap; 47 t->io_bitmap_ptr = bitmap;
47 set_thread_flag(TIF_IO_BITMAP); 48 set_thread_flag(TIF_IO_BITMAP);
49
50 preempt_disable();
51 refresh_TR();
52 preempt_enable();
48 } 53 }
49 54
50 /* 55 /*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba4981d..14f65a5f938e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -589,7 +589,8 @@ out:
589 local_irq_restore(flags); 589 local_irq_restore(flags);
590} 590}
591 591
592__visible bool __kvm_vcpu_is_preempted(int cpu) 592#ifdef CONFIG_X86_32
593__visible bool __kvm_vcpu_is_preempted(long cpu)
593{ 594{
594 struct kvm_steal_time *src = &per_cpu(steal_time, cpu); 595 struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
595 596
@@ -597,6 +598,29 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
597} 598}
598PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); 599PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
599 600
601#else
602
603#include <asm/asm-offsets.h>
604
605extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
606
607/*
608 * Hand-optimized version for x86-64 that avoids saving and restoring
609 * 8 64-bit registers to/from the stack.
610 */
611asm(
612".pushsection .text;"
613".global __raw_callee_save___kvm_vcpu_is_preempted;"
614".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
615"__raw_callee_save___kvm_vcpu_is_preempted:"
616"movq __per_cpu_offset(,%rdi,8), %rax;"
617"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
618"setne %al;"
619"ret;"
620".popsection");
621
622#endif
623
600/* 624/*
601 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. 625 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
602 */ 626 */
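
For reference, the hand-written x86-64 thunk above performs the same test as the 32-bit C variant; a sketch of the equivalent logic follows, and the whole point of the asm is simply to skip the callee-saved register spills that the normal PV thunk path would force:

/* Sketch only: what the asm computes, written as ordinary C. */
__visible bool __kvm_vcpu_is_preempted_sketch(long cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

	/* cmpb $0, steal_time.preempted(%rax); setne %al */
	return !!src->preempted;
}
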
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 542710b99f52..bae6ea6cfb94 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
28 28
29#include <asm/x86_init.h> 29#include <asm/x86_init.h>
30#include <asm/reboot.h> 30#include <asm/reboot.h>
31#include <asm/kvmclock.h>
31 32
32static int kvmclock __ro_after_init = 1; 33static int kvmclock __ro_after_init = 1;
33static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 34static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
@@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
49{ 50{
50 return hv_clock; 51 return hv_clock;
51} 52}
53EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
52 54
53/* 55/*
54 * The wallclock is the time of day when we booted. Since then, some time may 56 * The wallclock is the time of day when we booted. Since then, some time may
@@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void)
174 return ret; 176 return ret;
175} 177}
176 178
177static struct clocksource kvm_clock = { 179struct clocksource kvm_clock = {
178 .name = "kvm-clock", 180 .name = "kvm-clock",
179 .read = kvm_clock_get_cycles, 181 .read = kvm_clock_get_cycles,
180 .rating = 400, 182 .rating = 400,
181 .mask = CLOCKSOURCE_MASK(64), 183 .mask = CLOCKSOURCE_MASK(64),
182 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 184 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
183}; 185};
186EXPORT_SYMBOL_GPL(kvm_clock);
184 187
185int kvm_register_clock(char *txt) 188int kvm_register_clock(char *txt)
186{ 189{
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 6259327f3454..8f2d1c9d43a8 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -20,7 +20,7 @@ bool pv_is_native_spin_unlock(void)
20 __raw_callee_save___native_queued_spin_unlock; 20 __raw_callee_save___native_queued_spin_unlock;
21} 21}
22 22
23__visible bool __native_vcpu_is_preempted(int cpu) 23__visible bool __native_vcpu_is_preempted(long cpu)
24{ 24{
25 return false; 25 return false;
26} 26}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b615a1113f58..7780efa635b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
32#include <asm/mce.h> 32#include <asm/mce.h>
33#include <asm/vm86.h> 33#include <asm/vm86.h>
34#include <asm/switch_to.h> 34#include <asm/switch_to.h>
35#include <asm/desc.h>
35 36
36/* 37/*
37 * per-CPU TSS segments. Threads are completely 'soft' on Linux, 38 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -64,6 +65,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
64}; 65};
65EXPORT_PER_CPU_SYMBOL(cpu_tss); 66EXPORT_PER_CPU_SYMBOL(cpu_tss);
66 67
68DEFINE_PER_CPU(bool, need_tr_refresh);
69EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
70
67/* 71/*
68 * this gets called so that we can store lazy state into memory and copy the 72 * this gets called so that we can store lazy state into memory and copy the
69 * current task into the new thread. 73 * current task into the new thread.
@@ -209,6 +213,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
209 */ 213 */
210 memcpy(tss->io_bitmap, next->io_bitmap_ptr, 214 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
211 max(prev->io_bitmap_max, next->io_bitmap_max)); 215 max(prev->io_bitmap_max, next->io_bitmap_max));
216
217 /*
218 * Make sure that the TSS limit is correct for the CPU
219 * to notice the IO bitmap.
220 */
221 refresh_TR();
212 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { 222 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
213 /* 223 /*
214 * Clear any possible leftover bits: 224 * Clear any possible leftover bits:
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e85f6bd7b9d5..1d155cc56629 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
123 if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) 123 if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
124 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 124 best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
125 125
126 kvm_x86_ops->fpu_activate(vcpu);
127
128 /* 126 /*
129 * The existing code assumes virtual address is 48-bit in the canonical 127 * The existing code assumes virtual address is 48-bit in the canonical
130 * address checks; exit if it is ever changed. 128 * address checks; exit if it is ever changed.
@@ -383,7 +381,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
383 381
384 /* cpuid 7.0.ecx*/ 382 /* cpuid 7.0.ecx*/
385 const u32 kvm_cpuid_7_0_ecx_x86_features = 383 const u32 kvm_cpuid_7_0_ecx_x86_features =
386 F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; 384 F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
387 385
388 /* cpuid 7.0.edx*/ 386 /* cpuid 7.0.edx*/
389 const u32 kvm_cpuid_7_0_edx_x86_features = 387 const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -861,12 +859,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
861 if (!best) 859 if (!best)
862 best = check_cpuid_limit(vcpu, function, index); 860 best = check_cpuid_limit(vcpu, function, index);
863 861
864 /*
865 * Perfmon not yet supported for L2 guest.
866 */
867 if (is_guest_mode(vcpu) && function == 0xa)
868 best = NULL;
869
870 if (best) { 862 if (best) {
871 *eax = best->eax; 863 *eax = best->eax;
872 *ebx = best->ebx; 864 *ebx = best->ebx;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cedbba0f3402..45c7306c8780 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -173,6 +173,7 @@
173#define NearBranch ((u64)1 << 52) /* Near branches */ 173#define NearBranch ((u64)1 << 52) /* Near branches */
174#define No16 ((u64)1 << 53) /* No 16 bit operand */ 174#define No16 ((u64)1 << 53) /* No 16 bit operand */
175#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ 175#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
176#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operands */
176 177
177#define DstXacc (DstAccLo | SrcAccHi | SrcWrite) 178#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
178 179
@@ -4298,7 +4299,7 @@ static const struct opcode group1[] = {
4298}; 4299};
4299 4300
4300static const struct opcode group1A[] = { 4301static const struct opcode group1A[] = {
4301 I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, 4302 I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
4302}; 4303};
4303 4304
4304static const struct opcode group2[] = { 4305static const struct opcode group2[] = {
@@ -4336,7 +4337,7 @@ static const struct opcode group5[] = {
4336 I(SrcMemFAddr | ImplicitOps, em_call_far), 4337 I(SrcMemFAddr | ImplicitOps, em_call_far),
4337 I(SrcMem | NearBranch, em_jmp_abs), 4338 I(SrcMem | NearBranch, em_jmp_abs),
4338 I(SrcMemFAddr | ImplicitOps, em_jmp_far), 4339 I(SrcMemFAddr | ImplicitOps, em_jmp_far),
4339 I(SrcMem | Stack, em_push), D(Undefined), 4340 I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
4340}; 4341};
4341 4342
4342static const struct opcode group6[] = { 4343static const struct opcode group6[] = {
@@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = {
4556 /* 0xA0 - 0xA7 */ 4557 /* 0xA0 - 0xA7 */
4557 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 4558 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
4558 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 4559 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
4559 I2bv(SrcSI | DstDI | Mov | String, em_mov), 4560 I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
4560 F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), 4561 F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
4561 /* 0xA8 - 0xAF */ 4562 /* 0xA8 - 0xAF */
4562 F2bv(DstAcc | SrcImm | NoWrite, em_test), 4563 F2bv(DstAcc | SrcImm | NoWrite, em_test),
4563 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 4564 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
@@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
5671{ 5672{
5672 writeback_registers(ctxt); 5673 writeback_registers(ctxt);
5673} 5674}
5675
5676bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
5677{
5678 if (ctxt->rep_prefix && (ctxt->d & String))
5679 return false;
5680
5681 if (ctxt->d & TwoMemOp)
5682 return false;
5683
5684 return true;
5685}
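
The intent of the new TwoMemOp flag is easiest to see from a hypothetical caller sketch: a guest physical address cached at fault time may only be reused when the instruction has a single memory operand and no REP prefix, otherwise the cached translation could be applied to the wrong access. The helper and variable names below are invented for illustration:

/* Hypothetical caller sketch. */
static gpa_t pick_gpa(struct x86_emulate_ctxt *ctxt, gpa_t cached_gpa,
		      bool cached_valid, unsigned long gva)
{
	if (cached_valid && emulator_can_use_gpa(ctxt))
		return cached_gpa;	/* reuse the fault-time translation */

	return translate_gva_to_gpa(ctxt, gva);	/* hypothetical slow-path helper */
}
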
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2ecd7dab4631..f701d4430727 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
305 return -ENOENT; 305 return -ENOENT;
306 306
307 memset(&irq, 0, sizeof(irq)); 307 memset(&irq, 0, sizeof(irq));
308 irq.dest_id = kvm_apic_id(vcpu->arch.apic); 308 irq.shorthand = APIC_DEST_SELF;
309 irq.dest_mode = APIC_DEST_PHYSICAL; 309 irq.dest_mode = APIC_DEST_PHYSICAL;
310 irq.delivery_mode = APIC_DM_FIXED; 310 irq.delivery_mode = APIC_DM_FIXED;
311 irq.vector = vector; 311 irq.vector = vector;
312 irq.level = 1; 312 irq.level = 1;
313 313
314 ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); 314 ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
315 trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); 315 trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
316 return ret; 316 return ret;
317} 317}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 7cc2360f1848..73ea24d4f119 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = {
598 .write = picdev_eclr_write, 598 .write = picdev_eclr_write,
599}; 599};
600 600
601struct kvm_pic *kvm_create_pic(struct kvm *kvm) 601int kvm_pic_init(struct kvm *kvm)
602{ 602{
603 struct kvm_pic *s; 603 struct kvm_pic *s;
604 int ret; 604 int ret;
605 605
606 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 606 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
607 if (!s) 607 if (!s)
608 return NULL; 608 return -ENOMEM;
609 spin_lock_init(&s->lock); 609 spin_lock_init(&s->lock);
610 s->kvm = kvm; 610 s->kvm = kvm;
611 s->pics[0].elcr_mask = 0xf8; 611 s->pics[0].elcr_mask = 0xf8;
@@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
635 635
636 mutex_unlock(&kvm->slots_lock); 636 mutex_unlock(&kvm->slots_lock);
637 637
638 return s; 638 kvm->arch.vpic = s;
639
640 return 0;
639 641
640fail_unreg_1: 642fail_unreg_1:
641 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); 643 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
@@ -648,13 +650,17 @@ fail_unlock:
648 650
649 kfree(s); 651 kfree(s);
650 652
651 return NULL; 653 return ret;
652} 654}
653 655
654void kvm_destroy_pic(struct kvm_pic *vpic) 656void kvm_pic_destroy(struct kvm *kvm)
655{ 657{
658 struct kvm_pic *vpic = kvm->arch.vpic;
659
656 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); 660 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
657 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); 661 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
658 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); 662 kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
663
664 kvm->arch.vpic = NULL;
659 kfree(vpic); 665 kfree(vpic);
660} 666}
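
With kvm_pic_init() now returning an error code and storing the PIC pointer itself, a caller can simply propagate failures instead of testing for NULL; a hypothetical ioctl-path sketch (label name invented):

r = kvm_pic_init(kvm);
if (r)
	goto create_irqchip_unlock;
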
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 035731eb3897..40d5b2cf6061 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -73,8 +73,8 @@ struct kvm_pic {
73 unsigned long irq_states[PIC_NUM_PINS]; 73 unsigned long irq_states[PIC_NUM_PINS];
74}; 74};
75 75
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 76int kvm_pic_init(struct kvm *kvm);
77void kvm_destroy_pic(struct kvm_pic *vpic); 77void kvm_pic_destroy(struct kvm *kvm);
78int kvm_pic_read_irq(struct kvm *kvm); 78int kvm_pic_read_irq(struct kvm *kvm);
79void kvm_pic_update_irq(struct kvm_pic *s); 79void kvm_pic_update_irq(struct kvm_pic *s);
80 80
@@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm)
93 93
94static inline int irqchip_split(struct kvm *kvm) 94static inline int irqchip_split(struct kvm *kvm)
95{ 95{
96 return kvm->arch.irqchip_split; 96 return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
97} 97}
98 98
99static inline int irqchip_in_kernel(struct kvm *kvm) 99static inline int irqchip_kernel(struct kvm *kvm)
100{ 100{
101 struct kvm_pic *vpic = pic_irqchip(kvm); 101 return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
102 bool ret; 102}
103 103
104 ret = (vpic != NULL); 104static inline int irqchip_in_kernel(struct kvm *kvm)
105 ret |= irqchip_split(kvm); 105{
106 bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
106 107
107 /* Read vpic before kvm->irq_routing. */ 108 /* Matches with wmb after initializing kvm->irq_routing. */
108 smp_rmb(); 109 smp_rmb();
109 return ret; 110 return ret;
110} 111}
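
Since all three predicates now derive from the single irqchip_mode field, their relationship is easiest to see side by side (summary added here, not taken from the patch):

/*
 * irqchip_mode          irqchip_in_kernel()  irqchip_kernel()  irqchip_split()
 * KVM_IRQCHIP_NONE               0                   0                 0
 * KVM_IRQCHIP_KERNEL             1                   1                 0
 * KVM_IRQCHIP_SPLIT              1                   0                 1
 */
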
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..b96d3893f121 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
41 bool line_status) 41 bool line_status)
42{ 42{
43 struct kvm_pic *pic = pic_irqchip(kvm); 43 struct kvm_pic *pic = pic_irqchip(kvm);
44
45 /*
46 * XXX: rejecting pic routes when pic isn't in use would be better,
47 * but the default routing table is installed while kvm->arch.vpic is
48 * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE.
49 */
50 if (!pic)
51 return -1;
52
53 return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); 44 return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
54} 45}
55 46
@@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
58 bool line_status) 49 bool line_status)
59{ 50{
60 struct kvm_ioapic *ioapic = kvm->arch.vioapic; 51 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
61
62 if (!ioapic)
63 return -1;
64
65 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, 52 return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
66 line_status); 53 line_status);
67} 54}
@@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm,
297 case KVM_IRQ_ROUTING_IRQCHIP: 284 case KVM_IRQ_ROUTING_IRQCHIP:
298 delta = 0; 285 delta = 0;
299 switch (ue->u.irqchip.irqchip) { 286 switch (ue->u.irqchip.irqchip) {
300 case KVM_IRQCHIP_PIC_MASTER:
301 e->set = kvm_set_pic_irq;
302 max_pin = PIC_NUM_PINS;
303 break;
304 case KVM_IRQCHIP_PIC_SLAVE: 287 case KVM_IRQCHIP_PIC_SLAVE:
288 delta = 8;
289 /* fall through */
290 case KVM_IRQCHIP_PIC_MASTER:
291 if (!pic_in_kernel(kvm))
292 goto out;
293
305 e->set = kvm_set_pic_irq; 294 e->set = kvm_set_pic_irq;
306 max_pin = PIC_NUM_PINS; 295 max_pin = PIC_NUM_PINS;
307 delta = 8;
308 break; 296 break;
309 case KVM_IRQCHIP_IOAPIC: 297 case KVM_IRQCHIP_IOAPIC:
298 if (!ioapic_in_kernel(kvm))
299 goto out;
300
310 max_pin = KVM_IOAPIC_NUM_PINS; 301 max_pin = KVM_IOAPIC_NUM_PINS;
311 e->set = kvm_set_ioapic_irq; 302 e->set = kvm_set_ioapic_irq;
312 break; 303 break;
@@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
409 400
410void kvm_arch_post_irq_routing_update(struct kvm *kvm) 401void kvm_arch_post_irq_routing_update(struct kvm *kvm)
411{ 402{
412 if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) 403 if (!irqchip_split(kvm))
413 return; 404 return;
414 kvm_make_scan_ioapic_request(kvm); 405 kvm_make_scan_ioapic_request(kvm);
415} 406}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f6ef5121a4c..bad6a25067bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic)
115 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 115 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
116 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 116 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
117 117
118static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
119{
120 return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
121}
122
123static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
124{
125 return apic->vcpu->vcpu_id;
126}
127
118static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, 128static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
119 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { 129 u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
120 switch (map->mode) { 130 switch (map->mode) {
@@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm)
159 struct kvm_apic_map *new, *old = NULL; 169 struct kvm_apic_map *new, *old = NULL;
160 struct kvm_vcpu *vcpu; 170 struct kvm_vcpu *vcpu;
161 int i; 171 int i;
162 u32 max_id = 255; 172 u32 max_id = 255; /* enough space for any xAPIC ID */
163 173
164 mutex_lock(&kvm->arch.apic_map_lock); 174 mutex_lock(&kvm->arch.apic_map_lock);
165 175
166 kvm_for_each_vcpu(i, vcpu, kvm) 176 kvm_for_each_vcpu(i, vcpu, kvm)
167 if (kvm_apic_present(vcpu)) 177 if (kvm_apic_present(vcpu))
168 max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); 178 max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
169 179
170 new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + 180 new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
171 sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); 181 sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
@@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm)
179 struct kvm_lapic *apic = vcpu->arch.apic; 189 struct kvm_lapic *apic = vcpu->arch.apic;
180 struct kvm_lapic **cluster; 190 struct kvm_lapic **cluster;
181 u16 mask; 191 u16 mask;
182 u32 ldr, aid; 192 u32 ldr;
193 u8 xapic_id;
194 u32 x2apic_id;
183 195
184 if (!kvm_apic_present(vcpu)) 196 if (!kvm_apic_present(vcpu))
185 continue; 197 continue;
186 198
187 aid = kvm_apic_id(apic); 199 xapic_id = kvm_xapic_id(apic);
188 ldr = kvm_lapic_get_reg(apic, APIC_LDR); 200 x2apic_id = kvm_x2apic_id(apic);
189 201
190 if (aid <= new->max_apic_id) 202 /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
191 new->phys_map[aid] = apic; 203 if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
204 x2apic_id <= new->max_apic_id)
205 new->phys_map[x2apic_id] = apic;
206 /*
207 * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
208 * so prevent them from masking VCPUs with APIC ID <= 0xff.
209 */
210 if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
211 new->phys_map[xapic_id] = apic;
212
213 ldr = kvm_lapic_get_reg(apic, APIC_LDR);
192 214
193 if (apic_x2apic_mode(apic)) { 215 if (apic_x2apic_mode(apic)) {
194 new->mode |= KVM_APIC_MODE_X2APIC; 216 new->mode |= KVM_APIC_MODE_X2APIC;
@@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
250{ 272{
251 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); 273 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
252 274
275 WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
276
253 kvm_lapic_set_reg(apic, APIC_ID, id); 277 kvm_lapic_set_reg(apic, APIC_ID, id);
254 kvm_lapic_set_reg(apic, APIC_LDR, ldr); 278 kvm_lapic_set_reg(apic, APIC_LDR, ldr);
255 recalculate_apic_map(apic->vcpu->kvm); 279 recalculate_apic_map(apic->vcpu->kvm);
@@ -317,7 +341,7 @@ static int find_highest_vector(void *bitmap)
317 vec >= 0; vec -= APIC_VECTORS_PER_REG) { 341 vec >= 0; vec -= APIC_VECTORS_PER_REG) {
318 reg = bitmap + REG_POS(vec); 342 reg = bitmap + REG_POS(vec);
319 if (*reg) 343 if (*reg)
320 return fls(*reg) - 1 + vec; 344 return __fls(*reg) + vec;
321 } 345 }
322 346
323 return -1; 347 return -1;
@@ -337,27 +361,32 @@ static u8 count_vectors(void *bitmap)
337 return count; 361 return count;
338} 362}
339 363
340void __kvm_apic_update_irr(u32 *pir, void *regs) 364int __kvm_apic_update_irr(u32 *pir, void *regs)
341{ 365{
342 u32 i, pir_val; 366 u32 i, vec;
367 u32 pir_val, irr_val;
368 int max_irr = -1;
343 369
344 for (i = 0; i <= 7; i++) { 370 for (i = vec = 0; i <= 7; i++, vec += 32) {
345 pir_val = READ_ONCE(pir[i]); 371 pir_val = READ_ONCE(pir[i]);
372 irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
346 if (pir_val) { 373 if (pir_val) {
347 pir_val = xchg(&pir[i], 0); 374 irr_val |= xchg(&pir[i], 0);
348 *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; 375 *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
349 } 376 }
377 if (irr_val)
378 max_irr = __fls(irr_val) + vec;
350 } 379 }
380
381 return max_irr;
351} 382}
352EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); 383EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
353 384
354void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) 385int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
355{ 386{
356 struct kvm_lapic *apic = vcpu->arch.apic; 387 struct kvm_lapic *apic = vcpu->arch.apic;
357 388
358 __kvm_apic_update_irr(pir, apic->regs); 389 return __kvm_apic_update_irr(pir, apic->regs);
359
360 kvm_make_request(KVM_REQ_EVENT, vcpu);
361} 390}
362EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 391EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
363 392
@@ -377,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
377 if (!apic->irr_pending) 406 if (!apic->irr_pending)
378 return -1; 407 return -1;
379 408
380 if (apic->vcpu->arch.apicv_active)
381 kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
382 result = apic_search_irr(apic); 409 result = apic_search_irr(apic);
383 ASSERT(result == -1 || result >= 16); 410 ASSERT(result == -1 || result >= 16);
384 411
@@ -392,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
392 vcpu = apic->vcpu; 419 vcpu = apic->vcpu;
393 420
394 if (unlikely(vcpu->arch.apicv_active)) { 421 if (unlikely(vcpu->arch.apicv_active)) {
395 /* try to update RVI */ 422 /* need to update RVI */
396 apic_clear_vector(vec, apic->regs + APIC_IRR); 423 apic_clear_vector(vec, apic->regs + APIC_IRR);
397 kvm_make_request(KVM_REQ_EVENT, vcpu); 424 kvm_x86_ops->hwapic_irr_update(vcpu,
425 apic_find_highest_irr(apic));
398 } else { 426 } else {
399 apic->irr_pending = false; 427 apic->irr_pending = false;
400 apic_clear_vector(vec, apic->regs + APIC_IRR); 428 apic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -484,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
484 */ 512 */
485 return apic_find_highest_irr(vcpu->arch.apic); 513 return apic_find_highest_irr(vcpu->arch.apic);
486} 514}
515EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
487 516
488static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 517static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
489 int vector, int level, int trig_mode, 518 int vector, int level, int trig_mode,
@@ -500,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
500 529
501static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 530static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
502{ 531{
503 532 return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
504 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, 533 sizeof(val));
505 sizeof(val));
506} 534}
507 535
508static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) 536static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
509{ 537{
510 538 return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
511 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, 539 sizeof(*val));
512 sizeof(*val));
513} 540}
514 541
515static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) 542static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -546,7 +573,19 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
546 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); 573 __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
547} 574}
548 575
549static void apic_update_ppr(struct kvm_lapic *apic) 576static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
577{
578 int highest_irr;
579 if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
580 highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
581 else
582 highest_irr = apic_find_highest_irr(apic);
583 if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
584 return -1;
585 return highest_irr;
586}
587
588static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
550{ 589{
551 u32 tpr, isrv, ppr, old_ppr; 590 u32 tpr, isrv, ppr, old_ppr;
552 int isr; 591 int isr;
@@ -564,13 +603,28 @@ static void apic_update_ppr(struct kvm_lapic *apic)
564 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 603 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
565 apic, ppr, isr, isrv); 604 apic, ppr, isr, isrv);
566 605
567 if (old_ppr != ppr) { 606 *new_ppr = ppr;
607 if (old_ppr != ppr)
568 kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); 608 kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
569 if (ppr < old_ppr) 609
570 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 610 return ppr < old_ppr;
571 } 611}
612
613static void apic_update_ppr(struct kvm_lapic *apic)
614{
615 u32 ppr;
616
617 if (__apic_update_ppr(apic, &ppr) &&
618 apic_has_interrupt_for_ppr(apic, ppr) != -1)
619 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
572} 620}
573 621
622void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
623{
624 apic_update_ppr(vcpu->arch.apic);
625}
626EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
627
574static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 628static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
575{ 629{
576 kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); 630 kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
@@ -579,10 +633,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
579 633
580static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) 634static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
581{ 635{
582 if (apic_x2apic_mode(apic)) 636 return mda == (apic_x2apic_mode(apic) ?
583 return mda == X2APIC_BROADCAST; 637 X2APIC_BROADCAST : APIC_BROADCAST);
584
585 return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
586} 638}
587 639
588static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) 640static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
@@ -591,9 +643,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
591 return true; 643 return true;
592 644
593 if (apic_x2apic_mode(apic)) 645 if (apic_x2apic_mode(apic))
594 return mda == kvm_apic_id(apic); 646 return mda == kvm_x2apic_id(apic);
595 647
596 return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); 648 /*
649 * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
650 * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
651 * this allows unique addressing of VCPUs with APIC ID over 0xff.
652 * The 0xff condition is needed because the xAPIC ID is writeable.
653 */
654 if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
655 return true;
656
657 return mda == kvm_xapic_id(apic);
597} 658}
598 659
599static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) 660static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -610,7 +671,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
610 && (logical_id & mda & 0xffff) != 0; 671 && (logical_id & mda & 0xffff) != 0;
611 672
612 logical_id = GET_APIC_LOGICAL_ID(logical_id); 673 logical_id = GET_APIC_LOGICAL_ID(logical_id);
613 mda = GET_APIC_DEST_FIELD(mda);
614 674
615 switch (kvm_lapic_get_reg(apic, APIC_DFR)) { 675 switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
616 case APIC_DFR_FLAT: 676 case APIC_DFR_FLAT:
@@ -627,9 +687,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
627 687
628/* The KVM local APIC implementation has two quirks: 688/* The KVM local APIC implementation has two quirks:
629 * 689 *
630 * - the xAPIC MDA stores the destination at bits 24-31, while this 690 * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
631 * is not true of struct kvm_lapic_irq's dest_id field. This is 691 * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
632 * just a quirk in the API and is not problematic. 692 * KVM doesn't do that aliasing.
633 * 693 *
634 * - in-kernel IOAPIC messages have to be delivered directly to 694 * - in-kernel IOAPIC messages have to be delivered directly to
635 * x2APIC, because the kernel does not support interrupt remapping. 695 * x2APIC, because the kernel does not support interrupt remapping.
@@ -645,13 +705,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
645 struct kvm_lapic *source, struct kvm_lapic *target) 705 struct kvm_lapic *source, struct kvm_lapic *target)
646{ 706{
647 bool ipi = source != NULL; 707 bool ipi = source != NULL;
648 bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
649 708
650 if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && 709 if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
651 !ipi && dest_id == APIC_BROADCAST && x2apic_mda) 710 !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
652 return X2APIC_BROADCAST; 711 return X2APIC_BROADCAST;
653 712
654 return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); 713 return dest_id;
655} 714}
656 715
657bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 716bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
@@ -1907,9 +1966,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
1907 vcpu->arch.apic_arb_prio = 0; 1966 vcpu->arch.apic_arb_prio = 0;
1908 vcpu->arch.apic_attention = 0; 1967 vcpu->arch.apic_attention = 0;
1909 1968
1910 apic_debug("%s: vcpu=%p, id=%d, base_msr=" 1969 apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
1911 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, 1970 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
1912 vcpu, kvm_apic_id(apic), 1971 vcpu, kvm_lapic_get_reg(apic, APIC_ID),
1913 vcpu->arch.apic_base, apic->base_address); 1972 vcpu->arch.apic_base, apic->base_address);
1914} 1973}
1915 1974
@@ -2021,17 +2080,13 @@ nomem:
2021int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 2080int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
2022{ 2081{
2023 struct kvm_lapic *apic = vcpu->arch.apic; 2082 struct kvm_lapic *apic = vcpu->arch.apic;
2024 int highest_irr; 2083 u32 ppr;
2025 2084
2026 if (!apic_enabled(apic)) 2085 if (!apic_enabled(apic))
2027 return -1; 2086 return -1;
2028 2087
2029 apic_update_ppr(apic); 2088 __apic_update_ppr(apic, &ppr);
2030 highest_irr = apic_find_highest_irr(apic); 2089 return apic_has_interrupt_for_ppr(apic, ppr);
2031 if ((highest_irr == -1) ||
2032 ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI)))
2033 return -1;
2034 return highest_irr;
2035} 2090}
2036 2091
2037int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 2092int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
@@ -2067,6 +2122,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
2067{ 2122{
2068 int vector = kvm_apic_has_interrupt(vcpu); 2123 int vector = kvm_apic_has_interrupt(vcpu);
2069 struct kvm_lapic *apic = vcpu->arch.apic; 2124 struct kvm_lapic *apic = vcpu->arch.apic;
2125 u32 ppr;
2070 2126
2071 if (vector == -1) 2127 if (vector == -1)
2072 return -1; 2128 return -1;
@@ -2078,13 +2134,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
2078 * because the process would deliver it through the IDT. 2134 * because the process would deliver it through the IDT.
2079 */ 2135 */
2080 2136
2081 apic_set_isr(vector, apic);
2082 apic_update_ppr(apic);
2083 apic_clear_irr(vector, apic); 2137 apic_clear_irr(vector, apic);
2084
2085 if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { 2138 if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
2086 apic_clear_isr(vector, apic); 2139 /*
2140 * For auto-EOI interrupts, there might be another pending
2141 * interrupt above PPR, so check whether to raise another
2142 * KVM_REQ_EVENT.
2143 */
2087 apic_update_ppr(apic); 2144 apic_update_ppr(apic);
2145 } else {
2146 /*
2147 * For normal interrupts, PPR has been raised and there cannot
2148 * be a higher-priority pending interrupt---except if there was
2149 * a concurrent interrupt injection, but that would have
2150 * triggered KVM_REQ_EVENT already.
2151 */
2152 apic_set_isr(vector, apic);
2153 __apic_update_ppr(apic, &ppr);
2088 } 2154 }
2089 2155
2090 return vector; 2156 return vector;
@@ -2145,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
2145 1 : count_vectors(apic->regs + APIC_ISR); 2211 1 : count_vectors(apic->regs + APIC_ISR);
2146 apic->highest_isr_cache = -1; 2212 apic->highest_isr_cache = -1;
2147 if (vcpu->arch.apicv_active) { 2213 if (vcpu->arch.apicv_active) {
2148 if (kvm_x86_ops->apicv_post_state_restore) 2214 kvm_x86_ops->apicv_post_state_restore(vcpu);
2149 kvm_x86_ops->apicv_post_state_restore(vcpu);
2150 kvm_x86_ops->hwapic_irr_update(vcpu, 2215 kvm_x86_ops->hwapic_irr_update(vcpu,
2151 apic_find_highest_irr(apic)); 2216 apic_find_highest_irr(apic));
2152 kvm_x86_ops->hwapic_isr_update(vcpu, 2217 kvm_x86_ops->hwapic_isr_update(vcpu,
@@ -2220,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
2220 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 2285 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
2221 return; 2286 return;
2222 2287
2223 if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, 2288 if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
2224 sizeof(u32))) 2289 sizeof(u32)))
2225 return; 2290 return;
2226 2291
2227 apic_set_tpr(vcpu->arch.apic, data & 0xff); 2292 apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2273,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
2273 max_isr = 0; 2338 max_isr = 0;
2274 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); 2339 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
2275 2340
2276 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, 2341 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
2277 sizeof(u32)); 2342 sizeof(u32));
2278} 2343}
2279 2344
2280int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) 2345int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
2281{ 2346{
2282 if (vapic_addr) { 2347 if (vapic_addr) {
2283 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, 2348 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
2284 &vcpu->arch.apic->vapic_cache, 2349 &vcpu->arch.apic->vapic_cache,
2285 vapic_addr, sizeof(u32))) 2350 vapic_addr, sizeof(u32)))
2286 return -EINVAL; 2351 return -EINVAL;
@@ -2374,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
2374 vcpu->arch.pv_eoi.msr_val = data; 2439 vcpu->arch.pv_eoi.msr_val = data;
2375 if (!pv_eoi_enabled(vcpu)) 2440 if (!pv_eoi_enabled(vcpu))
2376 return 0; 2441 return 0;
2377 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 2442 return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
2378 addr, sizeof(u8)); 2443 addr, sizeof(u8));
2379} 2444}
2380 2445
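
The apic_has_interrupt_for_ppr() check introduced earlier in this file compares only the priority class of the highest pending vector (bits 7:4) against PPR; a small standalone illustration:

#include <stdio.h>

/* Standalone rendition of the PPR test: an interrupt is deliverable only if
 * its priority class (vector & 0xF0) is strictly above the current PPR. */
static int has_interrupt_for_ppr(int highest_irr, unsigned int ppr)
{
	if (highest_irr == -1 || ((unsigned int)highest_irr & 0xF0) <= ppr)
		return -1;
	return highest_irr;
}

int main(void)
{
	printf("%d\n", has_interrupt_for_ppr(0x61, 0x70)); /* -1: class 6 is masked by PPR 0x70 */
	printf("%d\n", has_interrupt_for_ppr(0x91, 0x70)); /* 145: class 9 is above PPR 0x70 */
	return 0;
}
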
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff8039d61672..bcbe811f3b97 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -71,8 +71,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
71bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 71bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
72 int short_hand, unsigned int dest, int dest_mode); 72 int short_hand, unsigned int dest, int dest_mode);
73 73
74void __kvm_apic_update_irr(u32 *pir, void *regs); 74int __kvm_apic_update_irr(u32 *pir, void *regs);
75void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 75int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
76void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
76int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 77int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
77 struct dest_map *dest_map); 78 struct dest_map *dest_map);
78int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 79int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
203 return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); 204 return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
204} 205}
205 206
206static inline u32 kvm_apic_id(struct kvm_lapic *apic)
207{
208 /* To avoid a race between apic_base and following APIC_ID update when
209 * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
210 */
211 if (apic_x2apic_mode(apic))
212 return apic->vcpu->vcpu_id;
213
214 return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
215}
216
217bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); 207bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
218 208
219void wait_lapic_expire(struct kvm_vcpu *vcpu); 209void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7012de4a1fed..2fd7586aad4d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,8 @@
37#include <linux/srcu.h> 37#include <linux/srcu.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/hash.h>
41#include <linux/kern_levels.h>
40 42
41#include <asm/page.h> 43#include <asm/page.h>
42#include <asm/cmpxchg.h> 44#include <asm/cmpxchg.h>
@@ -129,6 +131,10 @@ module_param(dbg, bool, 0644);
129#define ACC_USER_MASK PT_USER_MASK 131#define ACC_USER_MASK PT_USER_MASK
130#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 132#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
131 133
134/* The mask for the R/X bits in EPT PTEs */
135#define PT64_EPT_READABLE_MASK 0x1ull
136#define PT64_EPT_EXECUTABLE_MASK 0x4ull
137
132#include <trace/events/kvm.h> 138#include <trace/events/kvm.h>
133 139
134#define CREATE_TRACE_POINTS 140#define CREATE_TRACE_POINTS
@@ -178,15 +184,40 @@ static u64 __read_mostly shadow_dirty_mask;
178static u64 __read_mostly shadow_mmio_mask; 184static u64 __read_mostly shadow_mmio_mask;
179static u64 __read_mostly shadow_present_mask; 185static u64 __read_mostly shadow_present_mask;
180 186
187/*
188 * The mask/value to distinguish a PTE that has been marked not-present for
189 * access tracking purposes.
190 * The mask would be either 0 if access tracking is disabled, or
191 * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
192 */
193static u64 __read_mostly shadow_acc_track_mask;
194static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
195
196/*
197 * The mask/shift to use for saving the original R/X bits when marking the PTE
198 * as not-present for access tracking purposes. We do not save the W bit as the
199 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
200 * restored only when a write is attempted to the page.
201 */
202static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
203 PT64_EPT_EXECUTABLE_MASK;
204static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
205
181static void mmu_spte_set(u64 *sptep, u64 spte); 206static void mmu_spte_set(u64 *sptep, u64 spte);
182static void mmu_free_roots(struct kvm_vcpu *vcpu); 207static void mmu_free_roots(struct kvm_vcpu *vcpu);
183 208
184void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 209void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
185{ 210{
186 shadow_mmio_mask = mmio_mask; 211 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
187} 212}
188EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); 213EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
189 214
215static inline bool is_access_track_spte(u64 spte)
216{
217 /* Always false if shadow_acc_track_mask is zero. */
218 return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
219}
220
190/* 221/*
191 * the low bit of the generation number is always presumed to be zero. 222 * the low bit of the generation number is always presumed to be zero.
192 * This disables mmio caching during memslot updates. The concept is 223 * This disables mmio caching during memslot updates. The concept is
@@ -284,17 +315,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
284} 315}
285 316
286void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 317void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
287 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) 318 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
319 u64 acc_track_mask)
288{ 320{
321 if (acc_track_mask != 0)
322 acc_track_mask |= SPTE_SPECIAL_MASK;
323
289 shadow_user_mask = user_mask; 324 shadow_user_mask = user_mask;
290 shadow_accessed_mask = accessed_mask; 325 shadow_accessed_mask = accessed_mask;
291 shadow_dirty_mask = dirty_mask; 326 shadow_dirty_mask = dirty_mask;
292 shadow_nx_mask = nx_mask; 327 shadow_nx_mask = nx_mask;
293 shadow_x_mask = x_mask; 328 shadow_x_mask = x_mask;
294 shadow_present_mask = p_mask; 329 shadow_present_mask = p_mask;
330 shadow_acc_track_mask = acc_track_mask;
331 WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
295} 332}
296EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 333EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
297 334
335void kvm_mmu_clear_all_pte_masks(void)
336{
337 shadow_user_mask = 0;
338 shadow_accessed_mask = 0;
339 shadow_dirty_mask = 0;
340 shadow_nx_mask = 0;
341 shadow_x_mask = 0;
342 shadow_mmio_mask = 0;
343 shadow_present_mask = 0;
344 shadow_acc_track_mask = 0;
345}
346
298static int is_cpuid_PSE36(void) 347static int is_cpuid_PSE36(void)
299{ 348{
300 return 1; 349 return 1;
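
The save/restore of the R/X bits described in the access-tracking comments above can be exercised in isolation. The sketch below is simplified: it ignores the special-marker bit and the W bit, and it assumes 52 as the value of PT64_SECOND_AVAIL_BITS_SHIFT:

#include <stdio.h>
#include <stdint.h>

#define RX_MASK      0x5ull   /* PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK */
#define SAVED_SHIFT  52       /* assumed value of PT64_SECOND_AVAIL_BITS_SHIFT */

int main(void)
{
	uint64_t spte = 0x1234000ull | RX_MASK;	/* a present R+X SPTE (simplified) */

	/* Mark for access tracking: stash R/X in the high available bits,
	 * then clear the low permission bits so the next access faults. */
	uint64_t marked = (spte | ((spte & RX_MASK) << SAVED_SHIFT)) & ~0x7ull;

	/* Restore: copy the saved bits back down and clear the stash. */
	uint64_t restored = (marked | ((marked >> SAVED_SHIFT) & RX_MASK))
			    & ~(RX_MASK << SAVED_SHIFT);

	printf("round trip ok: %d\n", (int)(restored == spte));
	return 0;
}
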
@@ -307,7 +356,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
307 356
308static int is_shadow_present_pte(u64 pte) 357static int is_shadow_present_pte(u64 pte)
309{ 358{
310 return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); 359 return (pte != 0) && !is_mmio_spte(pte);
311} 360}
312 361
313static int is_large_pte(u64 pte) 362static int is_large_pte(u64 pte)
@@ -324,6 +373,11 @@ static int is_last_spte(u64 pte, int level)
324 return 0; 373 return 0;
325} 374}
326 375
376static bool is_executable_pte(u64 spte)
377{
378 return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
379}
380
327static kvm_pfn_t spte_to_pfn(u64 pte) 381static kvm_pfn_t spte_to_pfn(u64 pte)
328{ 382{
329 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 383 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -473,7 +527,7 @@ retry:
473} 527}
474#endif 528#endif
475 529
476static bool spte_is_locklessly_modifiable(u64 spte) 530static bool spte_can_locklessly_be_made_writable(u64 spte)
477{ 531{
478 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == 532 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
479 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); 533 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
@@ -481,36 +535,38 @@ static bool spte_is_locklessly_modifiable(u64 spte)
481 535
482static bool spte_has_volatile_bits(u64 spte) 536static bool spte_has_volatile_bits(u64 spte)
483{ 537{
538 if (!is_shadow_present_pte(spte))
539 return false;
540
484 /* 541 /*
485 * Always atomically update spte if it can be updated 542 * Always atomically update spte if it can be updated
486 * out of mmu-lock, it can ensure dirty bit is not lost, 543 * out of mmu-lock, it can ensure dirty bit is not lost,
487 * also, it can help us to get a stable is_writable_pte() 544 * also, it can help us to get a stable is_writable_pte()
488 * to ensure tlb flush is not missed. 545 * to ensure tlb flush is not missed.
489 */ 546 */
490 if (spte_is_locklessly_modifiable(spte)) 547 if (spte_can_locklessly_be_made_writable(spte) ||
548 is_access_track_spte(spte))
491 return true; 549 return true;
492 550
493 if (!shadow_accessed_mask) 551 if (shadow_accessed_mask) {
494 return false; 552 if ((spte & shadow_accessed_mask) == 0 ||
495 553 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
496 if (!is_shadow_present_pte(spte)) 554 return true;
497 return false; 555 }
498
499 if ((spte & shadow_accessed_mask) &&
500 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
501 return false;
502 556
503 return true; 557 return false;
504} 558}
505 559
506static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) 560static bool is_accessed_spte(u64 spte)
507{ 561{
508 return (old_spte & bit_mask) && !(new_spte & bit_mask); 562 return shadow_accessed_mask ? spte & shadow_accessed_mask
563 : !is_access_track_spte(spte);
509} 564}
510 565
511static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) 566static bool is_dirty_spte(u64 spte)
512{ 567{
513 return (old_spte & bit_mask) != (new_spte & bit_mask); 568 return shadow_dirty_mask ? spte & shadow_dirty_mask
569 : spte & PT_WRITABLE_MASK;
514} 570}
515 571
516/* Rules for using mmu_spte_set: 572/* Rules for using mmu_spte_set:
@@ -525,25 +581,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
525 __set_spte(sptep, new_spte); 581 __set_spte(sptep, new_spte);
526} 582}
527 583
528/* Rules for using mmu_spte_update: 584/*
529 * Update the state bits, it means the mapped pfn is not changed. 585 * Update the SPTE (excluding the PFN), but do not track changes in its
530 * 586 * accessed/dirty status.
531 * Whenever we overwrite a writable spte with a read-only one we
532 * should flush remote TLBs. Otherwise rmap_write_protect
533 * will find a read-only spte, even though the writable spte
534 * might be cached on a CPU's TLB, the return value indicates this
535 * case.
536 */ 587 */
537static bool mmu_spte_update(u64 *sptep, u64 new_spte) 588static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
538{ 589{
539 u64 old_spte = *sptep; 590 u64 old_spte = *sptep;
540 bool ret = false;
541 591
542 WARN_ON(!is_shadow_present_pte(new_spte)); 592 WARN_ON(!is_shadow_present_pte(new_spte));
543 593
544 if (!is_shadow_present_pte(old_spte)) { 594 if (!is_shadow_present_pte(old_spte)) {
545 mmu_spte_set(sptep, new_spte); 595 mmu_spte_set(sptep, new_spte);
546 return ret; 596 return old_spte;
547 } 597 }
548 598
549 if (!spte_has_volatile_bits(old_spte)) 599 if (!spte_has_volatile_bits(old_spte))
@@ -551,45 +601,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
551 else 601 else
552 old_spte = __update_clear_spte_slow(sptep, new_spte); 602 old_spte = __update_clear_spte_slow(sptep, new_spte);
553 603
604 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
605
606 return old_spte;
607}
608
609/* Rules for using mmu_spte_update:
610 * Update the state bits, it means the mapped pfn is not changed.
611 *
612 * Whenever we overwrite a writable spte with a read-only one we
613 * should flush remote TLBs. Otherwise rmap_write_protect
614 * will find a read-only spte, even though the writable spte
615 * might be cached on a CPU's TLB, the return value indicates this
616 * case.
617 *
618 * Returns true if the TLB needs to be flushed
619 */
620static bool mmu_spte_update(u64 *sptep, u64 new_spte)
621{
622 bool flush = false;
623 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
624
625 if (!is_shadow_present_pte(old_spte))
626 return false;
627
554 /* 628 /*
555 * For the spte updated out of mmu-lock is safe, since 629 * For the spte updated out of mmu-lock is safe, since
556 * we always atomically update it, see the comments in 630 * we always atomically update it, see the comments in
557 * spte_has_volatile_bits(). 631 * spte_has_volatile_bits().
558 */ 632 */
559 if (spte_is_locklessly_modifiable(old_spte) && 633 if (spte_can_locklessly_be_made_writable(old_spte) &&
560 !is_writable_pte(new_spte)) 634 !is_writable_pte(new_spte))
561 ret = true; 635 flush = true;
562
563 if (!shadow_accessed_mask) {
564 /*
565 * We don't set page dirty when dropping non-writable spte.
566 * So do it now if the new spte is becoming non-writable.
567 */
568 if (ret)
569 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
570 return ret;
571 }
572 636
573 /* 637 /*
574 * Flush TLB when accessed/dirty bits are changed in the page tables, 638 * Flush TLB when accessed/dirty states are changed in the page tables,
575 * to guarantee consistency between TLB and page tables. 639 * to guarantee consistency between TLB and page tables.
576 */ 640 */
577 if (spte_is_bit_changed(old_spte, new_spte,
578 shadow_accessed_mask | shadow_dirty_mask))
579 ret = true;
580 641
581 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 642 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
643 flush = true;
582 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 644 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
583 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 645 }
646
647 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
648 flush = true;
584 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 649 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
650 }
585 651
586 return ret; 652 return flush;
587} 653}
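
Editor's note: the flush rule implemented by mmu_spte_update() above can be condensed into a small stand-alone sketch. This is not kernel code; the sk_* names and mask values are placeholders for PT_WRITABLE_MASK, shadow_accessed_mask and shadow_dirty_mask, and the real logic additionally goes through is_accessed_spte()/is_dirty_spte() so the rule also works when the hardware provides no A/D bits.

#include <stdbool.h>
#include <stdint.h>

#define SK_WRITABLE (1ull << 1)   /* placeholder for PT_WRITABLE_MASK */
#define SK_ACCESSED (1ull << 5)   /* placeholder for shadow_accessed_mask */
#define SK_DIRTY    (1ull << 6)   /* placeholder for shadow_dirty_mask */

/*
 * A remote TLB flush is needed whenever write permission or A/D state
 * that other CPUs may still have cached is being taken away.
 */
static bool sk_needs_flush(uint64_t old_spte, uint64_t new_spte)
{
	if ((old_spte & SK_WRITABLE) && !(new_spte & SK_WRITABLE))
		return true;
	if ((old_spte & SK_ACCESSED) && !(new_spte & SK_ACCESSED))
		return true;
	return (old_spte & SK_DIRTY) && !(new_spte & SK_DIRTY);
}
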
588 654
589/* 655/*
590 * Rules for using mmu_spte_clear_track_bits: 656 * Rules for using mmu_spte_clear_track_bits:
591 * It sets the sptep from present to nonpresent, tracking the 657
592 * state bits; it is used to clear the last-level sptep. 658
659 * Returns non-zero if the PTE was previously valid.
593 */ 660 */
594static int mmu_spte_clear_track_bits(u64 *sptep) 661static int mmu_spte_clear_track_bits(u64 *sptep)
595{ 662{
@@ -613,11 +680,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
613 */ 680 */
614 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); 681 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
615 682
616 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 683 if (is_accessed_spte(old_spte))
617 kvm_set_pfn_accessed(pfn); 684 kvm_set_pfn_accessed(pfn);
618 if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : 685
619 PT_WRITABLE_MASK)) 686 if (is_dirty_spte(old_spte))
620 kvm_set_pfn_dirty(pfn); 687 kvm_set_pfn_dirty(pfn);
688
621 return 1; 689 return 1;
622} 690}
623 691
@@ -636,6 +704,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
636 return __get_spte_lockless(sptep); 704 return __get_spte_lockless(sptep);
637} 705}
638 706
707static u64 mark_spte_for_access_track(u64 spte)
708{
709 if (shadow_accessed_mask != 0)
710 return spte & ~shadow_accessed_mask;
711
712 if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
713 return spte;
714
715 /*
716 * Making an Access Tracking PTE will result in removal of write access
717 * from the PTE. So, verify that we will be able to restore the write
718 * access in the fast page fault path later on.
719 */
720 WARN_ONCE((spte & PT_WRITABLE_MASK) &&
721 !spte_can_locklessly_be_made_writable(spte),
722 "kvm: Writable SPTE is not locklessly dirty-trackable\n");
723
724 WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
725 shadow_acc_track_saved_bits_shift),
726 "kvm: Access Tracking saved bit locations are not zero\n");
727
728 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
729 shadow_acc_track_saved_bits_shift;
730 spte &= ~shadow_acc_track_mask;
731 spte |= shadow_acc_track_value;
732
733 return spte;
734}
735
736/* Restore an acc-track PTE back to a regular PTE */
737static u64 restore_acc_track_spte(u64 spte)
738{
739 u64 new_spte = spte;
740 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
741 & shadow_acc_track_saved_bits_mask;
742
743 WARN_ON_ONCE(!is_access_track_spte(spte));
744
745 new_spte &= ~shadow_acc_track_mask;
746 new_spte &= ~(shadow_acc_track_saved_bits_mask <<
747 shadow_acc_track_saved_bits_shift);
748 new_spte |= saved_bits;
749
750 return new_spte;
751}
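
Editor's note: mark_spte_for_access_track() and restore_acc_track_spte() above form a lossless round trip: the low permission bits are parked in a high "saved bits" field while the SPTE is marked for access tracking, and put back on restore. The sketch below is illustrative only; the sk_* constants stand in for shadow_acc_track_saved_bits_mask/shift, shadow_acc_track_mask and shadow_acc_track_value, whose real values are configured via kvm_mmu_set_mask_ptes().

#include <assert.h>
#include <stdint.h>

#define SK_SAVED_MASK  0x7ull  /* placeholder for shadow_acc_track_saved_bits_mask */
#define SK_SAVED_SHIFT 52      /* placeholder for shadow_acc_track_saved_bits_shift */
#define SK_TRACK_MASK  0x7ull  /* placeholder for shadow_acc_track_mask */
#define SK_TRACK_VALUE 0x4ull  /* placeholder for shadow_acc_track_value */

static uint64_t sk_mark(uint64_t spte)
{
	/* Park the low permission bits in the high "saved" field, then mark. */
	spte |= (spte & SK_SAVED_MASK) << SK_SAVED_SHIFT;
	spte &= ~SK_TRACK_MASK;
	return spte | SK_TRACK_VALUE;
}

static uint64_t sk_restore(uint64_t spte)
{
	uint64_t saved = (spte >> SK_SAVED_SHIFT) & SK_SAVED_MASK;

	spte &= ~SK_TRACK_MASK;
	spte &= ~(SK_SAVED_MASK << SK_SAVED_SHIFT);
	return spte | saved;
}

int main(void)
{
	uint64_t spte = 0x12345ull;	/* low bits only; saved field starts clear */

	/* Marking and then restoring must give back the original SPTE. */
	assert(sk_restore(sk_mark(spte)) == spte);
	return 0;
}
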
752
753/* Returns the Accessed status of the PTE and resets it at the same time. */
754static bool mmu_spte_age(u64 *sptep)
755{
756 u64 spte = mmu_spte_get_lockless(sptep);
757
758 if (!is_accessed_spte(spte))
759 return false;
760
761 if (shadow_accessed_mask) {
762 clear_bit((ffs(shadow_accessed_mask) - 1),
763 (unsigned long *)sptep);
764 } else {
765 /*
766 * Capture the dirty status of the page, so that it doesn't get
767 * lost when the SPTE is marked for access tracking.
768 */
769 if (is_writable_pte(spte))
770 kvm_set_pfn_dirty(spte_to_pfn(spte));
771
772 spte = mark_spte_for_access_track(spte);
773 mmu_spte_update_no_track(sptep, spte);
774 }
775
776 return true;
777}
778
639static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 779static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
640{ 780{
641 /* 781 /*
@@ -1212,7 +1352,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
1212 u64 spte = *sptep; 1352 u64 spte = *sptep;
1213 1353
1214 if (!is_writable_pte(spte) && 1354 if (!is_writable_pte(spte) &&
1215 !(pt_protect && spte_is_locklessly_modifiable(spte))) 1355 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1216 return false; 1356 return false;
1217 1357
1218 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1358 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
@@ -1420,7 +1560,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1420restart: 1560restart:
1421 for_each_rmap_spte(rmap_head, &iter, sptep) { 1561 for_each_rmap_spte(rmap_head, &iter, sptep) {
1422 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", 1562 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1423 sptep, *sptep, gfn, level); 1563 sptep, *sptep, gfn, level);
1424 1564
1425 need_flush = 1; 1565 need_flush = 1;
1426 1566
@@ -1433,7 +1573,8 @@ restart:
1433 1573
1434 new_spte &= ~PT_WRITABLE_MASK; 1574 new_spte &= ~PT_WRITABLE_MASK;
1435 new_spte &= ~SPTE_HOST_WRITEABLE; 1575 new_spte &= ~SPTE_HOST_WRITEABLE;
1436 new_spte &= ~shadow_accessed_mask; 1576
1577 new_spte = mark_spte_for_access_track(new_spte);
1437 1578
1438 mmu_spte_clear_track_bits(sptep); 1579 mmu_spte_clear_track_bits(sptep);
1439 mmu_spte_set(sptep, new_spte); 1580 mmu_spte_set(sptep, new_spte);
@@ -1595,15 +1736,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1595 struct rmap_iterator uninitialized_var(iter); 1736 struct rmap_iterator uninitialized_var(iter);
1596 int young = 0; 1737 int young = 0;
1597 1738
1598 BUG_ON(!shadow_accessed_mask); 1739 for_each_rmap_spte(rmap_head, &iter, sptep)
1599 1740 young |= mmu_spte_age(sptep);
1600 for_each_rmap_spte(rmap_head, &iter, sptep) {
1601 if (*sptep & shadow_accessed_mask) {
1602 young = 1;
1603 clear_bit((ffs(shadow_accessed_mask) - 1),
1604 (unsigned long *)sptep);
1605 }
1606 }
1607 1741
1608 trace_kvm_age_page(gfn, level, slot, young); 1742 trace_kvm_age_page(gfn, level, slot, young);
1609 return young; 1743 return young;
@@ -1615,24 +1749,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1615{ 1749{
1616 u64 *sptep; 1750 u64 *sptep;
1617 struct rmap_iterator iter; 1751 struct rmap_iterator iter;
1618 int young = 0;
1619 1752
1620 /* 1753 /*
1621 * If there's no access bit in the secondary pte set by the 1754 * If there's no access bit in the secondary pte set by the hardware and
1622 * hardware it's up to gup-fast/gup to set the access bit in 1755 * fast access tracking is also not enabled, it's up to gup-fast/gup to
1623 * the primary pte or in the page structure. 1756 * set the access bit in the primary pte or in the page structure.
1624 */ 1757 */
1625 if (!shadow_accessed_mask) 1758 if (!shadow_accessed_mask && !shadow_acc_track_mask)
1626 goto out; 1759 goto out;
1627 1760
1628 for_each_rmap_spte(rmap_head, &iter, sptep) { 1761 for_each_rmap_spte(rmap_head, &iter, sptep)
1629 if (*sptep & shadow_accessed_mask) { 1762 if (is_accessed_spte(*sptep))
1630 young = 1; 1763 return 1;
1631 break;
1632 }
1633 }
1634out: 1764out:
1635 return young; 1765 return 0;
1636} 1766}
1637 1767
1638#define RMAP_RECYCLE_THRESHOLD 1000 1768#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1660,7 +1790,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1660 * This has some overhead, but not as much as the cost of swapping 1790 * This has some overhead, but not as much as the cost of swapping
1661 * out actively used pages or breaking up actively used hugepages. 1791 * out actively used pages or breaking up actively used hugepages.
1662 */ 1792 */
1663 if (!shadow_accessed_mask) 1793 if (!shadow_accessed_mask && !shadow_acc_track_mask)
1664 return kvm_handle_hva_range(kvm, start, end, 0, 1794 return kvm_handle_hva_range(kvm, start, end, 0,
1665 kvm_unmap_rmapp); 1795 kvm_unmap_rmapp);
1666 1796
@@ -1713,7 +1843,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1713 1843
1714static unsigned kvm_page_table_hashfn(gfn_t gfn) 1844static unsigned kvm_page_table_hashfn(gfn_t gfn)
1715{ 1845{
1716 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1846 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1717} 1847}
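
Editor's note: switching kvm_page_table_hashfn() from low-bit masking to hash_64() matters because guest frame numbers tend to be clustered, so masking alone piles many shadow pages into a few buckets (which is what the new max_mmu_page_hash_collisions statistic measures). A rough user-space illustration, with sk_* names and the shift value as placeholders and a generic golden-ratio-style multiplier rather than the kernel's own constant:

#include <stdint.h>

#define SK_HASH_SHIFT 10	/* placeholder for KVM_MMU_HASH_SHIFT */

static unsigned int sk_old_hash(uint64_t gfn)
{
	/* Low-bit masking: gfns that differ only in high bits all collide. */
	return (unsigned int)(gfn & ((1u << SK_HASH_SHIFT) - 1));
}

static unsigned int sk_new_hash(uint64_t gfn)
{
	/* Multiplicative hashing mixes every input bit into the bucket index. */
	return (unsigned int)((gfn * 0x9E3779B97F4A7C15ull) >> (64 - SK_HASH_SHIFT));
}
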
1718 1848
1719static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1849static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
@@ -1904,17 +2034,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1904 * since it has been deleted from active_mmu_pages but still can be found 2034 * since it has been deleted from active_mmu_pages but still can be found
1905 * in the hash list. 2035
1906 * 2036 *
1907 * for_each_gfn_valid_sp() has skipped such pages. 2037 * for_each_valid_sp() has skipped such pages.
1908 */ 2038 */
1909#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ 2039#define for_each_valid_sp(_kvm, _sp, _gfn) \
1910 hlist_for_each_entry(_sp, \ 2040 hlist_for_each_entry(_sp, \
1911 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 2041 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
1912 if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ 2042 if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
1913 || (_sp)->role.invalid) {} else 2043 } else
1914 2044
1915#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ 2045#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
1916 for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ 2046 for_each_valid_sp(_kvm, _sp, _gfn) \
1917 if ((_sp)->role.direct) {} else 2047 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
1918 2048
1919/* @sp->gfn should be write-protected at the call site */ 2049/* @sp->gfn should be write-protected at the call site */
1920static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2050static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2116,6 +2246,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2116 struct kvm_mmu_page *sp; 2246 struct kvm_mmu_page *sp;
2117 bool need_sync = false; 2247 bool need_sync = false;
2118 bool flush = false; 2248 bool flush = false;
2249 int collisions = 0;
2119 LIST_HEAD(invalid_list); 2250 LIST_HEAD(invalid_list);
2120 2251
2121 role = vcpu->arch.mmu.base_role; 2252 role = vcpu->arch.mmu.base_role;
@@ -2130,7 +2261,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2130 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 2261 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2131 role.quadrant = quadrant; 2262 role.quadrant = quadrant;
2132 } 2263 }
2133 for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { 2264 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2265 if (sp->gfn != gfn) {
2266 collisions++;
2267 continue;
2268 }
2269
2134 if (!need_sync && sp->unsync) 2270 if (!need_sync && sp->unsync)
2135 need_sync = true; 2271 need_sync = true;
2136 2272
@@ -2153,7 +2289,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2153 2289
2154 __clear_sp_write_flooding_count(sp); 2290 __clear_sp_write_flooding_count(sp);
2155 trace_kvm_mmu_get_page(sp, false); 2291 trace_kvm_mmu_get_page(sp, false);
2156 return sp; 2292 goto out;
2157 } 2293 }
2158 2294
2159 ++vcpu->kvm->stat.mmu_cache_miss; 2295 ++vcpu->kvm->stat.mmu_cache_miss;
@@ -2183,6 +2319,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2183 trace_kvm_mmu_get_page(sp, true); 2319 trace_kvm_mmu_get_page(sp, true);
2184 2320
2185 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); 2321 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2322out:
2323 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2324 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2186 return sp; 2325 return sp;
2187} 2326}
2188 2327
@@ -2583,6 +2722,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2583 spte |= shadow_dirty_mask; 2722 spte |= shadow_dirty_mask;
2584 } 2723 }
2585 2724
2725 if (speculative)
2726 spte = mark_spte_for_access_track(spte);
2727
2586set_pte: 2728set_pte:
2587 if (mmu_spte_update(sptep, spte)) 2729 if (mmu_spte_update(sptep, spte))
2588 kvm_flush_remote_tlbs(vcpu->kvm); 2730 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2636,7 +2778,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
2636 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2778 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2637 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2779 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2638 is_large_pte(*sptep)? "2MB" : "4kB", 2780 is_large_pte(*sptep)? "2MB" : "4kB",
2639 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2781 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
2640 *sptep, sptep); 2782 *sptep, sptep);
2641 if (!was_rmapped && is_large_pte(*sptep)) 2783 if (!was_rmapped && is_large_pte(*sptep))
2642 ++vcpu->kvm->stat.lpages; 2784 ++vcpu->kvm->stat.lpages;
@@ -2869,33 +3011,43 @@ static bool page_fault_can_be_fast(u32 error_code)
2869 if (unlikely(error_code & PFERR_RSVD_MASK)) 3011 if (unlikely(error_code & PFERR_RSVD_MASK))
2870 return false; 3012 return false;
2871 3013
3014 /* See if the page fault is due to an NX violation */
3015 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3016 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3017 return false;
3018
2872 /* 3019 /*
2873 * #PF can be fast only if the shadow page table is present and it 3020 * #PF can be fast if:
2874 * is caused by write-protect, that means we just need change the 3021 * 1. The shadow page table entry is not present, which could mean that
2875 * W bit of the spte which can be done out of mmu-lock. 3022 * the fault is potentially caused by access tracking (if enabled).
3023 * 2. The shadow page table entry is present and the fault
3024 * is caused by write-protect; that means we just need to change the W
3025 * bit of the spte, which can be done out of mmu-lock.
3026 *
3027 * However, if access tracking is disabled we know that a fault on a
3028 * non-present page is genuine, and we have to create a new SPTE.
3029 * So, if access tracking is disabled, we return true only for write
3030 * accesses to a present page.
2876 */ 3031 */
2877 if (!(error_code & PFERR_PRESENT_MASK) ||
2878 !(error_code & PFERR_WRITE_MASK))
2879 return false;
2880 3032
2881 return true; 3033 return shadow_acc_track_mask != 0 ||
3034 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3035 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
2882} 3036}
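
Editor's note: the return statement above packs the whole comment into one expression. Written out long-hand it is the following decision, shown here as a hedged sketch (sk_* names are stand-ins; the bit layout follows the usual x86 page-fault error code, matching the PFERR_* masks used above):

#include <stdbool.h>
#include <stdint.h>

#define SK_PFERR_PRESENT (1u << 0)
#define SK_PFERR_WRITE   (1u << 1)
#define SK_PFERR_RSVD    (1u << 3)
#define SK_PFERR_FETCH   (1u << 4)

static bool sk_can_be_fast(uint32_t ec, bool acc_track_enabled)
{
	if (ec & SK_PFERR_RSVD)
		return false;	/* reserved-bit (MMIO) faults are never fast */

	if ((ec & (SK_PFERR_FETCH | SK_PFERR_PRESENT)) ==
	    (SK_PFERR_FETCH | SK_PFERR_PRESENT))
		return false;	/* NX violation: cannot be fixed locklessly */

	if (acc_track_enabled)
		return true;	/* a non-present fault may be an access-track fault */

	/* Otherwise only write faults on present pages can be fixed fast. */
	return (ec & (SK_PFERR_WRITE | SK_PFERR_PRESENT)) ==
	       (SK_PFERR_WRITE | SK_PFERR_PRESENT);
}
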
2883 3037
3038/*
3039 * Returns true if the SPTE was fixed successfully. Otherwise,
3040 * someone else modified the SPTE from its original value.
3041 */
2884static bool 3042static bool
2885fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 3043fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2886 u64 *sptep, u64 spte) 3044 u64 *sptep, u64 old_spte, u64 new_spte)
2887{ 3045{
2888 gfn_t gfn; 3046 gfn_t gfn;
2889 3047
2890 WARN_ON(!sp->role.direct); 3048 WARN_ON(!sp->role.direct);
2891 3049
2892 /* 3050 /*
2893 * The gfn of direct spte is stable since it is calculated
2894 * by sp->gfn.
2895 */
2896 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2897
2898 /*
2899 * Theoretically we could also set dirty bit (and flush TLB) here in 3051 * Theoretically we could also set dirty bit (and flush TLB) here in
2900 * order to eliminate unnecessary PML logging. See comments in 3052 * order to eliminate unnecessary PML logging. See comments in
2901 * set_spte. But fast_page_fault is very unlikely to happen with PML 3053 * set_spte. But fast_page_fault is very unlikely to happen with PML
@@ -2907,12 +3059,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2907 * 3059 *
2908 * Compare with set_spte where instead shadow_dirty_mask is set. 3060 * Compare with set_spte where instead shadow_dirty_mask is set.
2909 */ 3061 */
2910 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 3062 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3063 return false;
3064
3065 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3066 /*
3067 * The gfn of direct spte is stable since it is
3068 * calculated by sp->gfn.
3069 */
3070 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
2911 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3071 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3072 }
2912 3073
2913 return true; 3074 return true;
2914} 3075}
2915 3076
3077static bool is_access_allowed(u32 fault_err_code, u64 spte)
3078{
3079 if (fault_err_code & PFERR_FETCH_MASK)
3080 return is_executable_pte(spte);
3081
3082 if (fault_err_code & PFERR_WRITE_MASK)
3083 return is_writable_pte(spte);
3084
3085 /* Fault was on Read access */
3086 return spte & PT_PRESENT_MASK;
3087}
3088
2916/* 3089/*
2917 * Return value: 3090 * Return value:
2918 * - true: let the vcpu access the same address again. 3091
@@ -2923,8 +3096,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2923{ 3096{
2924 struct kvm_shadow_walk_iterator iterator; 3097 struct kvm_shadow_walk_iterator iterator;
2925 struct kvm_mmu_page *sp; 3098 struct kvm_mmu_page *sp;
2926 bool ret = false; 3099 bool fault_handled = false;
2927 u64 spte = 0ull; 3100 u64 spte = 0ull;
3101 uint retry_count = 0;
2928 3102
2929 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3103 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2930 return false; 3104 return false;
@@ -2933,66 +3107,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
2933 return false; 3107 return false;
2934 3108
2935 walk_shadow_page_lockless_begin(vcpu); 3109 walk_shadow_page_lockless_begin(vcpu);
2936 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) 3110
2937 if (!is_shadow_present_pte(spte) || iterator.level < level) 3111 do {
3112 u64 new_spte;
3113
3114 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
3115 if (!is_shadow_present_pte(spte) ||
3116 iterator.level < level)
3117 break;
3118
3119 sp = page_header(__pa(iterator.sptep));
3120 if (!is_last_spte(spte, sp->role.level))
2938 break; 3121 break;
2939 3122
2940 /* 3123 /*
2941 * If the mapping has been changed, let the vcpu fault on the 3124 * Check whether the memory access that caused the fault would
2942 * same address again. 3125 * still cause it if it were to be performed right now. If not,
2943 */ 3126 * then this is a spurious fault caused by a lazily flushed TLB,
2944 if (!is_shadow_present_pte(spte)) { 3127 * or some other CPU has already fixed the PTE after the
2945 ret = true; 3128 * current CPU took the fault.
2946 goto exit; 3129 *
2947 } 3130 * Need not check the access of upper level table entries since
3131 * they are always ACC_ALL.
3132 */
3133 if (is_access_allowed(error_code, spte)) {
3134 fault_handled = true;
3135 break;
3136 }
2948 3137
2949 sp = page_header(__pa(iterator.sptep)); 3138 new_spte = spte;
2950 if (!is_last_spte(spte, sp->role.level))
2951 goto exit;
2952 3139
2953 /* 3140 if (is_access_track_spte(spte))
2954 * Check if it is a spurious fault caused by TLB lazily flushed. 3141 new_spte = restore_acc_track_spte(new_spte);
2955 *
2956 * Need not check the access of upper level table entries since
2957 * they are always ACC_ALL.
2958 */
2959 if (is_writable_pte(spte)) {
2960 ret = true;
2961 goto exit;
2962 }
2963 3142
2964 /* 3143 /*
2965 * Currently, to simplify the code, only the spte write-protected 3144 * Currently, to simplify the code, write-protection can
2966 * by dirty-log can be fast fixed. 3145 * be removed in the fast path only if the SPTE was
2967 */ 3146 * write-protected for dirty-logging or access tracking.
2968 if (!spte_is_locklessly_modifiable(spte)) 3147 */
2969 goto exit; 3148 if ((error_code & PFERR_WRITE_MASK) &&
3149 spte_can_locklessly_be_made_writable(spte))
3150 {
3151 new_spte |= PT_WRITABLE_MASK;
2970 3152
2971 /* 3153 /*
2972 * Do not fix write-permission on the large spte since we only dirty 3154 * Do not fix write-permission on the large spte. Since
2973 * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() 3155 * we only dirty the first page into the dirty-bitmap in
2974 * that means other pages are missed if its slot is dirty-logged. 3156 * fast_pf_fix_direct_spte(), other pages are missed
2975 * 3157 * if its slot has dirty logging enabled.
2976 * Instead, we let the slow page fault path create a normal spte to 3158 *
2977 * fix the access. 3159 * Instead, we let the slow page fault path create a
2978 * 3160 * normal spte to fix the access.
2979 * See the comments in kvm_arch_commit_memory_region(). 3161 *
2980 */ 3162 * See the comments in kvm_arch_commit_memory_region().
2981 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 3163 */
2982 goto exit; 3164 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3165 break;
3166 }
3167
3168 /* Verify that the fault can be handled in the fast path */
3169 if (new_spte == spte ||
3170 !is_access_allowed(error_code, new_spte))
3171 break;
3172
3173 /*
3174 * Currently, fast page fault only works for direct mapping
3175 * since the gfn is not stable for indirect shadow page. See
3176 * Documentation/virtual/kvm/locking.txt to get more detail.
3177 */
3178 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3179 iterator.sptep, spte,
3180 new_spte);
3181 if (fault_handled)
3182 break;
3183
3184 if (++retry_count > 4) {
3185 printk_once(KERN_WARNING
3186 "kvm: Fast #PF retrying more than 4 times.\n");
3187 break;
3188 }
3189
3190 } while (true);
2983 3191
2984 /*
2985 * Currently, fast page fault only works for direct mapping since
2986 * the gfn is not stable for indirect shadow page.
2987 * See Documentation/virtual/kvm/locking.txt to get more detail.
2988 */
2989 ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
2990exit:
2991 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 3192 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
2992 spte, ret); 3193 spte, fault_handled);
2993 walk_shadow_page_lockless_end(vcpu); 3194 walk_shadow_page_lockless_end(vcpu);
2994 3195
2995 return ret; 3196 return fault_handled;
2996} 3197}
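
Editor's note: the heart of the rewritten fast_page_fault() is a lockless read-compute-cmpxchg loop; fast_pf_fix_direct_spte() only succeeds if the SPTE still holds the value that was sampled, and the caller retries a bounded number of times. A minimal sketch of that pattern using C11 atomics in place of the kernel's cmpxchg64() (sk_* names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool sk_fast_fault(_Atomic uint64_t *sptep,
			  uint64_t (*compute_new)(uint64_t old))
{
	for (int retry = 0; retry < 4; retry++) {
		uint64_t spte = atomic_load(sptep);
		uint64_t new_spte = compute_new(spte);

		if (new_spte == spte)
			return false;	/* nothing we can fix without mmu_lock */

		/* Succeeds only if nobody changed the SPTE since we sampled it. */
		if (atomic_compare_exchange_strong(sptep, &spte, new_spte))
			return true;	/* fault handled locklessly */

		/* Lost the race: re-read the SPTE and try again. */
	}
	return false;			/* give up, fall back to the slow path */
}
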
2997 3198
2998static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3199static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -5063,6 +5264,8 @@ static void mmu_destroy_caches(void)
5063 5264
5064int kvm_mmu_module_init(void) 5265int kvm_mmu_module_init(void)
5065{ 5266{
5267 kvm_mmu_clear_all_pte_masks();
5268
5066 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 5269 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
5067 sizeof(struct pte_list_desc), 5270 sizeof(struct pte_list_desc),
5068 0, 0, NULL); 5271 0, 0, NULL);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 08a4d3ab3455..d1efe2c62b3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
971 * a particular vCPU. 971 * a particular vCPU.
972 */ 972 */
973#define SVM_VM_DATA_HASH_BITS 8 973#define SVM_VM_DATA_HASH_BITS 8
974DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 974static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
975static spinlock_t svm_vm_data_hash_lock; 975static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
976 976
977/* Note: 977/* Note:
978 * This function is called from IOMMU driver to notify 978 * This function is called from IOMMU driver to notify
@@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void)
1077 } else { 1077 } else {
1078 pr_info("AVIC enabled\n"); 1078 pr_info("AVIC enabled\n");
1079 1079
1080 hash_init(svm_vm_data_hash);
1081 spin_lock_init(&svm_vm_data_hash_lock);
1082 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1080 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1083 } 1081 }
1084 } 1082 }
@@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1159 struct vmcb_control_area *control = &svm->vmcb->control; 1157 struct vmcb_control_area *control = &svm->vmcb->control;
1160 struct vmcb_save_area *save = &svm->vmcb->save; 1158 struct vmcb_save_area *save = &svm->vmcb->save;
1161 1159
1162 svm->vcpu.fpu_active = 1;
1163 svm->vcpu.arch.hflags = 0; 1160 svm->vcpu.arch.hflags = 0;
1164 1161
1165 set_cr_intercept(svm, INTERCEPT_CR0_READ); 1162 set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1901 ulong gcr0 = svm->vcpu.arch.cr0; 1898 ulong gcr0 = svm->vcpu.arch.cr0;
1902 u64 *hcr0 = &svm->vmcb->save.cr0; 1899 u64 *hcr0 = &svm->vmcb->save.cr0;
1903 1900
1904 if (!svm->vcpu.fpu_active) 1901 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1905 *hcr0 |= SVM_CR0_SELECTIVE_MASK; 1902 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1906 else
1907 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1908 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1909 1903
1910 mark_dirty(svm->vmcb, VMCB_CR); 1904 mark_dirty(svm->vmcb, VMCB_CR);
1911 1905
1912 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1906 if (gcr0 == *hcr0) {
1913 clr_cr_intercept(svm, INTERCEPT_CR0_READ); 1907 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1914 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); 1908 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1915 } else { 1909 } else {
@@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1940 if (!npt_enabled) 1934 if (!npt_enabled)
1941 cr0 |= X86_CR0_PG | X86_CR0_WP; 1935 cr0 |= X86_CR0_PG | X86_CR0_WP;
1942 1936
1943 if (!vcpu->fpu_active)
1944 cr0 |= X86_CR0_TS;
1945 /* 1937 /*
1946 * re-enable caching here because the QEMU bios 1938 * re-enable caching here because the QEMU bios
1947 * does not do it - this results in some delay at 1939 * does not do it - this results in some delay at
@@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
2160 return 1; 2152 return 1;
2161} 2153}
2162 2154
2163static void svm_fpu_activate(struct kvm_vcpu *vcpu)
2164{
2165 struct vcpu_svm *svm = to_svm(vcpu);
2166
2167 clr_exception_intercept(svm, NM_VECTOR);
2168
2169 svm->vcpu.fpu_active = 1;
2170 update_cr0_intercept(svm);
2171}
2172
2173static int nm_interception(struct vcpu_svm *svm)
2174{
2175 svm_fpu_activate(&svm->vcpu);
2176 return 1;
2177}
2178
2179static bool is_erratum_383(void) 2155static bool is_erratum_383(void)
2180{ 2156{
2181 int err, i; 2157 int err, i;
@@ -2573,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
2573 if (!npt_enabled && svm->apf_reason == 0) 2549 if (!npt_enabled && svm->apf_reason == 0)
2574 return NESTED_EXIT_HOST; 2550 return NESTED_EXIT_HOST;
2575 break; 2551 break;
2576 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2577 nm_interception(svm);
2578 break;
2579 default: 2552 default:
2580 break; 2553 break;
2581 } 2554 }
@@ -4020,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4020 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3993 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
4021 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3994 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
4022 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3995 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
4023 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
4024 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3996 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
4025 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3997 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
4026 [SVM_EXIT_INTR] = intr_interception, 3998 [SVM_EXIT_INTR] = intr_interception,
@@ -4182,6 +4154,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
4182 4154
4183 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); 4155 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4184 4156
4157 vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
4158
4185 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 4159 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4186 vcpu->arch.cr0 = svm->vmcb->save.cr0; 4160 vcpu->arch.cr0 = svm->vmcb->save.cr0;
4187 if (npt_enabled) 4161 if (npt_enabled)
@@ -4357,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
4357 return; 4331 return;
4358} 4332}
4359 4333
4360static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4361{
4362 return;
4363}
4364
4365static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) 4334static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4366{ 4335{
4367 kvm_lapic_set_irr(vec, vcpu->arch.apic); 4336 kvm_lapic_set_irr(vec, vcpu->arch.apic);
@@ -5077,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
5077 return true; 5046 return true;
5078} 5047}
5079 5048
5080static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
5081{
5082 struct vcpu_svm *svm = to_svm(vcpu);
5083
5084 set_exception_intercept(svm, NM_VECTOR);
5085 update_cr0_intercept(svm);
5086}
5087
5088#define PRE_EX(exit) { .exit_code = (exit), \ 5049#define PRE_EX(exit) { .exit_code = (exit), \
5089 .stage = X86_ICPT_PRE_EXCEPT, } 5050 .stage = X86_ICPT_PRE_EXCEPT, }
5090#define POST_EX(exit) { .exit_code = (exit), \ 5051#define POST_EX(exit) { .exit_code = (exit), \
@@ -5345,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5345 5306
5346 .get_pkru = svm_get_pkru, 5307 .get_pkru = svm_get_pkru,
5347 5308
5348 .fpu_activate = svm_fpu_activate,
5349 .fpu_deactivate = svm_fpu_deactivate,
5350
5351 .tlb_flush = svm_flush_tlb, 5309 .tlb_flush = svm_flush_tlb,
5352 5310
5353 .run = svm_vcpu_run, 5311 .run = svm_vcpu_run,
@@ -5371,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5371 .get_enable_apicv = svm_get_enable_apicv, 5329 .get_enable_apicv = svm_get_enable_apicv,
5372 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, 5330 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5373 .load_eoi_exitmap = svm_load_eoi_exitmap, 5331 .load_eoi_exitmap = svm_load_eoi_exitmap,
5374 .sync_pir_to_irr = svm_sync_pir_to_irr,
5375 .hwapic_irr_update = svm_hwapic_irr_update, 5332 .hwapic_irr_update = svm_hwapic_irr_update,
5376 .hwapic_isr_update = svm_hwapic_isr_update, 5333 .hwapic_isr_update = svm_hwapic_isr_update,
5377 .apicv_post_state_restore = avic_post_state_restore, 5334 .apicv_post_state_restore = avic_post_state_restore,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236decb81e4..ef4ba71dbb66 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1856 u32 eb; 1856 u32 eb;
1857 1857
1858 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 1858 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1859 (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); 1859 (1u << DB_VECTOR) | (1u << AC_VECTOR);
1860 if ((vcpu->guest_debug & 1860 if ((vcpu->guest_debug &
1861 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 1861 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1862 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 1862 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1865 eb = ~0; 1865 eb = ~0;
1866 if (enable_ept) 1866 if (enable_ept)
1867 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1867 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1868 if (vcpu->fpu_active)
1869 eb &= ~(1u << NM_VECTOR);
1870 1868
1871 /* When we are running a nested L2 guest and L1 specified for it a 1869 /* When we are running a nested L2 guest and L1 specified for it a
1872 * certain exception bitmap, we must trap the same exceptions and pass 1870 * certain exception bitmap, we must trap the same exceptions and pass
@@ -1992,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1992 m->host[i].value = host_val; 1990 m->host[i].value = host_val;
1993} 1991}
1994 1992
1995static void reload_tss(void)
1996{
1997 /*
1998 * VT restores TR but not its size. Useless.
1999 */
2000 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
2001 struct desc_struct *descs;
2002
2003 descs = (void *)gdt->address;
2004 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
2005 load_TR_desc();
2006}
2007
2008static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) 1993static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2009{ 1994{
2010 u64 guest_efer = vmx->vcpu.arch.efer; 1995 u64 guest_efer = vmx->vcpu.arch.efer;
@@ -2059,41 +2044,36 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2059 } 2044 }
2060} 2045}
2061 2046
2047#ifdef CONFIG_X86_32
2048/*
2049 * On 32-bit kernels, VM exits still load the FS and GS bases from the
2050 * VMCS rather than the segment table. KVM uses this helper to figure
2051 * out the current bases to poke them into the VMCS before entry.
2052 */
2062static unsigned long segment_base(u16 selector) 2053static unsigned long segment_base(u16 selector)
2063{ 2054{
2064 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 2055 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
2065 struct desc_struct *d; 2056 struct desc_struct *d;
2066 unsigned long table_base; 2057 struct desc_struct *table;
2067 unsigned long v; 2058 unsigned long v;
2068 2059
2069 if (!(selector & ~3)) 2060 if (!(selector & ~SEGMENT_RPL_MASK))
2070 return 0; 2061 return 0;
2071 2062
2072 table_base = gdt->address; 2063 table = (struct desc_struct *)gdt->address;
2073 2064
2074 if (selector & 4) { /* from ldt */ 2065 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2075 u16 ldt_selector = kvm_read_ldt(); 2066 u16 ldt_selector = kvm_read_ldt();
2076 2067
2077 if (!(ldt_selector & ~3)) 2068 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2078 return 0; 2069 return 0;
2079 2070
2080 table_base = segment_base(ldt_selector); 2071 table = (struct desc_struct *)segment_base(ldt_selector);
2081 } 2072 }
2082 d = (struct desc_struct *)(table_base + (selector & ~7)); 2073 v = get_desc_base(&table[selector >> 3]);
2083 v = get_desc_base(d);
2084#ifdef CONFIG_X86_64
2085 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
2086 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
2087#endif
2088 return v; 2074 return v;
2089} 2075}
2090 2076#endif
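
Editor's note: segment_base() above indexes the GDT (or, when the TI bit of the selector is set, the LDT) with selector >> 3 and lets get_desc_base() reassemble the base address that a legacy descriptor scatters across three fields. The sketch below spells out that reassembly; struct and field names are illustrative, the layout is the standard x86 segment descriptor:

#include <stdint.h>

struct sk_desc {
	uint16_t limit0;
	uint16_t base0;		/* base bits 15:0  */
	uint8_t  base1;		/* base bits 23:16 */
	uint8_t  type_s_dpl_p;	/* type, S, DPL, P */
	uint8_t  limit1_flags;	/* limit 19:16, AVL, L, D/B, G */
	uint8_t  base2;		/* base bits 31:24 */
};

static uint32_t sk_desc_base(const struct sk_desc *d)
{
	return (uint32_t)d->base0 |
	       ((uint32_t)d->base1 << 16) |
	       ((uint32_t)d->base2 << 24);
}
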
2091static inline unsigned long kvm_read_tr_base(void)
2092{
2093 u16 tr;
2094 asm("str %0" : "=g"(tr));
2095 return segment_base(tr);
2096}
2097 2077
2098static void vmx_save_host_state(struct kvm_vcpu *vcpu) 2078static void vmx_save_host_state(struct kvm_vcpu *vcpu)
2099{ 2079{
@@ -2179,7 +2159,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
2179 loadsegment(es, vmx->host_state.es_sel); 2159 loadsegment(es, vmx->host_state.es_sel);
2180 } 2160 }
2181#endif 2161#endif
2182 reload_tss(); 2162 invalidate_tss_limit();
2183#ifdef CONFIG_X86_64 2163#ifdef CONFIG_X86_64
2184 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 2164 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2185#endif 2165#endif
@@ -2294,10 +2274,19 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2294 2274
2295 /* 2275 /*
2296 * Linux uses per-cpu TSS and GDT, so set these when switching 2276 * Linux uses per-cpu TSS and GDT, so set these when switching
2297 * processors. 2277 * processors. See 22.2.4.
2298 */ 2278 */
2299 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 2279 vmcs_writel(HOST_TR_BASE,
2300 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ 2280 (unsigned long)this_cpu_ptr(&cpu_tss));
2281 vmcs_writel(HOST_GDTR_BASE, gdt->address);
2282
2283 /*
2284 * VM exits change the host TR limit to 0x67 after a VM
2285 * exit. This is okay, since 0x67 covers everything except
2286 * the IO bitmap, and we have code to handle the IO bitmap
2287 * being lost after a VM exit.
2288 */
2289 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
2301 2290
2302 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 2291 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2303 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 2292 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -2340,25 +2329,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2340 } 2329 }
2341} 2330}
2342 2331
2343static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
2344{
2345 ulong cr0;
2346
2347 if (vcpu->fpu_active)
2348 return;
2349 vcpu->fpu_active = 1;
2350 cr0 = vmcs_readl(GUEST_CR0);
2351 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
2352 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
2353 vmcs_writel(GUEST_CR0, cr0);
2354 update_exception_bitmap(vcpu);
2355 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
2356 if (is_guest_mode(vcpu))
2357 vcpu->arch.cr0_guest_owned_bits &=
2358 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
2359 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2360}
2361
2362static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 2332static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2363 2333
2364/* 2334/*
@@ -2377,33 +2347,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2377 (fields->cr4_read_shadow & fields->cr4_guest_host_mask); 2347 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2378} 2348}
2379 2349
2380static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
2381{
2382 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
2383 * set this *before* calling this function.
2384 */
2385 vmx_decache_cr0_guest_bits(vcpu);
2386 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
2387 update_exception_bitmap(vcpu);
2388 vcpu->arch.cr0_guest_owned_bits = 0;
2389 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2390 if (is_guest_mode(vcpu)) {
2391 /*
2392 * L1's specified read shadow might not contain the TS bit,
2393 * so now that we turned on shadowing of this bit, we need to
2394 * set this bit of the shadow. Like in nested_vmx_run we need
2395 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
2396 * up-to-date here because we just decached cr0.TS (and we'll
2397 * only update vmcs12->guest_cr0 on nested exit).
2398 */
2399 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2400 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
2401 (vcpu->arch.cr0 & X86_CR0_TS);
2402 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2403 } else
2404 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2405}
2406
2407static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 2350static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2408{ 2351{
2409 unsigned long rflags, save_rflags; 2352 unsigned long rflags, save_rflags;
@@ -3962,7 +3905,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
3962 } 3905 }
3963 3906
3964 vmcs_write16(sf->selector, var.selector); 3907 vmcs_write16(sf->selector, var.selector);
3965 vmcs_write32(sf->base, var.base); 3908 vmcs_writel(sf->base, var.base);
3966 vmcs_write32(sf->limit, var.limit); 3909 vmcs_write32(sf->limit, var.limit);
3967 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3910 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3968} 3911}
@@ -4232,9 +4175,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4232 if (enable_ept) 4175 if (enable_ept)
4233 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 4176 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4234 4177
4235 if (!vcpu->fpu_active)
4236 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
4237
4238 vmcs_writel(CR0_READ_SHADOW, cr0); 4178 vmcs_writel(CR0_READ_SHADOW, cr0);
4239 vmcs_writel(GUEST_CR0, hw_cr0); 4179 vmcs_writel(GUEST_CR0, hw_cr0);
4240 vcpu->arch.cr0 = cr0; 4180 vcpu->arch.cr0 = cr0;
@@ -4953,7 +4893,7 @@ static bool vmx_get_enable_apicv(void)
4953 return enable_apicv; 4893 return enable_apicv;
4954} 4894}
4955 4895
4956static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4896static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4957{ 4897{
4958 struct vcpu_vmx *vmx = to_vmx(vcpu); 4898 struct vcpu_vmx *vmx = to_vmx(vcpu);
4959 int max_irr; 4899 int max_irr;
@@ -4964,19 +4904,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4964 vmx->nested.pi_pending) { 4904 vmx->nested.pi_pending) {
4965 vmx->nested.pi_pending = false; 4905 vmx->nested.pi_pending = false;
4966 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4906 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4967 return 0; 4907 return;
4968 4908
4969 max_irr = find_last_bit( 4909 max_irr = find_last_bit(
4970 (unsigned long *)vmx->nested.pi_desc->pir, 256); 4910 (unsigned long *)vmx->nested.pi_desc->pir, 256);
4971 4911
4972 if (max_irr == 256) 4912 if (max_irr == 256)
4973 return 0; 4913 return;
4974 4914
4975 vapic_page = kmap(vmx->nested.virtual_apic_page); 4915 vapic_page = kmap(vmx->nested.virtual_apic_page);
4976 if (!vapic_page) {
4977 WARN_ON(1);
4978 return -ENOMEM;
4979 }
4980 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); 4916 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4981 kunmap(vmx->nested.virtual_apic_page); 4917 kunmap(vmx->nested.virtual_apic_page);
4982 4918
@@ -4987,7 +4923,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4987 vmcs_write16(GUEST_INTR_STATUS, status); 4923 vmcs_write16(GUEST_INTR_STATUS, status);
4988 } 4924 }
4989 } 4925 }
4990 return 0;
4991} 4926}
4992 4927
4993static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) 4928static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5056,26 +4991,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
5056 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4991 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
5057 return; 4992 return;
5058 4993
5059 r = pi_test_and_set_on(&vmx->pi_desc); 4994 /* If a previous notification has sent the IPI, nothing to do. */
5060 kvm_make_request(KVM_REQ_EVENT, vcpu); 4995 if (pi_test_and_set_on(&vmx->pi_desc))
5061 if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
5062 kvm_vcpu_kick(vcpu);
5063}
5064
5065static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
5066{
5067 struct vcpu_vmx *vmx = to_vmx(vcpu);
5068
5069 if (!pi_test_on(&vmx->pi_desc))
5070 return; 4996 return;
5071 4997
5072 pi_clear_on(&vmx->pi_desc); 4998 if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
5073 /* 4999 kvm_vcpu_kick(vcpu);
5074 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
5075 * But on x86 this is just a compiler barrier anyway.
5076 */
5077 smp_mb__after_atomic();
5078 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
5079} 5000}
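
Editor's note: the simplification above leans on pi_test_and_set_on() being an atomic test-and-set, so only the first of several concurrent notifications pays for the IPI. A rough model of that gate with C11 atomics (sk_* names and the bit position are placeholders, not the real posted-interrupt descriptor layout):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SK_PI_ON (1u << 0)			/* placeholder for the ON bit */

static _Atomic uint32_t sk_pi_control;		/* stands in for pi_desc->control */

/* Returns true if ON was already set, i.e. an IPI is already on its way. */
static bool sk_test_and_set_on(void)
{
	return atomic_fetch_or(&sk_pi_control, SK_PI_ON) & SK_PI_ON;
}

static void sk_deliver(void (*send_ipi)(void))
{
	if (sk_test_and_set_on())
		return;		/* a previous notification already sent the IPI */
	send_ipi();
}
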
5080 5001
5081/* 5002/*
@@ -5236,10 +5157,8 @@ static void ept_set_mmio_spte_mask(void)
5236 /* 5157 /*
5237 * EPT Misconfigurations can be generated if the value of bits 2:0 5158 * EPT Misconfigurations can be generated if the value of bits 2:0
5238 * of an EPT paging-structure entry is 110b (write/execute). 5159 * of an EPT paging-structure entry is 110b (write/execute).
5239 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
5240 * spte.
5241 */ 5160 */
5242 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); 5161 kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
5243} 5162}
5244 5163
5245#define VMX_XSS_EXIT_BITMAP 0 5164#define VMX_XSS_EXIT_BITMAP 0
@@ -5342,7 +5261,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
5342 /* 22.2.1, 20.8.1 */ 5261 /* 22.2.1, 20.8.1 */
5343 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 5262 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
5344 5263
5345 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 5264 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5265 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5266
5346 set_cr4_guest_host_mask(vmx); 5267 set_cr4_guest_host_mask(vmx);
5347 5268
5348 if (vmx_xsaves_supported()) 5269 if (vmx_xsaves_supported())
@@ -5446,7 +5367,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5446 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 5367 vmx_set_cr0(vcpu, cr0); /* enter rmode */
5447 vmx_set_cr4(vcpu, 0); 5368 vmx_set_cr4(vcpu, 0);
5448 vmx_set_efer(vcpu, 0); 5369 vmx_set_efer(vcpu, 0);
5449 vmx_fpu_activate(vcpu); 5370
5450 update_exception_bitmap(vcpu); 5371 update_exception_bitmap(vcpu);
5451 5372
5452 vpid_sync_context(vmx->vpid); 5373 vpid_sync_context(vmx->vpid);
@@ -5480,26 +5401,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
5480 5401
5481static void enable_irq_window(struct kvm_vcpu *vcpu) 5402static void enable_irq_window(struct kvm_vcpu *vcpu)
5482{ 5403{
5483 u32 cpu_based_vm_exec_control; 5404 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5484 5405 CPU_BASED_VIRTUAL_INTR_PENDING);
5485 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5486 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
5487 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5488} 5406}
5489 5407
5490static void enable_nmi_window(struct kvm_vcpu *vcpu) 5408static void enable_nmi_window(struct kvm_vcpu *vcpu)
5491{ 5409{
5492 u32 cpu_based_vm_exec_control;
5493
5494 if (!cpu_has_virtual_nmis() || 5410 if (!cpu_has_virtual_nmis() ||
5495 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5411 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5496 enable_irq_window(vcpu); 5412 enable_irq_window(vcpu);
5497 return; 5413 return;
5498 } 5414 }
5499 5415
5500 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5416 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5501 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 5417 CPU_BASED_VIRTUAL_NMI_PENDING);
5502 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5503} 5418}
5504 5419
5505static void vmx_inject_irq(struct kvm_vcpu *vcpu) 5420static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -5725,11 +5640,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5725 if (is_nmi(intr_info)) 5640 if (is_nmi(intr_info))
5726 return 1; /* already handled by vmx_vcpu_run() */ 5641 return 1; /* already handled by vmx_vcpu_run() */
5727 5642
5728 if (is_no_device(intr_info)) {
5729 vmx_fpu_activate(vcpu);
5730 return 1;
5731 }
5732
5733 if (is_invalid_opcode(intr_info)) { 5643 if (is_invalid_opcode(intr_info)) {
5734 if (is_guest_mode(vcpu)) { 5644 if (is_guest_mode(vcpu)) {
5735 kvm_queue_exception(vcpu, UD_VECTOR); 5645 kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5919,22 +5829,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5919 return kvm_set_cr4(vcpu, val); 5829 return kvm_set_cr4(vcpu, val);
5920} 5830}
5921 5831
5922/* called to set cr0 as appropriate for clts instruction exit. */
5923static void handle_clts(struct kvm_vcpu *vcpu)
5924{
5925 if (is_guest_mode(vcpu)) {
5926 /*
5927 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
5928 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
5929 * just pretend it's off (also in arch.cr0 for fpu_activate).
5930 */
5931 vmcs_writel(CR0_READ_SHADOW,
5932 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
5933 vcpu->arch.cr0 &= ~X86_CR0_TS;
5934 } else
5935 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5936}
5937
5938static int handle_cr(struct kvm_vcpu *vcpu) 5832static int handle_cr(struct kvm_vcpu *vcpu)
5939{ 5833{
5940 unsigned long exit_qualification, val; 5834 unsigned long exit_qualification, val;
@@ -5980,9 +5874,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
5980 } 5874 }
5981 break; 5875 break;
5982 case 2: /* clts */ 5876 case 2: /* clts */
5983 handle_clts(vcpu); 5877 WARN_ONCE(1, "Guest should always own CR0.TS");
5878 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5984 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5879 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5985 vmx_fpu_activate(vcpu);
5986 return kvm_skip_emulated_instruction(vcpu); 5880 return kvm_skip_emulated_instruction(vcpu);
5987 case 1: /*mov from cr*/ 5881 case 1: /*mov from cr*/
5988 switch (cr) { 5882 switch (cr) {
@@ -6152,18 +6046,14 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
6152 6046
6153static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 6047static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6154{ 6048{
6155 kvm_make_request(KVM_REQ_EVENT, vcpu); 6049 kvm_apic_update_ppr(vcpu);
6156 return 1; 6050 return 1;
6157} 6051}
6158 6052
6159static int handle_interrupt_window(struct kvm_vcpu *vcpu) 6053static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6160{ 6054{
6161 u32 cpu_based_vm_exec_control; 6055 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6162 6056 CPU_BASED_VIRTUAL_INTR_PENDING);
6163 /* clear pending irq */
6164 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6165 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6166 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
6167 6057
6168 kvm_make_request(KVM_REQ_EVENT, vcpu); 6058 kvm_make_request(KVM_REQ_EVENT, vcpu);
6169 6059
@@ -6374,15 +6264,22 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6374 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 6264 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6375 trace_kvm_page_fault(gpa, exit_qualification); 6265 trace_kvm_page_fault(gpa, exit_qualification);
6376 6266
6377 /* it is a read fault? */ 6267 /* Is it a read fault? */
6378 error_code = (exit_qualification << 2) & PFERR_USER_MASK; 6268 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
6379 /* it is a write fault? */ 6269 ? PFERR_USER_MASK : 0;
6380 error_code |= exit_qualification & PFERR_WRITE_MASK; 6270 /* Is it a write fault? */
6381 /* It is a fetch fault? */ 6271 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
6382 error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; 6272 ? PFERR_WRITE_MASK : 0;
6383 /* ept page table is present? */ 6273 /* Is it a fetch fault? */
6384 error_code |= (exit_qualification & 0x38) != 0; 6274 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
6385 6275 ? PFERR_FETCH_MASK : 0;
6276 /* ept page table entry is present? */
6277 error_code |= (exit_qualification &
6278 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
6279 EPT_VIOLATION_EXECUTABLE))
6280 ? PFERR_PRESENT_MASK : 0;
6281
6282 vcpu->arch.gpa_available = true;
6386 vcpu->arch.exit_qualification = exit_qualification; 6283 vcpu->arch.exit_qualification = exit_qualification;
6387 6284
6388 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 6285 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6400,6 +6297,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6400 } 6297 }
6401 6298
6402 ret = handle_mmio_page_fault(vcpu, gpa, true); 6299 ret = handle_mmio_page_fault(vcpu, gpa, true);
6300 vcpu->arch.gpa_available = true;
6403 if (likely(ret == RET_MMIO_PF_EMULATE)) 6301 if (likely(ret == RET_MMIO_PF_EMULATE))
6404 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 6302 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
6405 EMULATE_DONE; 6303 EMULATE_DONE;
@@ -6421,12 +6319,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6421 6319
6422static int handle_nmi_window(struct kvm_vcpu *vcpu) 6320static int handle_nmi_window(struct kvm_vcpu *vcpu)
6423{ 6321{
6424 u32 cpu_based_vm_exec_control; 6322 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6425 6323 CPU_BASED_VIRTUAL_NMI_PENDING);
6426 /* clear pending NMI */
6427 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6428 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6429 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
6430 ++vcpu->stat.nmi_window_exits; 6324 ++vcpu->stat.nmi_window_exits;
6431 kvm_make_request(KVM_REQ_EVENT, vcpu); 6325 kvm_make_request(KVM_REQ_EVENT, vcpu);
6432 6326
@@ -6572,6 +6466,19 @@ static void wakeup_handler(void)
6572 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 6466 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6573} 6467}
6574 6468
6469void vmx_enable_tdp(void)
6470{
6471 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6472 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
6473 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6474 0ull, VMX_EPT_EXECUTABLE_MASK,
6475 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6476 enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
6477
6478 ept_set_mmio_spte_mask();
6479 kvm_enable_tdp();
6480}
6481
6575static __init int hardware_setup(void) 6482static __init int hardware_setup(void)
6576{ 6483{
6577 int r = -ENOMEM, i, msr; 6484 int r = -ENOMEM, i, msr;
@@ -6651,8 +6558,10 @@ static __init int hardware_setup(void)
6651 if (!cpu_has_vmx_ple()) 6558 if (!cpu_has_vmx_ple())
6652 ple_gap = 0; 6559 ple_gap = 0;
6653 6560
6654 if (!cpu_has_vmx_apicv()) 6561 if (!cpu_has_vmx_apicv()) {
6655 enable_apicv = 0; 6562 enable_apicv = 0;
6563 kvm_x86_ops->sync_pir_to_irr = NULL;
6564 }
6656 6565
6657 if (cpu_has_vmx_tsc_scaling()) { 6566 if (cpu_has_vmx_tsc_scaling()) {
6658 kvm_has_tsc_control = true; 6567 kvm_has_tsc_control = true;
@@ -6697,16 +6606,9 @@ static __init int hardware_setup(void)
6697 /* SELF-IPI */ 6606 /* SELF-IPI */
6698 vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); 6607 vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
6699 6608
6700 if (enable_ept) { 6609 if (enable_ept)
6701 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, 6610 vmx_enable_tdp();
6702 (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, 6611 else
6703 (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
6704 0ull, VMX_EPT_EXECUTABLE_MASK,
6705 cpu_has_vmx_ept_execute_only() ?
6706 0ull : VMX_EPT_READABLE_MASK);
6707 ept_set_mmio_spte_mask();
6708 kvm_enable_tdp();
6709 } else
6710 kvm_disable_tdp(); 6612 kvm_disable_tdp();
6711 6613
6712 update_ple_window_actual_max(); 6614 update_ple_window_actual_max();
@@ -7085,13 +6987,18 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
7085 } 6987 }
7086 6988
7087 page = nested_get_page(vcpu, vmptr); 6989 page = nested_get_page(vcpu, vmptr);
7088 if (page == NULL || 6990 if (page == NULL) {
7089 *(u32 *)kmap(page) != VMCS12_REVISION) {
7090 nested_vmx_failInvalid(vcpu); 6991 nested_vmx_failInvalid(vcpu);
6992 return kvm_skip_emulated_instruction(vcpu);
6993 }
6994 if (*(u32 *)kmap(page) != VMCS12_REVISION) {
7091 kunmap(page); 6995 kunmap(page);
6996 nested_release_page_clean(page);
6997 nested_vmx_failInvalid(vcpu);
7092 return kvm_skip_emulated_instruction(vcpu); 6998 return kvm_skip_emulated_instruction(vcpu);
7093 } 6999 }
7094 kunmap(page); 7000 kunmap(page);
7001 nested_release_page_clean(page);
7095 vmx->nested.vmxon_ptr = vmptr; 7002 vmx->nested.vmxon_ptr = vmptr;
7096 break; 7003 break;
7097 case EXIT_REASON_VMCLEAR: 7004 case EXIT_REASON_VMCLEAR:
@@ -7129,6 +7036,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
7129 return 0; 7036 return 0;
7130} 7037}
7131 7038
7039static int enter_vmx_operation(struct kvm_vcpu *vcpu)
7040{
7041 struct vcpu_vmx *vmx = to_vmx(vcpu);
7042 struct vmcs *shadow_vmcs;
7043
7044 if (cpu_has_vmx_msr_bitmap()) {
7045 vmx->nested.msr_bitmap =
7046 (unsigned long *)__get_free_page(GFP_KERNEL);
7047 if (!vmx->nested.msr_bitmap)
7048 goto out_msr_bitmap;
7049 }
7050
7051 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
7052 if (!vmx->nested.cached_vmcs12)
7053 goto out_cached_vmcs12;
7054
7055 if (enable_shadow_vmcs) {
7056 shadow_vmcs = alloc_vmcs();
7057 if (!shadow_vmcs)
7058 goto out_shadow_vmcs;
7059 /* mark vmcs as shadow */
7060 shadow_vmcs->revision_id |= (1u << 31);
7061 /* init shadow vmcs */
7062 vmcs_clear(shadow_vmcs);
7063 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
7064 }
7065
7066 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
7067 vmx->nested.vmcs02_num = 0;
7068
7069 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
7070 HRTIMER_MODE_REL_PINNED);
7071 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
7072
7073 vmx->nested.vmxon = true;
7074 return 0;
7075
7076out_shadow_vmcs:
7077 kfree(vmx->nested.cached_vmcs12);
7078
7079out_cached_vmcs12:
7080 free_page((unsigned long)vmx->nested.msr_bitmap);
7081
7082out_msr_bitmap:
7083 return -ENOMEM;
7084}
7085
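[Editor's note: the new enter_vmx_operation() above unwinds partially allocated state through a chain of goto labels, each label freeing only what was already set up. A self-contained userspace sketch of the same unwind pattern, using malloc/free and invented names:]

#include <errno.h>
#include <stdlib.h>

struct nested_state {
	void *msr_bitmap;
	void *cached_vmcs12;
};

static int nested_state_init(struct nested_state *n, size_t size)
{
	n->msr_bitmap = malloc(size);
	if (!n->msr_bitmap)
		goto out_msr_bitmap;

	n->cached_vmcs12 = malloc(size);
	if (!n->cached_vmcs12)
		goto out_cached_vmcs12;

	return 0;                       /* both allocations succeeded */

out_cached_vmcs12:
	free(n->msr_bitmap);            /* undo only what already succeeded */
out_msr_bitmap:
	return -ENOMEM;                 /* nothing earlier to undo */
}

static void nested_state_free(struct nested_state *n)
{
	free(n->cached_vmcs12);
	free(n->msr_bitmap);
}

int main(void)
{
	struct nested_state n;

	if (nested_state_init(&n, 4096))
		return 1;
	nested_state_free(&n);
	return 0;
}

[The labels are ordered so that falling through from a later failure releases earlier resources in reverse order, which is exactly how the kernel function above is laid out.]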
7132/* 7086/*
7133 * Emulate the VMXON instruction. 7087 * Emulate the VMXON instruction.
7134 * Currently, we just remember that VMX is active, and do not save or even 7088 * Currently, we just remember that VMX is active, and do not save or even
@@ -7139,9 +7093,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
7139 */ 7093 */
7140static int handle_vmon(struct kvm_vcpu *vcpu) 7094static int handle_vmon(struct kvm_vcpu *vcpu)
7141{ 7095{
7096 int ret;
7142 struct kvm_segment cs; 7097 struct kvm_segment cs;
7143 struct vcpu_vmx *vmx = to_vmx(vcpu); 7098 struct vcpu_vmx *vmx = to_vmx(vcpu);
7144 struct vmcs *shadow_vmcs;
7145 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 7099 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
7146 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 7100 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7147 7101
@@ -7168,9 +7122,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
7168 return 1; 7122 return 1;
7169 } 7123 }
7170 7124
7171 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
7172 return 1;
7173
7174 if (vmx->nested.vmxon) { 7125 if (vmx->nested.vmxon) {
7175 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 7126 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
7176 return kvm_skip_emulated_instruction(vcpu); 7127 return kvm_skip_emulated_instruction(vcpu);
@@ -7182,48 +7133,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
7182 return 1; 7133 return 1;
7183 } 7134 }
7184 7135
7185 if (cpu_has_vmx_msr_bitmap()) { 7136 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
7186 vmx->nested.msr_bitmap = 7137 return 1;
7187 (unsigned long *)__get_free_page(GFP_KERNEL); 7138
7188 if (!vmx->nested.msr_bitmap) 7139 ret = enter_vmx_operation(vcpu);
7189 goto out_msr_bitmap; 7140 if (ret)
7190 } 7141 return ret;
7191
7192 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
7193 if (!vmx->nested.cached_vmcs12)
7194 goto out_cached_vmcs12;
7195
7196 if (enable_shadow_vmcs) {
7197 shadow_vmcs = alloc_vmcs();
7198 if (!shadow_vmcs)
7199 goto out_shadow_vmcs;
7200 /* mark vmcs as shadow */
7201 shadow_vmcs->revision_id |= (1u << 31);
7202 /* init shadow vmcs */
7203 vmcs_clear(shadow_vmcs);
7204 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
7205 }
7206
7207 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
7208 vmx->nested.vmcs02_num = 0;
7209
7210 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
7211 HRTIMER_MODE_REL_PINNED);
7212 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
7213
7214 vmx->nested.vmxon = true;
7215 7142
7216 nested_vmx_succeed(vcpu); 7143 nested_vmx_succeed(vcpu);
7217 return kvm_skip_emulated_instruction(vcpu); 7144 return kvm_skip_emulated_instruction(vcpu);
7218
7219out_shadow_vmcs:
7220 kfree(vmx->nested.cached_vmcs12);
7221
7222out_cached_vmcs12:
7223 free_page((unsigned long)vmx->nested.msr_bitmap);
7224
7225out_msr_bitmap:
7226 return -ENOMEM;
7227} 7145}
7228 7146
7229/* 7147/*
@@ -7672,6 +7590,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
7672 return kvm_skip_emulated_instruction(vcpu); 7590 return kvm_skip_emulated_instruction(vcpu);
7673} 7591}
7674 7592
7593static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
7594{
7595 vmx->nested.current_vmptr = vmptr;
7596 if (enable_shadow_vmcs) {
7597 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7598 SECONDARY_EXEC_SHADOW_VMCS);
7599 vmcs_write64(VMCS_LINK_POINTER,
7600 __pa(vmx->vmcs01.shadow_vmcs));
7601 vmx->nested.sync_shadow_vmcs = true;
7602 }
7603}
7604
7675/* Emulate the VMPTRLD instruction */ 7605/* Emulate the VMPTRLD instruction */
7676static int handle_vmptrld(struct kvm_vcpu *vcpu) 7606static int handle_vmptrld(struct kvm_vcpu *vcpu)
7677{ 7607{
@@ -7702,7 +7632,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7702 } 7632 }
7703 7633
7704 nested_release_vmcs12(vmx); 7634 nested_release_vmcs12(vmx);
7705 vmx->nested.current_vmptr = vmptr;
7706 vmx->nested.current_vmcs12 = new_vmcs12; 7635 vmx->nested.current_vmcs12 = new_vmcs12;
7707 vmx->nested.current_vmcs12_page = page; 7636 vmx->nested.current_vmcs12_page = page;
7708 /* 7637 /*
@@ -7711,14 +7640,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7711 */ 7640 */
7712 memcpy(vmx->nested.cached_vmcs12, 7641 memcpy(vmx->nested.cached_vmcs12,
7713 vmx->nested.current_vmcs12, VMCS12_SIZE); 7642 vmx->nested.current_vmcs12, VMCS12_SIZE);
7714 7643 set_current_vmptr(vmx, vmptr);
7715 if (enable_shadow_vmcs) {
7716 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7717 SECONDARY_EXEC_SHADOW_VMCS);
7718 vmcs_write64(VMCS_LINK_POINTER,
7719 __pa(vmx->vmcs01.shadow_vmcs));
7720 vmx->nested.sync_shadow_vmcs = true;
7721 }
7722 } 7644 }
7723 7645
7724 nested_vmx_succeed(vcpu); 7646 nested_vmx_succeed(vcpu);
@@ -8191,8 +8113,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
8191 case EXIT_REASON_TASK_SWITCH: 8113 case EXIT_REASON_TASK_SWITCH:
8192 return true; 8114 return true;
8193 case EXIT_REASON_CPUID: 8115 case EXIT_REASON_CPUID:
8194 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
8195 return false;
8196 return true; 8116 return true;
8197 case EXIT_REASON_HLT: 8117 case EXIT_REASON_HLT:
8198 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 8118 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -8350,7 +8270,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
8350static void vmx_dump_sel(char *name, uint32_t sel) 8270static void vmx_dump_sel(char *name, uint32_t sel)
8351{ 8271{
8352 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 8272 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
8353 name, vmcs_read32(sel), 8273 name, vmcs_read16(sel),
8354 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 8274 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
8355 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 8275 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
8356 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 8276 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
@@ -8514,6 +8434,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
8514 u32 vectoring_info = vmx->idt_vectoring_info; 8434 u32 vectoring_info = vmx->idt_vectoring_info;
8515 8435
8516 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 8436 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
8437 vcpu->arch.gpa_available = false;
8517 8438
8518 /* 8439 /*
8519 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 8440 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -8732,6 +8653,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
8732 } 8653 }
8733} 8654}
8734 8655
8656static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
8657{
8658 struct vcpu_vmx *vmx = to_vmx(vcpu);
8659 int max_irr;
8660
8661 WARN_ON(!vcpu->arch.apicv_active);
8662 if (pi_test_on(&vmx->pi_desc)) {
8663 pi_clear_on(&vmx->pi_desc);
8664 /*
8665 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
8666 * But on x86 this is just a compiler barrier anyway.
8667 */
8668 smp_mb__after_atomic();
8669 max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
8670 } else {
8671 max_irr = kvm_lapic_find_highest_irr(vcpu);
8672 }
8673 vmx_hwapic_irr_update(vcpu, max_irr);
8674 return max_irr;
8675}
8676
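[Editor's note: the new vmx_sync_pir_to_irr() above clears the posted-interrupt "outstanding notification" flag, issues a barrier, and folds the PIR bits into the IRR before picking the highest pending vector. A simplified, self-contained sketch of that flow; C11 atomics stand in for the kernel's pi_test_on/pi_clear_on and smp_mb__after_atomic, and the descriptor layout is reduced to the essentials:]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct pi_desc {
	uint64_t pir[4];        /* 256 posted-interrupt request bits */
	atomic_int on;          /* outstanding-notification flag */
};

/* Merge pending posted interrupts into irr[] and return the highest vector. */
static int sync_pir_to_irr(struct pi_desc *pi, uint64_t irr[4])
{
	if (atomic_exchange(&pi->on, 0)) {
		/* Order the flag clear against the PIR reads below. */
		atomic_thread_fence(memory_order_acquire);
		for (int i = 0; i < 4; i++) {
			irr[i] |= pi->pir[i];
			pi->pir[i] = 0;
		}
	}
	for (int i = 3; i >= 0; i--)
		for (int b = 63; b >= 0; b--)
			if (irr[i] & (1ULL << b))
				return i * 64 + b;
	return -1;              /* nothing pending */
}

int main(void)
{
	struct pi_desc pi = { .pir = { 0, 1ULL << 3, 0, 0 }, .on = 1 };
	uint64_t irr[4] = { 0 };

	printf("highest pending vector: %d\n", sync_pir_to_irr(&pi, irr));
	return 0;
}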
8735static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8677static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
8736{ 8678{
8737 if (!kvm_vcpu_apicv_active(vcpu)) 8679 if (!kvm_vcpu_apicv_active(vcpu))
@@ -8743,6 +8685,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
8743 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 8685 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8744} 8686}
8745 8687
8688static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8689{
8690 struct vcpu_vmx *vmx = to_vmx(vcpu);
8691
8692 pi_clear_on(&vmx->pi_desc);
8693 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
8694}
8695
8746static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 8696static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
8747{ 8697{
8748 u32 exit_intr_info; 8698 u32 exit_intr_info;
@@ -9588,17 +9538,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
9588 kvm_inject_page_fault(vcpu, fault); 9538 kvm_inject_page_fault(vcpu, fault);
9589} 9539}
9590 9540
9591static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 9541static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9542 struct vmcs12 *vmcs12);
9543
9544static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9592 struct vmcs12 *vmcs12) 9545 struct vmcs12 *vmcs12)
9593{ 9546{
9594 struct vcpu_vmx *vmx = to_vmx(vcpu); 9547 struct vcpu_vmx *vmx = to_vmx(vcpu);
9595 int maxphyaddr = cpuid_maxphyaddr(vcpu); 9548 u64 hpa;
9596 9549
9597 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 9550 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
9598 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
9599 vmcs12->apic_access_addr >> maxphyaddr)
9600 return false;
9601
9602 /* 9551 /*
9603 * Translate L1 physical address to host physical 9552 * Translate L1 physical address to host physical
9604 * address for vmcs02. Keep the page pinned, so this 9553 * address for vmcs02. Keep the page pinned, so this
@@ -9609,59 +9558,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9609 nested_release_page(vmx->nested.apic_access_page); 9558 nested_release_page(vmx->nested.apic_access_page);
9610 vmx->nested.apic_access_page = 9559 vmx->nested.apic_access_page =
9611 nested_get_page(vcpu, vmcs12->apic_access_addr); 9560 nested_get_page(vcpu, vmcs12->apic_access_addr);
9561 /*
9562 * If translation failed, no matter: This feature asks
9563 * to exit when accessing the given address, and if it
9564 * can never be accessed, this feature won't do
9565 * anything anyway.
9566 */
9567 if (vmx->nested.apic_access_page) {
9568 hpa = page_to_phys(vmx->nested.apic_access_page);
9569 vmcs_write64(APIC_ACCESS_ADDR, hpa);
9570 } else {
9571 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
9572 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9573 }
9574 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9575 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9576 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9577 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9578 kvm_vcpu_reload_apic_access_page(vcpu);
9612 } 9579 }
9613 9580
9614 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 9581 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
9615 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
9616 vmcs12->virtual_apic_page_addr >> maxphyaddr)
9617 return false;
9618
9619 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 9582 if (vmx->nested.virtual_apic_page) /* shouldn't happen */
9620 nested_release_page(vmx->nested.virtual_apic_page); 9583 nested_release_page(vmx->nested.virtual_apic_page);
9621 vmx->nested.virtual_apic_page = 9584 vmx->nested.virtual_apic_page =
9622 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 9585 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
9623 9586
9624 /* 9587 /*
9625 * Failing the vm entry is _not_ what the processor does 9588 * If translation failed, VM entry will fail because
9626 * but it's basically the only possibility we have. 9589 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
9627 * We could still enter the guest if CR8 load exits are 9590 * Failing the vm entry is _not_ what the processor
9628 * enabled, CR8 store exits are enabled, and virtualize APIC 9591 * does but it's basically the only possibility we
9629 * access is disabled; in this case the processor would never 9592 * have. We could still enter the guest if CR8 load
9630 * use the TPR shadow and we could simply clear the bit from 9593 * exits are enabled, CR8 store exits are enabled, and
9631 * the execution control. But such a configuration is useless, 9594 * virtualize APIC access is disabled; in this case
9632 * so let's keep the code simple. 9595 * the processor would never use the TPR shadow and we
9596 * could simply clear the bit from the execution
9597 * control. But such a configuration is useless, so
9598 * let's keep the code simple.
9633 */ 9599 */
9634 if (!vmx->nested.virtual_apic_page) 9600 if (vmx->nested.virtual_apic_page) {
9635 return false; 9601 hpa = page_to_phys(vmx->nested.virtual_apic_page);
9602 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
9603 }
9636 } 9604 }
9637 9605
9638 if (nested_cpu_has_posted_intr(vmcs12)) { 9606 if (nested_cpu_has_posted_intr(vmcs12)) {
9639 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
9640 vmcs12->posted_intr_desc_addr >> maxphyaddr)
9641 return false;
9642
9643 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 9607 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
9644 kunmap(vmx->nested.pi_desc_page); 9608 kunmap(vmx->nested.pi_desc_page);
9645 nested_release_page(vmx->nested.pi_desc_page); 9609 nested_release_page(vmx->nested.pi_desc_page);
9646 } 9610 }
9647 vmx->nested.pi_desc_page = 9611 vmx->nested.pi_desc_page =
9648 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 9612 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
9649 if (!vmx->nested.pi_desc_page)
9650 return false;
9651
9652 vmx->nested.pi_desc = 9613 vmx->nested.pi_desc =
9653 (struct pi_desc *)kmap(vmx->nested.pi_desc_page); 9614 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
9654 if (!vmx->nested.pi_desc) { 9615 if (!vmx->nested.pi_desc) {
9655 nested_release_page_clean(vmx->nested.pi_desc_page); 9616 nested_release_page_clean(vmx->nested.pi_desc_page);
9656 return false; 9617 return;
9657 } 9618 }
9658 vmx->nested.pi_desc = 9619 vmx->nested.pi_desc =
9659 (struct pi_desc *)((void *)vmx->nested.pi_desc + 9620 (struct pi_desc *)((void *)vmx->nested.pi_desc +
9660 (unsigned long)(vmcs12->posted_intr_desc_addr & 9621 (unsigned long)(vmcs12->posted_intr_desc_addr &
9661 (PAGE_SIZE - 1))); 9622 (PAGE_SIZE - 1)));
9623 vmcs_write64(POSTED_INTR_DESC_ADDR,
9624 page_to_phys(vmx->nested.pi_desc_page) +
9625 (unsigned long)(vmcs12->posted_intr_desc_addr &
9626 (PAGE_SIZE - 1)));
9662 } 9627 }
9663 9628 if (cpu_has_vmx_msr_bitmap() &&
9664 return true; 9629 nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
9630 nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
9631 ;
9632 else
9633 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
9634 CPU_BASED_USE_MSR_BITMAPS);
9665} 9635}
9666 9636
9667static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 9637static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
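[Editor's note: the reworked nested_get_vmcs12_pages() above no longer fails the nested entry when a vmcs12-provided address cannot be mapped; it programs the VMCS field when translation succeeds and clears the corresponding execution control otherwise. A toy sketch of that fix-up-or-disable approach, with an invented resolve_guest_page() standing in for nested_get_page()/page_to_phys() and an illustrative control bit:]

#include <stdint.h>
#include <stdio.h>

#define SEC_EXEC_VIRTUALIZE_APIC_ACCESSES  (1u << 0)    /* illustrative bit */

/* Toy translation: fail for unaligned addresses, otherwise "map" them. */
static int resolve_guest_page(uint64_t gpa, uint64_t *hpa)
{
	if (gpa & 0xfff)
		return -1;
	*hpa = gpa | (1ULL << 40);      /* arbitrary host-side offset */
	return 0;
}

static void setup_apic_access_page(uint32_t *exec_control, uint64_t gpa)
{
	uint64_t hpa;

	if (!(*exec_control & SEC_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	if (resolve_guest_page(gpa, &hpa)) {
		/* Can't map it: turn the feature off instead of failing the entry. */
		*exec_control &= ~SEC_EXEC_VIRTUALIZE_APIC_ACCESSES;
		return;
	}
	printf("APIC access page mapped at %#llx\n", (unsigned long long)hpa);
}

int main(void)
{
	uint32_t ctrl = SEC_EXEC_VIRTUALIZE_APIC_ACCESSES;

	setup_apic_access_page(&ctrl, 0xfee00000);      /* maps fine */
	setup_apic_access_page(&ctrl, 0xfee00004);      /* fails: bit cleared */
	printf("exec control afterwards: %#x\n", ctrl);
	return 0;
}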
@@ -9730,11 +9700,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9730 return false; 9700 return false;
9731 } 9701 }
9732 msr_bitmap_l1 = (unsigned long *)kmap(page); 9702 msr_bitmap_l1 = (unsigned long *)kmap(page);
9733 if (!msr_bitmap_l1) {
9734 nested_release_page_clean(page);
9735 WARN_ON(1);
9736 return false;
9737 }
9738 9703
9739 memset(msr_bitmap_l0, 0xff, PAGE_SIZE); 9704 memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
9740 9705
@@ -9982,7 +9947,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
9982 * is assigned to entry_failure_code on failure. 9947 * is assigned to entry_failure_code on failure.
9983 */ 9948 */
9984static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, 9949static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
9985 unsigned long *entry_failure_code) 9950 u32 *entry_failure_code)
9986{ 9951{
9987 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { 9952 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
9988 if (!nested_cr3_valid(vcpu, cr3)) { 9953 if (!nested_cr3_valid(vcpu, cr3)) {
@@ -10022,7 +9987,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
10022 * is assigned to entry_failure_code on failure. 9987 * is assigned to entry_failure_code on failure.
10023 */ 9988 */
10024static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 9989static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10025 unsigned long *entry_failure_code) 9990 bool from_vmentry, u32 *entry_failure_code)
10026{ 9991{
10027 struct vcpu_vmx *vmx = to_vmx(vcpu); 9992 struct vcpu_vmx *vmx = to_vmx(vcpu);
10028 u32 exec_control; 9993 u32 exec_control;
@@ -10065,21 +10030,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10065 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 10030 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
10066 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 10031 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
10067 10032
10068 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 10033 if (from_vmentry &&
10034 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
10069 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 10035 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
10070 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 10036 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
10071 } else { 10037 } else {
10072 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 10038 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
10073 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 10039 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
10074 } 10040 }
10075 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 10041 if (from_vmentry) {
10076 vmcs12->vm_entry_intr_info_field); 10042 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
10077 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 10043 vmcs12->vm_entry_intr_info_field);
10078 vmcs12->vm_entry_exception_error_code); 10044 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
10079 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 10045 vmcs12->vm_entry_exception_error_code);
10080 vmcs12->vm_entry_instruction_len); 10046 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
10081 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 10047 vmcs12->vm_entry_instruction_len);
10082 vmcs12->guest_interruptibility_info); 10048 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
10049 vmcs12->guest_interruptibility_info);
10050 } else {
10051 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
10052 }
10083 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 10053 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
10084 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 10054 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
10085 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 10055 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10108,12 +10078,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10108 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 10078 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
10109 vmx->nested.pi_pending = false; 10079 vmx->nested.pi_pending = false;
10110 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 10080 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
10111 vmcs_write64(POSTED_INTR_DESC_ADDR, 10081 } else {
10112 page_to_phys(vmx->nested.pi_desc_page) +
10113 (unsigned long)(vmcs12->posted_intr_desc_addr &
10114 (PAGE_SIZE - 1)));
10115 } else
10116 exec_control &= ~PIN_BASED_POSTED_INTR; 10082 exec_control &= ~PIN_BASED_POSTED_INTR;
10083 }
10117 10084
10118 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 10085 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
10119 10086
@@ -10158,26 +10125,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10158 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 10125 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
10159 exec_control |= vmcs12->secondary_vm_exec_control; 10126 exec_control |= vmcs12->secondary_vm_exec_control;
10160 10127
10161 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
10162 /*
10163 * If translation failed, no matter: This feature asks
10164 * to exit when accessing the given address, and if it
10165 * can never be accessed, this feature won't do
10166 * anything anyway.
10167 */
10168 if (!vmx->nested.apic_access_page)
10169 exec_control &=
10170 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10171 else
10172 vmcs_write64(APIC_ACCESS_ADDR,
10173 page_to_phys(vmx->nested.apic_access_page));
10174 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
10175 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
10176 exec_control |=
10177 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10178 kvm_vcpu_reload_apic_access_page(vcpu);
10179 }
10180
10181 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 10128 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
10182 vmcs_write64(EOI_EXIT_BITMAP0, 10129 vmcs_write64(EOI_EXIT_BITMAP0,
10183 vmcs12->eoi_exit_bitmap0); 10130 vmcs12->eoi_exit_bitmap0);
@@ -10192,6 +10139,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10192 } 10139 }
10193 10140
10194 nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; 10141 nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
10142
10143 /*
10144 * Write an illegal value to APIC_ACCESS_ADDR. Later,
10145 * nested_get_vmcs12_pages will either fix it up or
10146 * remove the VM execution control.
10147 */
10148 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
10149 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
10150
10195 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 10151 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
10196 } 10152 }
10197 10153
@@ -10228,19 +10184,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10228 exec_control &= ~CPU_BASED_TPR_SHADOW; 10184 exec_control &= ~CPU_BASED_TPR_SHADOW;
10229 exec_control |= vmcs12->cpu_based_vm_exec_control; 10185 exec_control |= vmcs12->cpu_based_vm_exec_control;
10230 10186
10187 /*
10188 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
10189 * nested_get_vmcs12_pages can't fix it up, the illegal value
10190 * will result in a VM entry failure.
10191 */
10231 if (exec_control & CPU_BASED_TPR_SHADOW) { 10192 if (exec_control & CPU_BASED_TPR_SHADOW) {
10232 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 10193 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
10233 page_to_phys(vmx->nested.virtual_apic_page));
10234 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 10194 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
10235 } 10195 }
10236 10196
10237 if (cpu_has_vmx_msr_bitmap() &&
10238 exec_control & CPU_BASED_USE_MSR_BITMAPS &&
10239 nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
10240 ; /* MSR_BITMAP will be set by following vmx_set_efer. */
10241 else
10242 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
10243
10244 /* 10197 /*
10245 * Merging of IO bitmap not currently supported. 10198 * Merging of IO bitmap not currently supported.
10246 * Rather, exit every time. 10199 * Rather, exit every time.
@@ -10272,16 +10225,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10272 ~VM_ENTRY_IA32E_MODE) | 10225 ~VM_ENTRY_IA32E_MODE) |
10273 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 10226 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
10274 10227
10275 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 10228 if (from_vmentry &&
10229 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
10276 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 10230 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
10277 vcpu->arch.pat = vmcs12->guest_ia32_pat; 10231 vcpu->arch.pat = vmcs12->guest_ia32_pat;
10278 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 10232 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
10279 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 10233 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
10280 10234 }
10281 10235
10282 set_cr4_guest_host_mask(vmx); 10236 set_cr4_guest_host_mask(vmx);
10283 10237
10284 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 10238 if (from_vmentry &&
10239 vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
10285 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 10240 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
10286 10241
10287 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 10242 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10320,8 +10275,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10320 } 10275 }
10321 10276
10322 /* 10277 /*
10323 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 10278 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
10324 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 10279 * bits which we consider mandatory enabled.
10325 * The CR0_READ_SHADOW is what L2 should have expected to read given 10280 * The CR0_READ_SHADOW is what L2 should have expected to read given
10326 * the specifications by L1; It's not enough to take 10281 * the specifications by L1; It's not enough to take
10327 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 10282 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
@@ -10333,7 +10288,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10333 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 10288 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
10334 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 10289 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
10335 10290
10336 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 10291 if (from_vmentry &&
10292 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
10337 vcpu->arch.efer = vmcs12->guest_ia32_efer; 10293 vcpu->arch.efer = vmcs12->guest_ia32_efer;
10338 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 10294 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
10339 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 10295 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10367,73 +10323,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10367 return 0; 10323 return 0;
10368} 10324}
10369 10325
10370/* 10326static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10371 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
10372 * for running an L2 nested guest.
10373 */
10374static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10375{ 10327{
10376 struct vmcs12 *vmcs12;
10377 struct vcpu_vmx *vmx = to_vmx(vcpu); 10328 struct vcpu_vmx *vmx = to_vmx(vcpu);
10378 int cpu;
10379 struct loaded_vmcs *vmcs02;
10380 bool ia32e;
10381 u32 msr_entry_idx;
10382 unsigned long exit_qualification;
10383
10384 if (!nested_vmx_check_permission(vcpu))
10385 return 1;
10386
10387 if (!nested_vmx_check_vmcs12(vcpu))
10388 goto out;
10389
10390 vmcs12 = get_vmcs12(vcpu);
10391
10392 if (enable_shadow_vmcs)
10393 copy_shadow_to_vmcs12(vmx);
10394
10395 /*
10396 * The nested entry process starts with enforcing various prerequisites
10397 * on vmcs12 as required by the Intel SDM, and act appropriately when
10398 * they fail: As the SDM explains, some conditions should cause the
10399 * instruction to fail, while others will cause the instruction to seem
10400 * to succeed, but return an EXIT_REASON_INVALID_STATE.
10401 * To speed up the normal (success) code path, we should avoid checking
10402 * for misconfigurations which will anyway be caught by the processor
10403 * when using the merged vmcs02.
10404 */
10405 if (vmcs12->launch_state == launch) {
10406 nested_vmx_failValid(vcpu,
10407 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
10408 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
10409 goto out;
10410 }
10411 10329
10412 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 10330 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
10413 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 10331 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
10414 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10332 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10415 goto out;
10416 }
10417 10333
10418 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 10334 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
10419 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10335 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10420 goto out;
10421 }
10422 10336
10423 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { 10337 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
10424 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10338 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10425 goto out;
10426 }
10427 10339
10428 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { 10340 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
10429 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10341 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10430 goto out;
10431 }
10432
10433 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
10434 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
10435 goto out;
10436 }
10437 10342
10438 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 10343 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
10439 vmx->nested.nested_vmx_procbased_ctls_low, 10344 vmx->nested.nested_vmx_procbased_ctls_low,
@@ -10450,28 +10355,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10450 !vmx_control_verify(vmcs12->vm_entry_controls, 10355 !vmx_control_verify(vmcs12->vm_entry_controls,
10451 vmx->nested.nested_vmx_entry_ctls_low, 10356 vmx->nested.nested_vmx_entry_ctls_low,
10452 vmx->nested.nested_vmx_entry_ctls_high)) 10357 vmx->nested.nested_vmx_entry_ctls_high))
10453 { 10358 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10454 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
10455 goto out;
10456 }
10457 10359
10458 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || 10360 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
10459 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || 10361 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
10460 !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { 10362 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
10461 nested_vmx_failValid(vcpu, 10363 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
10462 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 10364
10463 goto out; 10365 return 0;
10464 } 10366}
10367
10368static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10369 u32 *exit_qual)
10370{
10371 bool ia32e;
10372
10373 *exit_qual = ENTRY_FAIL_DEFAULT;
10465 10374
10466 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || 10375 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
10467 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { 10376 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
10468 nested_vmx_entry_failure(vcpu, vmcs12,
10469 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10470 return 1; 10377 return 1;
10471 } 10378
10472 if (vmcs12->vmcs_link_pointer != -1ull) { 10379 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
10473 nested_vmx_entry_failure(vcpu, vmcs12, 10380 vmcs12->vmcs_link_pointer != -1ull) {
10474 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); 10381 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
10475 return 1; 10382 return 1;
10476 } 10383 }
10477 10384
@@ -10484,16 +10391,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10484 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 10391 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
10485 * CR0.PG) is 1. 10392 * CR0.PG) is 1.
10486 */ 10393 */
10487 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 10394 if (to_vmx(vcpu)->nested.nested_run_pending &&
10395 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
10488 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 10396 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
10489 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 10397 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
10490 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 10398 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
10491 ((vmcs12->guest_cr0 & X86_CR0_PG) && 10399 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
10492 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 10400 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
10493 nested_vmx_entry_failure(vcpu, vmcs12,
10494 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10495 return 1; 10401 return 1;
10496 }
10497 } 10402 }
10498 10403
10499 /* 10404 /*
@@ -10507,28 +10412,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10507 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 10412 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
10508 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 10413 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
10509 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 10414 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
10510 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 10415 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
10511 nested_vmx_entry_failure(vcpu, vmcs12,
10512 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10513 return 1; 10416 return 1;
10514 }
10515 } 10417 }
10516 10418
10517 /* 10419 return 0;
10518 * We're finally done with prerequisite checking, and can start with 10420}
10519 * the nested entry. 10421
10520 */ 10422static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
10423{
10424 struct vcpu_vmx *vmx = to_vmx(vcpu);
10425 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10426 struct loaded_vmcs *vmcs02;
10427 int cpu;
10428 u32 msr_entry_idx;
10429 u32 exit_qual;
10521 10430
10522 vmcs02 = nested_get_current_vmcs02(vmx); 10431 vmcs02 = nested_get_current_vmcs02(vmx);
10523 if (!vmcs02) 10432 if (!vmcs02)
10524 return -ENOMEM; 10433 return -ENOMEM;
10525 10434
10526 /*
10527 * After this point, the trap flag no longer triggers a singlestep trap
10528 * on the vm entry instructions. Don't call
10529 * kvm_skip_emulated_instruction.
10530 */
10531 skip_emulated_instruction(vcpu);
10532 enter_guest_mode(vcpu); 10435 enter_guest_mode(vcpu);
10533 10436
10534 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 10437 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10543,14 +10446,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10543 10446
10544 vmx_segment_cache_clear(vmx); 10447 vmx_segment_cache_clear(vmx);
10545 10448
10546 if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { 10449 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
10547 leave_guest_mode(vcpu); 10450 leave_guest_mode(vcpu);
10548 vmx_load_vmcs01(vcpu); 10451 vmx_load_vmcs01(vcpu);
10549 nested_vmx_entry_failure(vcpu, vmcs12, 10452 nested_vmx_entry_failure(vcpu, vmcs12,
10550 EXIT_REASON_INVALID_STATE, exit_qualification); 10453 EXIT_REASON_INVALID_STATE, exit_qual);
10551 return 1; 10454 return 1;
10552 } 10455 }
10553 10456
10457 nested_get_vmcs12_pages(vcpu, vmcs12);
10458
10554 msr_entry_idx = nested_vmx_load_msr(vcpu, 10459 msr_entry_idx = nested_vmx_load_msr(vcpu,
10555 vmcs12->vm_entry_msr_load_addr, 10460 vmcs12->vm_entry_msr_load_addr,
10556 vmcs12->vm_entry_msr_load_count); 10461 vmcs12->vm_entry_msr_load_count);
@@ -10564,17 +10469,90 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10564 10469
10565 vmcs12->launch_state = 1; 10470 vmcs12->launch_state = 1;
10566 10471
10567 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
10568 return kvm_vcpu_halt(vcpu);
10569
10570 vmx->nested.nested_run_pending = 1;
10571
10572 /* 10472 /*
10573 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 10473 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
10574 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 10474 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
10575 * returned as far as L1 is concerned. It will only return (and set 10475 * returned as far as L1 is concerned. It will only return (and set
10576 * the success flag) when L2 exits (see nested_vmx_vmexit()). 10476 * the success flag) when L2 exits (see nested_vmx_vmexit()).
10577 */ 10477 */
10478 return 0;
10479}
10480
10481/*
10482 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
10483 * for running an L2 nested guest.
10484 */
10485static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10486{
10487 struct vmcs12 *vmcs12;
10488 struct vcpu_vmx *vmx = to_vmx(vcpu);
10489 u32 exit_qual;
10490 int ret;
10491
10492 if (!nested_vmx_check_permission(vcpu))
10493 return 1;
10494
10495 if (!nested_vmx_check_vmcs12(vcpu))
10496 goto out;
10497
10498 vmcs12 = get_vmcs12(vcpu);
10499
10500 if (enable_shadow_vmcs)
10501 copy_shadow_to_vmcs12(vmx);
10502
10503 /*
10504 * The nested entry process starts with enforcing various prerequisites
10505 * on vmcs12 as required by the Intel SDM, and act appropriately when
10506 * they fail: As the SDM explains, some conditions should cause the
10507 * instruction to fail, while others will cause the instruction to seem
10508 * to succeed, but return an EXIT_REASON_INVALID_STATE.
10509 * To speed up the normal (success) code path, we should avoid checking
10510 * for misconfigurations which will anyway be caught by the processor
10511 * when using the merged vmcs02.
10512 */
10513 if (vmcs12->launch_state == launch) {
10514 nested_vmx_failValid(vcpu,
10515 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
10516 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
10517 goto out;
10518 }
10519
10520 ret = check_vmentry_prereqs(vcpu, vmcs12);
10521 if (ret) {
10522 nested_vmx_failValid(vcpu, ret);
10523 goto out;
10524 }
10525
10526 /*
10527 * After this point, the trap flag no longer triggers a singlestep trap
10528 * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
10529 * This is not 100% correct; for performance reasons, we delegate most
10530 * of the checks on host state to the processor. If those fail,
10531 * the singlestep trap is missed.
10532 */
10533 skip_emulated_instruction(vcpu);
10534
10535 ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
10536 if (ret) {
10537 nested_vmx_entry_failure(vcpu, vmcs12,
10538 EXIT_REASON_INVALID_STATE, exit_qual);
10539 return 1;
10540 }
10541
10542 /*
10543 * We're finally done with prerequisite checking, and can start with
10544 * the nested entry.
10545 */
10546
10547 ret = enter_vmx_non_root_mode(vcpu, true);
10548 if (ret)
10549 return ret;
10550
10551 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
10552 return kvm_vcpu_halt(vcpu);
10553
10554 vmx->nested.nested_run_pending = 1;
10555
10578 return 1; 10556 return 1;
10579 10557
10580out: 10558out:
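[Editor's note: the nested_vmx_run() refactor above separates the validation into check_vmentry_prereqs(), whose failures make the VMLAUNCH/VMRESUME instruction itself fail with an error number, and check_vmentry_postreqs(), whose failures surface as a failed VM entry with an exit qualification. A control-flow sketch of that split; all names and numeric codes here are illustrative:]

#include <stdio.h>

#define ERR_INVALID_CONTROL_FIELD  7
#define ENTRY_FAIL_DEFAULT         0

static int check_prereqs(int control_field_ok)
{
	return control_field_ok ? 0 : ERR_INVALID_CONTROL_FIELD;
}

static int check_postreqs(int guest_state_ok, int *exit_qual)
{
	*exit_qual = ENTRY_FAIL_DEFAULT;
	return guest_state_ok ? 0 : 1;
}

static void nested_run(int control_field_ok, int guest_state_ok)
{
	int exit_qual, err;

	err = check_prereqs(control_field_ok);
	if (err) {
		printf("instruction fails: VMfailValid(%d)\n", err);
		return;
	}
	if (check_postreqs(guest_state_ok, &exit_qual)) {
		printf("entry fails: EXIT_REASON_INVALID_STATE, qual=%d\n",
		       exit_qual);
		return;
	}
	printf("enter L2\n");
}

int main(void)
{
	nested_run(0, 1);       /* bad control field -> instruction error */
	nested_run(1, 0);       /* bad guest state   -> failed VM entry   */
	nested_run(1, 1);       /* everything OK     -> enter the L2 guest */
	return 0;
}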
@@ -10696,7 +10674,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
10696 return 0; 10674 return 0;
10697 } 10675 }
10698 10676
10699 return vmx_complete_nested_posted_interrupt(vcpu); 10677 vmx_complete_nested_posted_interrupt(vcpu);
10678 return 0;
10700} 10679}
10701 10680
10702static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 10681static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -10714,21 +10693,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
10714} 10693}
10715 10694
10716/* 10695/*
10717 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 10696 * Update the guest state fields of vmcs12 to reflect changes that
10718 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 10697 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
10719 * and this function updates it to reflect the changes to the guest state while 10698 * VM-entry controls is also updated, since this is really a guest
10720 * L2 was running (and perhaps made some exits which were handled directly by L0 10699 * state bit.)
10721 * without going back to L1), and to reflect the exit reason.
10722 * Note that we do not have to copy here all VMCS fields, just those that
10723 * could have changed by the L2 guest or the exit - i.e., the guest-state and
10724 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
10725 * which already writes to vmcs12 directly.
10726 */ 10700 */
10727static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 10701static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10728 u32 exit_reason, u32 exit_intr_info,
10729 unsigned long exit_qualification)
10730{ 10702{
10731 /* update guest state fields: */
10732 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 10703 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
10733 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 10704 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
10734 10705
@@ -10834,6 +10805,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10834 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 10805 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
10835 if (nested_cpu_has_xsaves(vmcs12)) 10806 if (nested_cpu_has_xsaves(vmcs12))
10836 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); 10807 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
10808}
10809
10810/*
10811 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
10812 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
10813 * and this function updates it to reflect the changes to the guest state while
10814 * L2 was running (and perhaps made some exits which were handled directly by L0
10815 * without going back to L1), and to reflect the exit reason.
10816 * Note that we do not have to copy here all VMCS fields, just those that
10817 * could have changed by the L2 guest or the exit - i.e., the guest-state and
10818 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
10819 * which already writes to vmcs12 directly.
10820 */
10821static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10822 u32 exit_reason, u32 exit_intr_info,
10823 unsigned long exit_qualification)
10824{
10825 /* update guest state fields: */
10826 sync_vmcs12(vcpu, vmcs12);
10837 10827
10838 /* update exit information fields: */ 10828 /* update exit information fields: */
10839 10829
@@ -10884,7 +10874,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
10884 struct vmcs12 *vmcs12) 10874 struct vmcs12 *vmcs12)
10885{ 10875{
10886 struct kvm_segment seg; 10876 struct kvm_segment seg;
10887 unsigned long entry_failure_code; 10877 u32 entry_failure_code;
10888 10878
10889 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 10879 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
10890 vcpu->arch.efer = vmcs12->host_ia32_efer; 10880 vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10899,24 +10889,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
10899 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 10889 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
10900 /* 10890 /*
10901 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 10891 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
10902 * actually changed, because it depends on the current state of 10892 * actually changed, because vmx_set_cr0 refers to efer set above.
10903 * fpu_active (which may have changed). 10893 *
10904 * Note that vmx_set_cr0 refers to efer set above. 10894 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
10895 * (KVM doesn't change it);
10905 */ 10896 */
10897 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
10906 vmx_set_cr0(vcpu, vmcs12->host_cr0); 10898 vmx_set_cr0(vcpu, vmcs12->host_cr0);
10907 /*
10908 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
10909 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
10910 * but we also need to update cr0_guest_host_mask and exception_bitmap.
10911 */
10912 update_exception_bitmap(vcpu);
10913 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
10914 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
10915 10899
10916 /* 10900 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
10917 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
10918 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
10919 */
10920 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 10901 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
10921 kvm_set_cr4(vcpu, vmcs12->host_cr4); 10902 kvm_set_cr4(vcpu, vmcs12->host_cr4);
10922 10903
@@ -11545,9 +11526,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
11545 11526
11546 .get_pkru = vmx_get_pkru, 11527 .get_pkru = vmx_get_pkru,
11547 11528
11548 .fpu_activate = vmx_fpu_activate,
11549 .fpu_deactivate = vmx_fpu_deactivate,
11550
11551 .tlb_flush = vmx_flush_tlb, 11529 .tlb_flush = vmx_flush_tlb,
11552 11530
11553 .run = vmx_vcpu_run, 11531 .run = vmx_vcpu_run,
@@ -11572,6 +11550,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
11572 .get_enable_apicv = vmx_get_enable_apicv, 11550 .get_enable_apicv = vmx_get_enable_apicv,
11573 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 11551 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
11574 .load_eoi_exitmap = vmx_load_eoi_exitmap, 11552 .load_eoi_exitmap = vmx_load_eoi_exitmap,
11553 .apicv_post_state_restore = vmx_apicv_post_state_restore,
11575 .hwapic_irr_update = vmx_hwapic_irr_update, 11554 .hwapic_irr_update = vmx_hwapic_irr_update,
11576 .hwapic_isr_update = vmx_hwapic_isr_update, 11555 .hwapic_isr_update = vmx_hwapic_isr_update,
11577 .sync_pir_to_irr = vmx_sync_pir_to_irr, 11556 .sync_pir_to_irr = vmx_sync_pir_to_irr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e52c9088660f..b2a4b11274b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
180 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 180 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
181 { "irq_injections", VCPU_STAT(irq_injections) }, 181 { "irq_injections", VCPU_STAT(irq_injections) },
182 { "nmi_injections", VCPU_STAT(nmi_injections) }, 182 { "nmi_injections", VCPU_STAT(nmi_injections) },
183 { "req_event", VCPU_STAT(req_event) },
183 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 184 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
184 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 185 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
185 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 186 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
190 { "mmu_unsync", VM_STAT(mmu_unsync) }, 191 { "mmu_unsync", VM_STAT(mmu_unsync) },
191 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 192 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
192 { "largepages", VM_STAT(lpages) }, 193 { "largepages", VM_STAT(lpages) },
194 { "max_mmu_page_hash_collisions",
195 VM_STAT(max_mmu_page_hash_collisions) },
193 { NULL } 196 { NULL }
194}; 197};
195 198
@@ -1139,6 +1142,7 @@ struct pvclock_gtod_data {
1139 1142
1140 u64 boot_ns; 1143 u64 boot_ns;
1141 u64 nsec_base; 1144 u64 nsec_base;
1145 u64 wall_time_sec;
1142}; 1146};
1143 1147
1144static struct pvclock_gtod_data pvclock_gtod_data; 1148static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk)
1162 vdata->boot_ns = boot_ns; 1166 vdata->boot_ns = boot_ns;
1163 vdata->nsec_base = tk->tkr_mono.xtime_nsec; 1167 vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1164 1168
1169 vdata->wall_time_sec = tk->xtime_sec;
1170
1165 write_seqcount_end(&vdata->seq); 1171 write_seqcount_end(&vdata->seq);
1166} 1172}
1167#endif 1173#endif
@@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
1623 return mode; 1629 return mode;
1624} 1630}
1625 1631
1632static int do_realtime(struct timespec *ts, u64 *cycle_now)
1633{
1634 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1635 unsigned long seq;
1636 int mode;
1637 u64 ns;
1638
1639 do {
1640 seq = read_seqcount_begin(&gtod->seq);
1641 mode = gtod->clock.vclock_mode;
1642 ts->tv_sec = gtod->wall_time_sec;
1643 ns = gtod->nsec_base;
1644 ns += vgettsc(cycle_now);
1645 ns >>= gtod->clock.shift;
1646 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1647
1648 ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
1649 ts->tv_nsec = ns;
1650
1651 return mode;
1652}
1653
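[Editor's note: the new do_realtime() above reads wall time under a seqcount: snapshot the sequence, copy the fields, and retry if a writer intervened. A simplified userspace analogue of that retry discipline; this is not the kernel's seqcount API, just the same idea with invented names:]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct clock_page {
	atomic_uint seq;        /* odd while a writer is mid-update */
	uint64_t wall_sec;
	uint64_t nsec_base;
};

static void read_clock(const struct clock_page *c, uint64_t *sec, uint64_t *nsec)
{
	unsigned int start;

	do {
		do {            /* wait out an in-progress update */
			start = atomic_load_explicit(&c->seq, memory_order_acquire);
		} while (start & 1);

		*sec = c->wall_sec;
		*nsec = c->nsec_base;

		atomic_thread_fence(memory_order_acquire);
		/* retry if the sequence moved while we were copying */
	} while (atomic_load_explicit(&c->seq, memory_order_relaxed) != start);
}

int main(void)
{
	struct clock_page c = { .seq = 2, .wall_sec = 1487804573,
				.nsec_base = 123456789 };
	uint64_t sec, nsec;

	read_clock(&c, &sec, &nsec);
	printf("%llu.%09llu\n", (unsigned long long)sec, (unsigned long long)nsec);
	return 0;
}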
1626/* returns true if host is using tsc clocksource */ 1654/* returns true if host is using tsc clocksource */
1627static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) 1655static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
1628{ 1656{
@@ -1632,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
1632 1660
1633 return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; 1661 return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
1634} 1662}
1663
1664/* returns true if host is using tsc clocksource */
1665static bool kvm_get_walltime_and_clockread(struct timespec *ts,
1666 u64 *cycle_now)
1667{
1668 /* checked again under seqlock below */
1669 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1670 return false;
1671
1672 return do_realtime(ts, cycle_now) == VCLOCK_TSC;
1673}
1635#endif 1674#endif
1636 1675
1637/* 1676/*
@@ -1772,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1772 struct kvm_vcpu_arch *vcpu = &v->arch; 1811 struct kvm_vcpu_arch *vcpu = &v->arch;
1773 struct pvclock_vcpu_time_info guest_hv_clock; 1812 struct pvclock_vcpu_time_info guest_hv_clock;
1774 1813
1775 if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, 1814 if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
1776 &guest_hv_clock, sizeof(guest_hv_clock)))) 1815 &guest_hv_clock, sizeof(guest_hv_clock))))
1777 return; 1816 return;
1778 1817
@@ -1793,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1793 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); 1832 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
1794 1833
1795 vcpu->hv_clock.version = guest_hv_clock.version + 1; 1834 vcpu->hv_clock.version = guest_hv_clock.version + 1;
1796 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1835 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1797 &vcpu->hv_clock, 1836 &vcpu->hv_clock,
1798 sizeof(vcpu->hv_clock.version)); 1837 sizeof(vcpu->hv_clock.version));
1799 1838
1800 smp_wmb(); 1839 smp_wmb();
1801 1840
@@ -1809,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1809 1848
1810 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); 1849 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
1811 1850
1812 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1851 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1813 &vcpu->hv_clock, 1852 &vcpu->hv_clock,
1814 sizeof(vcpu->hv_clock)); 1853 sizeof(vcpu->hv_clock));
1815 1854
1816 smp_wmb(); 1855 smp_wmb();
1817 1856
1818 vcpu->hv_clock.version++; 1857 vcpu->hv_clock.version++;
1819 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1858 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1820 &vcpu->hv_clock, 1859 &vcpu->hv_clock,
1821 sizeof(vcpu->hv_clock.version)); 1860 sizeof(vcpu->hv_clock.version));
1822} 1861}
1823 1862
1824static int kvm_guest_time_update(struct kvm_vcpu *v) 1863static int kvm_guest_time_update(struct kvm_vcpu *v)
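[Editor's note: the kvm_setup_pvclock_page() hunks above publish the guest clock with a version handshake: bump the version to an odd value, write the payload between write barriers, then make the version even again so a guest reader can detect a torn update. A hedged userspace sketch of that writer protocol; C11 fences approximate smp_wmb(), and the field names are illustrative:]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct pv_time_info {
	atomic_uint version;    /* odd => update in progress */
	uint64_t system_time;
	uint64_t tsc_timestamp;
};

static void publish_clock(struct pv_time_info *p, uint64_t now, uint64_t tsc)
{
	unsigned int v = atomic_load_explicit(&p->version, memory_order_relaxed);

	/* Step 1: make the version odd so readers know a write is in flight. */
	atomic_store_explicit(&p->version, v + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */

	/* Step 2: write the payload. */
	p->system_time = now;
	p->tsc_timestamp = tsc;

	/* Step 3: make the version even again, after the payload is written. */
	atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
	atomic_store_explicit(&p->version, v + 2, memory_order_relaxed);
}

int main(void)
{
	struct pv_time_info p = { .version = 0 };

	publish_clock(&p, 1000000000ULL, 123456789ULL);
	printf("version=%u system_time=%llu\n",
	       atomic_load(&p.version),
	       (unsigned long long)p.system_time);
	return 0;
}

[This is the writer-side counterpart of the seqcount reader sketched after do_realtime() above.]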
@@ -2051,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2051 return 0; 2090 return 0;
2052 } 2091 }
2053 2092
2054 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, 2093 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
2055 sizeof(u32))) 2094 sizeof(u32)))
2056 return 1; 2095 return 1;
2057 2096
@@ -2070,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2070 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 2109 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2071 return; 2110 return;
2072 2111
2073 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2112 if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
2074 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) 2113 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2075 return; 2114 return;
2076 2115
@@ -2081,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2081 2120
2082 vcpu->arch.st.steal.version += 1; 2121 vcpu->arch.st.steal.version += 1;
2083 2122
2084 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2123 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2085 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2124 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2086 2125
2087 smp_wmb(); 2126 smp_wmb();
@@ -2090,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2090 vcpu->arch.st.last_steal; 2129 vcpu->arch.st.last_steal;
2091 vcpu->arch.st.last_steal = current->sched_info.run_delay; 2130 vcpu->arch.st.last_steal = current->sched_info.run_delay;
2092 2131
2093 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2132 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2094 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2133 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2095 2134
2096 smp_wmb(); 2135 smp_wmb();
2097 2136
2098 vcpu->arch.st.steal.version += 1; 2137 vcpu->arch.st.steal.version += 1;
2099 2138
2100 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2139 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2101 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2140 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2102} 2141}
2103 2142
@@ -2202,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2202 if (!(data & 1)) 2241 if (!(data & 1))
2203 break; 2242 break;
2204 2243
2205 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, 2244 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
2206 &vcpu->arch.pv_time, data & ~1ULL, 2245 &vcpu->arch.pv_time, data & ~1ULL,
2207 sizeof(struct pvclock_vcpu_time_info))) 2246 sizeof(struct pvclock_vcpu_time_info)))
2208 vcpu->arch.pv_time_enabled = false; 2247 vcpu->arch.pv_time_enabled = false;
@@ -2223,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2223 if (data & KVM_STEAL_RESERVED_MASK) 2262 if (data & KVM_STEAL_RESERVED_MASK)
2224 return 1; 2263 return 1;
2225 2264
2226 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, 2265 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
2227 data & KVM_STEAL_VALID_BITS, 2266 data & KVM_STEAL_VALID_BITS,
2228 sizeof(struct kvm_steal_time))) 2267 sizeof(struct kvm_steal_time)))
2229 return 1; 2268 return 1;
@@ -2633,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2633 case KVM_CAP_DISABLE_QUIRKS: 2672 case KVM_CAP_DISABLE_QUIRKS:
2634 case KVM_CAP_SET_BOOT_CPU_ID: 2673 case KVM_CAP_SET_BOOT_CPU_ID:
2635 case KVM_CAP_SPLIT_IRQCHIP: 2674 case KVM_CAP_SPLIT_IRQCHIP:
2675 case KVM_CAP_IMMEDIATE_EXIT:
2636#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2676#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2637 case KVM_CAP_ASSIGN_DEV_IRQ: 2677 case KVM_CAP_ASSIGN_DEV_IRQ:
2638 case KVM_CAP_PCI_2_3: 2678 case KVM_CAP_PCI_2_3:
@@ -2836,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
2836 2876
2837 vcpu->arch.st.steal.preempted = 1; 2877 vcpu->arch.st.steal.preempted = 1;
2838 2878
2839 kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, 2879 kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
2840 &vcpu->arch.st.steal.preempted, 2880 &vcpu->arch.st.steal.preempted,
2841 offsetof(struct kvm_steal_time, preempted), 2881 offsetof(struct kvm_steal_time, preempted),
2842 sizeof(vcpu->arch.st.steal.preempted)); 2882 sizeof(vcpu->arch.st.steal.preempted));
@@ -2870,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2870static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2910static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2871 struct kvm_lapic_state *s) 2911 struct kvm_lapic_state *s)
2872{ 2912{
2873 if (vcpu->arch.apicv_active) 2913 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
2874 kvm_x86_ops->sync_pir_to_irr(vcpu); 2914 kvm_x86_ops->sync_pir_to_irr(vcpu);
2875 2915
2876 return kvm_apic_get_state(vcpu, s); 2916 return kvm_apic_get_state(vcpu, s);
@@ -3897,7 +3937,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3897 goto split_irqchip_unlock; 3937 goto split_irqchip_unlock;
3898 /* Pairs with irqchip_in_kernel. */ 3938 /* Pairs with irqchip_in_kernel. */
3899 smp_wmb(); 3939 smp_wmb();
3900 kvm->arch.irqchip_split = true; 3940 kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
3901 kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; 3941 kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
3902 r = 0; 3942 r = 0;
3903split_irqchip_unlock: 3943split_irqchip_unlock:
@@ -3960,40 +4000,41 @@ long kvm_arch_vm_ioctl(struct file *filp,
3960 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 4000 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3961 break; 4001 break;
3962 case KVM_CREATE_IRQCHIP: { 4002 case KVM_CREATE_IRQCHIP: {
3963 struct kvm_pic *vpic;
3964
3965 mutex_lock(&kvm->lock); 4003 mutex_lock(&kvm->lock);
4004
3966 r = -EEXIST; 4005 r = -EEXIST;
3967 if (kvm->arch.vpic) 4006 if (irqchip_in_kernel(kvm))
3968 goto create_irqchip_unlock; 4007 goto create_irqchip_unlock;
4008
3969 r = -EINVAL; 4009 r = -EINVAL;
3970 if (kvm->created_vcpus) 4010 if (kvm->created_vcpus)
3971 goto create_irqchip_unlock; 4011 goto create_irqchip_unlock;
3972 r = -ENOMEM; 4012
3973 vpic = kvm_create_pic(kvm); 4013 r = kvm_pic_init(kvm);
3974 if (vpic) { 4014 if (r)
3975 r = kvm_ioapic_init(kvm); 4015 goto create_irqchip_unlock;
3976 if (r) { 4016
3977 mutex_lock(&kvm->slots_lock); 4017 r = kvm_ioapic_init(kvm);
3978 kvm_destroy_pic(vpic); 4018 if (r) {
3979 mutex_unlock(&kvm->slots_lock); 4019 mutex_lock(&kvm->slots_lock);
3980 goto create_irqchip_unlock; 4020 kvm_pic_destroy(kvm);
3981 } 4021 mutex_unlock(&kvm->slots_lock);
3982 } else
3983 goto create_irqchip_unlock; 4022 goto create_irqchip_unlock;
4023 }
4024
3984 r = kvm_setup_default_irq_routing(kvm); 4025 r = kvm_setup_default_irq_routing(kvm);
3985 if (r) { 4026 if (r) {
3986 mutex_lock(&kvm->slots_lock); 4027 mutex_lock(&kvm->slots_lock);
3987 mutex_lock(&kvm->irq_lock); 4028 mutex_lock(&kvm->irq_lock);
3988 kvm_ioapic_destroy(kvm); 4029 kvm_ioapic_destroy(kvm);
3989 kvm_destroy_pic(vpic); 4030 kvm_pic_destroy(kvm);
3990 mutex_unlock(&kvm->irq_lock); 4031 mutex_unlock(&kvm->irq_lock);
3991 mutex_unlock(&kvm->slots_lock); 4032 mutex_unlock(&kvm->slots_lock);
3992 goto create_irqchip_unlock; 4033 goto create_irqchip_unlock;
3993 } 4034 }
3994 /* Write kvm->irq_routing before kvm->arch.vpic. */ 4035 /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
3995 smp_wmb(); 4036 smp_wmb();
3996 kvm->arch.vpic = vpic; 4037 kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
3997 create_irqchip_unlock: 4038 create_irqchip_unlock:
3998 mutex_unlock(&kvm->lock); 4039 mutex_unlock(&kvm->lock);
3999 break; 4040 break;
@@ -4029,7 +4070,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
4029 } 4070 }
4030 4071
4031 r = -ENXIO; 4072 r = -ENXIO;
4032 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) 4073 if (!irqchip_kernel(kvm))
4033 goto get_irqchip_out; 4074 goto get_irqchip_out;
4034 r = kvm_vm_ioctl_get_irqchip(kvm, chip); 4075 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
4035 if (r) 4076 if (r)
@@ -4053,7 +4094,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
4053 } 4094 }
4054 4095
4055 r = -ENXIO; 4096 r = -ENXIO;
4056 if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) 4097 if (!irqchip_kernel(kvm))
4057 goto set_irqchip_out; 4098 goto set_irqchip_out;
4058 r = kvm_vm_ioctl_set_irqchip(kvm, chip); 4099 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4059 if (r) 4100 if (r)
@@ -4462,6 +4503,21 @@ out:
4462} 4503}
4463EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); 4504EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4464 4505
4506static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4507 gpa_t gpa, bool write)
4508{
4509 /* For APIC access vmexit */
4510 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4511 return 1;
4512
4513 if (vcpu_match_mmio_gpa(vcpu, gpa)) {
4514 trace_vcpu_match_mmio(gva, gpa, write, true);
4515 return 1;
4516 }
4517
4518 return 0;
4519}
4520
4465static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, 4521static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4466 gpa_t *gpa, struct x86_exception *exception, 4522 gpa_t *gpa, struct x86_exception *exception,
4467 bool write) 4523 bool write)
@@ -4488,16 +4544,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4488 if (*gpa == UNMAPPED_GVA) 4544 if (*gpa == UNMAPPED_GVA)
4489 return -1; 4545 return -1;
4490 4546
4491 /* For APIC access vmexit */ 4547 return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
4492 if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4493 return 1;
4494
4495 if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4496 trace_vcpu_match_mmio(gva, *gpa, write, true);
4497 return 1;
4498 }
4499
4500 return 0;
4501} 4548}
4502 4549
4503int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 4550int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4594,6 +4641,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
4594 int handled, ret; 4641 int handled, ret;
4595 bool write = ops->write; 4642 bool write = ops->write;
4596 struct kvm_mmio_fragment *frag; 4643 struct kvm_mmio_fragment *frag;
4644 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4645
4646 /*
 4647	 * If the exit was due to an NPF we may already have a GPA.
 4648	 * If the GPA is present, use it to avoid the GVA to GPA table walk.
 4649	 * Note, this cannot be used on string operations, since a string
 4650	 * operation using rep will only have the initial GPA from the NPF
 4651	 * that occurred.
4652 */
4653 if (vcpu->arch.gpa_available &&
4654 emulator_can_use_gpa(ctxt) &&
4655 vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) &&
4656 (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) {
4657 gpa = exception->address;
4658 goto mmio;
4659 }
4597 4660
4598 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); 4661 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4599 4662
@@ -5610,6 +5673,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5610 } 5673 }
5611 5674
5612restart: 5675restart:
5676 /* Save the faulting GPA (cr2) in the address field */
5677 ctxt->exception.address = cr2;
5678
5613 r = x86_emulate_insn(ctxt); 5679 r = x86_emulate_insn(ctxt);
5614 5680
5615 if (r == EMULATION_INTERCEPTED) 5681 if (r == EMULATION_INTERCEPTED)
@@ -5924,9 +5990,6 @@ static void kvm_set_mmio_spte_mask(void)
5924 /* Mask the reserved physical address bits. */ 5990 /* Mask the reserved physical address bits. */
5925 mask = rsvd_bits(maxphyaddr, 51); 5991 mask = rsvd_bits(maxphyaddr, 51);
5926 5992
5927 /* Bit 62 is always reserved for 32bit host. */
5928 mask |= 0x3ull << 62;
5929
5930 /* Set the present bit. */ 5993 /* Set the present bit. */
5931 mask |= 1ull; 5994 mask |= 1ull;
5932 5995
@@ -6025,7 +6088,7 @@ int kvm_arch_init(void *opaque)
6025 6088
6026 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 6089 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
6027 PT_DIRTY_MASK, PT64_NX_MASK, 0, 6090 PT_DIRTY_MASK, PT64_NX_MASK, 0,
6028 PT_PRESENT_MASK); 6091 PT_PRESENT_MASK, 0);
6029 kvm_timer_init(); 6092 kvm_timer_init();
6030 6093
6031 perf_register_guest_info_callbacks(&kvm_guest_cbs); 6094 perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6087,6 +6150,35 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
6087} 6150}
6088EXPORT_SYMBOL_GPL(kvm_emulate_halt); 6151EXPORT_SYMBOL_GPL(kvm_emulate_halt);
6089 6152
6153#ifdef CONFIG_X86_64
6154static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
6155 unsigned long clock_type)
6156{
6157 struct kvm_clock_pairing clock_pairing;
6158 struct timespec ts;
6159 u64 cycle;
6160 int ret;
6161
6162 if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
6163 return -KVM_EOPNOTSUPP;
6164
6165 if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
6166 return -KVM_EOPNOTSUPP;
6167
6168 clock_pairing.sec = ts.tv_sec;
6169 clock_pairing.nsec = ts.tv_nsec;
6170 clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
6171 clock_pairing.flags = 0;
6172
6173 ret = 0;
6174 if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
6175 sizeof(struct kvm_clock_pairing)))
6176 ret = -KVM_EFAULT;
6177
6178 return ret;
6179}
6180#endif
6181
6090/* 6182/*
6091 * kvm_pv_kick_cpu_op: Kick a vcpu. 6183 * kvm_pv_kick_cpu_op: Kick a vcpu.
6092 * 6184 *
@@ -6151,6 +6243,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
6151 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); 6243 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
6152 ret = 0; 6244 ret = 0;
6153 break; 6245 break;
6246#ifdef CONFIG_X86_64
6247 case KVM_HC_CLOCK_PAIRING:
6248 ret = kvm_pv_clock_pairing(vcpu, a0, a1);
6249 break;
6250#endif
6154 default: 6251 default:
6155 ret = -KVM_ENOSYS; 6252 ret = -KVM_ENOSYS;
6156 break; 6253 break;
@@ -6564,7 +6661,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6564 if (irqchip_split(vcpu->kvm)) 6661 if (irqchip_split(vcpu->kvm))
6565 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); 6662 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
6566 else { 6663 else {
6567 if (vcpu->arch.apicv_active) 6664 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
6568 kvm_x86_ops->sync_pir_to_irr(vcpu); 6665 kvm_x86_ops->sync_pir_to_irr(vcpu);
6569 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); 6666 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
6570 } 6667 }
@@ -6655,10 +6752,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6655 r = 0; 6752 r = 0;
6656 goto out; 6753 goto out;
6657 } 6754 }
6658 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
6659 vcpu->fpu_active = 0;
6660 kvm_x86_ops->fpu_deactivate(vcpu);
6661 }
6662 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 6755 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
6663 /* Page is swapped out. Do synthetic halt */ 6756 /* Page is swapped out. Do synthetic halt */
6664 vcpu->arch.apf.halted = true; 6757 vcpu->arch.apf.halted = true;
@@ -6718,21 +6811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6718 kvm_hv_process_stimers(vcpu); 6811 kvm_hv_process_stimers(vcpu);
6719 } 6812 }
6720 6813
6721 /*
6722 * KVM_REQ_EVENT is not set when posted interrupts are set by
6723 * VT-d hardware, so we have to update RVI unconditionally.
6724 */
6725 if (kvm_lapic_enabled(vcpu)) {
6726 /*
6727 * Update architecture specific hints for APIC
6728 * virtual interrupt delivery.
6729 */
6730 if (vcpu->arch.apicv_active)
6731 kvm_x86_ops->hwapic_irr_update(vcpu,
6732 kvm_lapic_find_highest_irr(vcpu));
6733 }
6734
6735 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 6814 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
6815 ++vcpu->stat.req_event;
6736 kvm_apic_accept_events(vcpu); 6816 kvm_apic_accept_events(vcpu);
6737 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 6817 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
6738 r = 1; 6818 r = 1;
@@ -6773,22 +6853,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6773 preempt_disable(); 6853 preempt_disable();
6774 6854
6775 kvm_x86_ops->prepare_guest_switch(vcpu); 6855 kvm_x86_ops->prepare_guest_switch(vcpu);
6776 if (vcpu->fpu_active) 6856 kvm_load_guest_fpu(vcpu);
6777 kvm_load_guest_fpu(vcpu); 6857
6858 /*
6859 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
6860 * IPI are then delayed after guest entry, which ensures that they
6861 * result in virtual interrupt delivery.
6862 */
6863 local_irq_disable();
6778 vcpu->mode = IN_GUEST_MODE; 6864 vcpu->mode = IN_GUEST_MODE;
6779 6865
6780 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 6866 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6781 6867
6782 /* 6868 /*
6783 * We should set ->mode before check ->requests, 6869 * 1) We should set ->mode before checking ->requests. Please see
6784 * Please see the comment in kvm_make_all_cpus_request. 6870 * the comment in kvm_make_all_cpus_request.
6785 * This also orders the write to mode from any reads 6871 *
6786 * to the page tables done while the VCPU is running. 6872 * 2) For APICv, we should set ->mode before checking PIR.ON. This
6787 * Please see the comment in kvm_flush_remote_tlbs. 6873 * pairs with the memory barrier implicit in pi_test_and_set_on
6874 * (see vmx_deliver_posted_interrupt).
6875 *
6876 * 3) This also orders the write to mode from any reads to the page
6877 * tables done while the VCPU is running. Please see the comment
6878 * in kvm_flush_remote_tlbs.
6788 */ 6879 */
6789 smp_mb__after_srcu_read_unlock(); 6880 smp_mb__after_srcu_read_unlock();
6790 6881
6791 local_irq_disable(); 6882 /*
6883 * This handles the case where a posted interrupt was
6884 * notified with kvm_vcpu_kick.
6885 */
6886 if (kvm_lapic_enabled(vcpu)) {
6887 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
6888 kvm_x86_ops->sync_pir_to_irr(vcpu);
6889 }
6792 6890
6793 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests 6891 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
6794 || need_resched() || signal_pending(current)) { 6892 || need_resched() || signal_pending(current)) {
@@ -6927,6 +7025,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
6927 7025
6928static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 7026static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
6929{ 7027{
7028 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7029 kvm_x86_ops->check_nested_events(vcpu, false);
7030
6930 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 7031 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6931 !vcpu->arch.apf.halted); 7032 !vcpu->arch.apf.halted);
6932} 7033}
@@ -7098,7 +7199,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
7098 } else 7199 } else
7099 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 7200 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
7100 7201
7101 r = vcpu_run(vcpu); 7202 if (kvm_run->immediate_exit)
7203 r = -EINTR;
7204 else
7205 r = vcpu_run(vcpu);
7102 7206
7103out: 7207out:
7104 post_kvm_run_save(vcpu); 7208 post_kvm_run_save(vcpu);
@@ -8293,9 +8397,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
8293 8397
8294int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 8398int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
8295{ 8399{
8296 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
8297 kvm_x86_ops->check_nested_events(vcpu, false);
8298
8299 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 8400 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
8300} 8401}
8301 8402
@@ -8432,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
8432 8533
8433static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) 8534static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
8434{ 8535{
8435 8536 return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
8436 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, 8537 sizeof(val));
8437 sizeof(val));
8438} 8538}
8439 8539
8440void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 8540void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
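
The immediate_exit handling added above lets userspace ask a subsequent KVM_RUN to return -EINTR without entering the guest, as an alternative to kicking a vCPU with a pending signal. A minimal userspace sketch, assuming an already created vCPU whose struct kvm_run has been mmap()ed; the names vcpu_fd, run and run_vcpu_once are illustrative and not from this series:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /*
     * Hypothetical helper: another thread (or a signal handler) sets
     * run->immediate_exit to make the next KVM_RUN return -EINTR without
     * entering the guest; the caller clears the flag before each entry.
     */
    static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
    {
            run->immediate_exit = 0;
            if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                    return errno == EINTR ? 0 : -errno;
            return 0;
    }

A VMM would typically set run->immediate_exit from whatever path it already uses to kick vCPU threads, and clear it once the reason for the interruption has been handled.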
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index bdce33291161..384f661a6496 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -90,4 +90,16 @@ config PTP_1588_CLOCK_PCH
90 To compile this driver as a module, choose M here: the module 90 To compile this driver as a module, choose M here: the module
91 will be called ptp_pch. 91 will be called ptp_pch.
92 92
93config PTP_1588_CLOCK_KVM
94 tristate "KVM virtual PTP clock"
95 depends on PTP_1588_CLOCK
96 depends on KVM_GUEST && X86
97 default y
98 help
 99	  This driver adds support for using the KVM infrastructure as a
 100	  PTP clock. This clock is only useful inside a KVM guest.
101
102 To compile this driver as a module, choose M here: the module
103 will be called ptp_kvm.
104
93endmenu 105endmenu
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 8b58597298de..530736161a8b 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -6,3 +6,4 @@ ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o
6obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o 6obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o
7obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o 7obj-$(CONFIG_PTP_1588_CLOCK_IXP46X) += ptp_ixp46x.o
8obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o 8obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o
9obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
new file mode 100644
index 000000000000..09b4df74291e
--- /dev/null
+++ b/drivers/ptp/ptp_kvm.c
@@ -0,0 +1,207 @@
1/*
2 * Virtual PTP 1588 clock for use with KVM guests
3 *
4 * Copyright (C) 2017 Red Hat Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17#include <linux/device.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <uapi/linux/kvm_para.h>
23#include <asm/kvm_para.h>
24#include <asm/pvclock.h>
25#include <asm/kvmclock.h>
26#include <uapi/asm/kvm_para.h>
27
28#include <linux/ptp_clock_kernel.h>
29
30struct kvm_ptp_clock {
31 struct ptp_clock *ptp_clock;
32 struct ptp_clock_info caps;
33};
34
35DEFINE_SPINLOCK(kvm_ptp_lock);
36
37static struct pvclock_vsyscall_time_info *hv_clock;
38
39static struct kvm_clock_pairing clock_pair;
40static phys_addr_t clock_pair_gpa;
41
42static int ptp_kvm_get_time_fn(ktime_t *device_time,
43 struct system_counterval_t *system_counter,
44 void *ctx)
45{
46 unsigned long ret;
47 struct timespec64 tspec;
48 unsigned version;
49 int cpu;
50 struct pvclock_vcpu_time_info *src;
51
52 spin_lock(&kvm_ptp_lock);
53
54 preempt_disable_notrace();
55 cpu = smp_processor_id();
56 src = &hv_clock[cpu].pvti;
57
58 do {
59 /*
 60	 * We are using a TSC value read in the host's
61 * kvm_hc_clock_pairing handling.
62 * So any changes to tsc_to_system_mul
63 * and tsc_shift or any other pvclock
64 * data invalidate that measurement.
65 */
66 version = pvclock_read_begin(src);
67
68 ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
69 clock_pair_gpa,
70 KVM_CLOCK_PAIRING_WALLCLOCK);
71 if (ret != 0) {
72 pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
73 spin_unlock(&kvm_ptp_lock);
74 preempt_enable_notrace();
75 return -EOPNOTSUPP;
76 }
77
78 tspec.tv_sec = clock_pair.sec;
79 tspec.tv_nsec = clock_pair.nsec;
80 ret = __pvclock_read_cycles(src, clock_pair.tsc);
81 } while (pvclock_read_retry(src, version));
82
83 preempt_enable_notrace();
84
85 system_counter->cycles = ret;
86 system_counter->cs = &kvm_clock;
87
88 *device_time = timespec64_to_ktime(tspec);
89
90 spin_unlock(&kvm_ptp_lock);
91
92 return 0;
93}
94
95static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
96 struct system_device_crosststamp *xtstamp)
97{
98 return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
99 NULL, xtstamp);
100}
101
102/*
103 * PTP clock operations
104 */
105
106static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
107{
108 return -EOPNOTSUPP;
109}
110
111static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
112{
113 return -EOPNOTSUPP;
114}
115
116static int ptp_kvm_settime(struct ptp_clock_info *ptp,
117 const struct timespec64 *ts)
118{
119 return -EOPNOTSUPP;
120}
121
122static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
123{
124 unsigned long ret;
125 struct timespec64 tspec;
126
127 spin_lock(&kvm_ptp_lock);
128
129 ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
130 clock_pair_gpa,
131 KVM_CLOCK_PAIRING_WALLCLOCK);
132 if (ret != 0) {
133 pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
134 spin_unlock(&kvm_ptp_lock);
135 return -EOPNOTSUPP;
136 }
137
138 tspec.tv_sec = clock_pair.sec;
139 tspec.tv_nsec = clock_pair.nsec;
140 spin_unlock(&kvm_ptp_lock);
141
142 memcpy(ts, &tspec, sizeof(struct timespec64));
143
144 return 0;
145}
146
147static int ptp_kvm_enable(struct ptp_clock_info *ptp,
148 struct ptp_clock_request *rq, int on)
149{
150 return -EOPNOTSUPP;
151}
152
153static struct ptp_clock_info ptp_kvm_caps = {
154 .owner = THIS_MODULE,
155 .name = "KVM virtual PTP",
156 .max_adj = 0,
157 .n_ext_ts = 0,
158 .n_pins = 0,
159 .pps = 0,
160 .adjfreq = ptp_kvm_adjfreq,
161 .adjtime = ptp_kvm_adjtime,
162 .gettime64 = ptp_kvm_gettime,
163 .settime64 = ptp_kvm_settime,
164 .enable = ptp_kvm_enable,
165 .getcrosststamp = ptp_kvm_getcrosststamp,
166};
167
168/* module operations */
169
170static struct kvm_ptp_clock kvm_ptp_clock;
171
172static void __exit ptp_kvm_exit(void)
173{
174 ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
175}
176
177static int __init ptp_kvm_init(void)
178{
179 long ret;
180
181 clock_pair_gpa = slow_virt_to_phys(&clock_pair);
182 hv_clock = pvclock_pvti_cpu0_va();
183
184 if (!hv_clock)
185 return -ENODEV;
186
187 ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
188 KVM_CLOCK_PAIRING_WALLCLOCK);
189 if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
190 return -ENODEV;
191
192 kvm_ptp_clock.caps = ptp_kvm_caps;
193
194 kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
195
196 if (IS_ERR(kvm_ptp_clock.ptp_clock))
197 return PTR_ERR(kvm_ptp_clock.ptp_clock);
198
199 return 0;
200}
201
202module_init(ptp_kvm_init);
203module_exit(ptp_kvm_exit);
204
205MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
206MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
207MODULE_LICENSE("GPL");
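
On the guest side, the clock registered by ptp_kvm shows up as an ordinary /dev/ptpN device, so the usual PTP tooling (phc2sys, chrony) can consume it; the getcrosststamp path above is what backs the PTP_SYS_OFFSET_PRECISE ioctl. A small stand-alone sketch of reading one precise device/system timestamp pair; the device path /dev/ptp0 is an assumption and depends on probe order:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ptp_clock.h>

    int main(void)
    {
            struct ptp_sys_offset_precise xts = { 0 };
            int fd = open("/dev/ptp0", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* Ask the driver for a matched (device time, system time) pair. */
            if (ioctl(fd, PTP_SYS_OFFSET_PRECISE, &xts) == 0)
                    printf("device %lld.%09u  realtime %lld.%09u\n",
                           (long long)xts.device.sec, xts.device.nsec,
                           (long long)xts.sys_realtime.sec, xts.sys_realtime.nsec);
            close(fd);
            return 0;
    }
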
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 5c970ce67949..fe797d6ef89d 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -23,20 +23,24 @@
23#include <linux/hrtimer.h> 23#include <linux/hrtimer.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25 25
26struct arch_timer_kvm { 26struct arch_timer_context {
27 /* Registers: control register, timer value */
28 u32 cnt_ctl;
29 u64 cnt_cval;
30
31 /* Timer IRQ */
32 struct kvm_irq_level irq;
33
34 /* Active IRQ state caching */
35 bool active_cleared_last;
36
27 /* Virtual offset */ 37 /* Virtual offset */
28 u64 cntvoff; 38 u64 cntvoff;
29}; 39};
30 40
31struct arch_timer_cpu { 41struct arch_timer_cpu {
32 /* Registers: control register, timer value */ 42 struct arch_timer_context vtimer;
33 u32 cntv_ctl; /* Saved/restored */ 43 struct arch_timer_context ptimer;
34 u64 cntv_cval; /* Saved/restored */
35
36 /*
37 * Anything that is not used directly from assembly code goes
38 * here.
39 */
40 44
41 /* Background timer used when the guest is not running */ 45 /* Background timer used when the guest is not running */
42 struct hrtimer timer; 46 struct hrtimer timer;
@@ -47,21 +51,15 @@ struct arch_timer_cpu {
47 /* Background timer active */ 51 /* Background timer active */
48 bool armed; 52 bool armed;
49 53
50 /* Timer IRQ */
51 struct kvm_irq_level irq;
52
53 /* Active IRQ state caching */
54 bool active_cleared_last;
55
56 /* Is the timer enabled */ 54 /* Is the timer enabled */
57 bool enabled; 55 bool enabled;
58}; 56};
59 57
60int kvm_timer_hyp_init(void); 58int kvm_timer_hyp_init(void);
61int kvm_timer_enable(struct kvm_vcpu *vcpu); 59int kvm_timer_enable(struct kvm_vcpu *vcpu);
62void kvm_timer_init(struct kvm *kvm);
63int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, 60int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
64 const struct kvm_irq_level *irq); 61 const struct kvm_irq_level *virt_irq,
62 const struct kvm_irq_level *phys_irq);
65void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); 63void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
66void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); 64void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
67void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); 65void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -70,11 +68,16 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
70u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); 68u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
71int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); 69int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
72 70
73bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); 71bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
74void kvm_timer_schedule(struct kvm_vcpu *vcpu); 72void kvm_timer_schedule(struct kvm_vcpu *vcpu);
75void kvm_timer_unschedule(struct kvm_vcpu *vcpu); 73void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
76 74
75u64 kvm_phys_timer_read(void);
76
77void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu); 77void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu);
78 78
79void kvm_timer_init_vhe(void); 79void kvm_timer_init_vhe(void);
80
81#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer)
82#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer)
80#endif 83#endif
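
With the timer state split into per-context structures, callers now pick the virtual or physical timer explicitly through the accessors declared above. A rough sketch of the resulting calling pattern, using only the helpers in this header; example_timer_pending is a made-up name, not part of this series:

    #include <linux/kvm_host.h>
    #include <kvm/arm_arch_timer.h>

    /* Illustrative only: check whether either guest timer is due to fire. */
    static bool example_timer_pending(struct kvm_vcpu *vcpu)
    {
            return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
                   kvm_timer_should_fire(vcpu_ptimer(vcpu));
    }
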
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 002f0922cd92..b72dd2ad5f44 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -71,6 +71,8 @@ struct vgic_global {
71 71
72 /* GIC system register CPU interface */ 72 /* GIC system register CPU interface */
73 struct static_key_false gicv3_cpuif; 73 struct static_key_false gicv3_cpuif;
74
75 u32 ich_vtr_el2;
74}; 76};
75 77
76extern struct vgic_global kvm_vgic_global_state; 78extern struct vgic_global kvm_vgic_global_state;
@@ -101,9 +103,10 @@ struct vgic_irq {
101 */ 103 */
102 104
103 u32 intid; /* Guest visible INTID */ 105 u32 intid; /* Guest visible INTID */
104 bool pending;
105 bool line_level; /* Level only */ 106 bool line_level; /* Level only */
106 bool soft_pending; /* Level only */ 107 bool pending_latch; /* The pending latch state used to calculate
108 * the pending state for both level
109 * and edge triggered IRQs. */
107 bool active; /* not used for LPIs */ 110 bool active; /* not used for LPIs */
108 bool enabled; 111 bool enabled;
109 bool hw; /* Tied to HW IRQ */ 112 bool hw; /* Tied to HW IRQ */
@@ -165,6 +168,8 @@ struct vgic_its {
165 struct list_head collection_list; 168 struct list_head collection_list;
166}; 169};
167 170
171struct vgic_state_iter;
172
168struct vgic_dist { 173struct vgic_dist {
169 bool in_kernel; 174 bool in_kernel;
170 bool ready; 175 bool ready;
@@ -212,6 +217,9 @@ struct vgic_dist {
212 spinlock_t lpi_list_lock; 217 spinlock_t lpi_list_lock;
213 struct list_head lpi_list_head; 218 struct list_head lpi_list_head;
214 int lpi_list_count; 219 int lpi_list_count;
220
221 /* used by vgic-debug */
222 struct vgic_state_iter *iter;
215}; 223};
216 224
217struct vgic_v2_cpu_if { 225struct vgic_v2_cpu_if {
@@ -269,6 +277,12 @@ struct vgic_cpu {
269 u64 pendbaser; 277 u64 pendbaser;
270 278
271 bool lpis_enabled; 279 bool lpis_enabled;
280
281 /* Cache guest priority bits */
282 u32 num_pri_bits;
283
284 /* Cache guest interrupt ID bits */
285 u32 num_id_bits;
272}; 286};
273 287
274extern struct static_key_false vgic_v2_cpuif_trap; 288extern struct static_key_false vgic_v2_cpuif_trap;
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 725e86b506f3..672cfef72fc8 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -349,8 +349,30 @@
349/* 349/*
350 * CPU interface registers 350 * CPU interface registers
351 */ 351 */
352#define ICC_CTLR_EL1_EOImode_drop_dir (0U << 1) 352#define ICC_CTLR_EL1_EOImode_SHIFT (1)
353#define ICC_CTLR_EL1_EOImode_drop (1U << 1) 353#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT)
354#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT)
355#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT)
356#define ICC_CTLR_EL1_CBPR_SHIFT 0
357#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT)
358#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8
359#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT)
360#define ICC_CTLR_EL1_ID_BITS_SHIFT 11
361#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT)
362#define ICC_CTLR_EL1_SEIS_SHIFT 14
363#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT)
364#define ICC_CTLR_EL1_A3V_SHIFT 15
365#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT)
366#define ICC_PMR_EL1_SHIFT 0
367#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT)
368#define ICC_BPR0_EL1_SHIFT 0
369#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT)
370#define ICC_BPR1_EL1_SHIFT 0
371#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT)
372#define ICC_IGRPEN0_EL1_SHIFT 0
373#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT)
374#define ICC_IGRPEN1_EL1_SHIFT 0
375#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT)
354#define ICC_SRE_EL1_SRE (1U << 0) 376#define ICC_SRE_EL1_SRE (1U << 0)
355 377
356/* 378/*
@@ -379,14 +401,29 @@
379#define ICH_HCR_EN (1 << 0) 401#define ICH_HCR_EN (1 << 0)
380#define ICH_HCR_UIE (1 << 1) 402#define ICH_HCR_UIE (1 << 1)
381 403
382#define ICH_VMCR_CTLR_SHIFT 0 404#define ICH_VMCR_CBPR_SHIFT 4
383#define ICH_VMCR_CTLR_MASK (0x21f << ICH_VMCR_CTLR_SHIFT) 405#define ICH_VMCR_CBPR_MASK (1 << ICH_VMCR_CBPR_SHIFT)
406#define ICH_VMCR_EOIM_SHIFT 9
407#define ICH_VMCR_EOIM_MASK (1 << ICH_VMCR_EOIM_SHIFT)
384#define ICH_VMCR_BPR1_SHIFT 18 408#define ICH_VMCR_BPR1_SHIFT 18
385#define ICH_VMCR_BPR1_MASK (7 << ICH_VMCR_BPR1_SHIFT) 409#define ICH_VMCR_BPR1_MASK (7 << ICH_VMCR_BPR1_SHIFT)
386#define ICH_VMCR_BPR0_SHIFT 21 410#define ICH_VMCR_BPR0_SHIFT 21
387#define ICH_VMCR_BPR0_MASK (7 << ICH_VMCR_BPR0_SHIFT) 411#define ICH_VMCR_BPR0_MASK (7 << ICH_VMCR_BPR0_SHIFT)
388#define ICH_VMCR_PMR_SHIFT 24 412#define ICH_VMCR_PMR_SHIFT 24
389#define ICH_VMCR_PMR_MASK (0xffUL << ICH_VMCR_PMR_SHIFT) 413#define ICH_VMCR_PMR_MASK (0xffUL << ICH_VMCR_PMR_SHIFT)
414#define ICH_VMCR_ENG0_SHIFT 0
415#define ICH_VMCR_ENG0_MASK (1 << ICH_VMCR_ENG0_SHIFT)
416#define ICH_VMCR_ENG1_SHIFT 1
417#define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT)
418
419#define ICH_VTR_PRI_BITS_SHIFT 29
420#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT)
421#define ICH_VTR_ID_BITS_SHIFT 23
422#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT)
423#define ICH_VTR_SEIS_SHIFT 22
424#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT)
425#define ICH_VTR_A3V_SHIFT 21
426#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT)
390 427
391#define ICC_IAR1_EL1_SPURIOUS 0x3ff 428#define ICC_IAR1_EL1_SPURIOUS 0x3ff
392 429
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c5190dab2c1..8d69d5150748 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -45,7 +45,6 @@
45 * include/linux/kvm_h. 45 * include/linux/kvm_h.
46 */ 46 */
47#define KVM_MEMSLOT_INVALID (1UL << 16) 47#define KVM_MEMSLOT_INVALID (1UL << 16)
48#define KVM_MEMSLOT_INCOHERENT (1UL << 17)
49 48
50/* Two fragments for cross MMIO pages. */ 49/* Two fragments for cross MMIO pages. */
51#define KVM_MAX_MMIO_FRAGMENTS 2 50#define KVM_MAX_MMIO_FRAGMENTS 2
@@ -222,7 +221,6 @@ struct kvm_vcpu {
222 struct mutex mutex; 221 struct mutex mutex;
223 struct kvm_run *run; 222 struct kvm_run *run;
224 223
225 int fpu_active;
226 int guest_fpu_loaded, guest_xcr0_loaded; 224 int guest_fpu_loaded, guest_xcr0_loaded;
227 struct swait_queue_head wq; 225 struct swait_queue_head wq;
228 struct pid *pid; 226 struct pid *pid;
@@ -642,18 +640,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
642int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 640int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
643 unsigned long len); 641 unsigned long len);
644int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); 642int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
645int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 643int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
646 void *data, unsigned long len); 644 void *data, unsigned long len);
647int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 645int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
648 int offset, int len); 646 int offset, int len);
649int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 647int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
650 unsigned long len); 648 unsigned long len);
651int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 649int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
652 void *data, unsigned long len); 650 void *data, unsigned long len);
653int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 651int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
654 void *data, int offset, unsigned long len); 652 void *data, int offset, unsigned long len);
655int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 653int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
656 gpa_t gpa, unsigned long len); 654 gpa_t gpa, unsigned long len);
657int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); 655int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
658int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); 656int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
659struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); 657struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
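
The header change above converts the cached guest-write helpers from taking a struct kvm to taking a struct kvm_vcpu, matching the x86 call-site conversions earlier in this diff. A hedged sketch of the resulting calling pattern, using only the signatures declared here; example_cached_write, the cache pointer and the value written are invented for illustration:

    #include <linux/kvm_host.h>

    /* Illustrative only: bind a cache to a GPA once, then write through it. */
    static int example_cached_write(struct kvm_vcpu *vcpu,
                                    struct gfn_to_hva_cache *ghc, gpa_t gpa)
    {
            u32 value = 0;
            int r;

            r = kvm_vcpu_gfn_to_hva_cache_init(vcpu, ghc, gpa, sizeof(value));
            if (r)
                    return r;

            return kvm_vcpu_write_guest_cached(vcpu, ghc, &value, sizeof(value));
    }
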
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e0035808c814..f51d5082a377 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
218struct kvm_run { 218struct kvm_run {
219 /* in */ 219 /* in */
220 __u8 request_interrupt_window; 220 __u8 request_interrupt_window;
221 __u8 padding1[7]; 221 __u8 immediate_exit;
222 __u8 padding1[6];
222 223
223 /* out */ 224 /* out */
224 __u32 exit_reason; 225 __u32 exit_reason;
@@ -685,6 +686,13 @@ struct kvm_ppc_smmu_info {
685 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; 686 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
686}; 687};
687 688
689/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
690struct kvm_ppc_resize_hpt {
691 __u64 flags;
692 __u32 shift;
693 __u32 pad;
694};
695
688#define KVMIO 0xAE 696#define KVMIO 0xAE
689 697
690/* machine type bits, to be used as argument to KVM_CREATE_VM */ 698/* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -871,8 +879,10 @@ struct kvm_ppc_smmu_info {
871#define KVM_CAP_S390_USER_INSTR0 130 879#define KVM_CAP_S390_USER_INSTR0 130
872#define KVM_CAP_MSI_DEVID 131 880#define KVM_CAP_MSI_DEVID 131
873#define KVM_CAP_PPC_HTM 132 881#define KVM_CAP_PPC_HTM 132
882#define KVM_CAP_SPAPR_RESIZE_HPT 133
874#define KVM_CAP_PPC_MMU_RADIX 134 883#define KVM_CAP_PPC_MMU_RADIX 134
875#define KVM_CAP_PPC_MMU_HASH_V3 135 884#define KVM_CAP_PPC_MMU_HASH_V3 135
885#define KVM_CAP_IMMEDIATE_EXIT 136
876 886
877#ifdef KVM_CAP_IRQ_ROUTING 887#ifdef KVM_CAP_IRQ_ROUTING
878 888
@@ -1189,6 +1199,9 @@ struct kvm_s390_ucas_mapping {
1189#define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) 1199#define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr)
1190/* Available with KVM_CAP_PPC_RTAS */ 1200/* Available with KVM_CAP_PPC_RTAS */
1191#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) 1201#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args)
1202/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
1203#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
1204#define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
1192/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ 1205/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
1193#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) 1206#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg)
1194/* Available with KVM_CAP_PPC_RADIX_MMU */ 1207/* Available with KVM_CAP_PPC_RADIX_MMU */
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index bf6cd7d5cac2..fed506aeff62 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -14,6 +14,7 @@
14#define KVM_EFAULT EFAULT 14#define KVM_EFAULT EFAULT
15#define KVM_E2BIG E2BIG 15#define KVM_E2BIG E2BIG
16#define KVM_EPERM EPERM 16#define KVM_EPERM EPERM
17#define KVM_EOPNOTSUPP 95
17 18
18#define KVM_HC_VAPIC_POLL_IRQ 1 19#define KVM_HC_VAPIC_POLL_IRQ 1
19#define KVM_HC_MMU_OP 2 20#define KVM_HC_MMU_OP 2
@@ -23,6 +24,7 @@
23#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 24#define KVM_HC_MIPS_GET_CLOCK_FREQ 6
24#define KVM_HC_MIPS_EXIT_VM 7 25#define KVM_HC_MIPS_EXIT_VM 7
25#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 26#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
27#define KVM_HC_CLOCK_PAIRING 9
26 28
27/* 29/*
28 * hypercalls use architecture specific 30 * hypercalls use architecture specific
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 6a084cd57b88..35d7100e0815 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -37,10 +37,10 @@ static u32 host_vtimer_irq_flags;
37 37
38void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 38void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
39{ 39{
40 vcpu->arch.timer_cpu.active_cleared_last = false; 40 vcpu_vtimer(vcpu)->active_cleared_last = false;
41} 41}
42 42
43static u64 kvm_phys_timer_read(void) 43u64 kvm_phys_timer_read(void)
44{ 44{
45 return timecounter->cc->read(timecounter->cc); 45 return timecounter->cc->read(timecounter->cc);
46} 46}
@@ -98,12 +98,12 @@ static void kvm_timer_inject_irq_work(struct work_struct *work)
98 kvm_vcpu_kick(vcpu); 98 kvm_vcpu_kick(vcpu);
99} 99}
100 100
101static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu) 101static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
102{ 102{
103 u64 cval, now; 103 u64 cval, now;
104 104
105 cval = vcpu->arch.timer_cpu.cntv_cval; 105 cval = timer_ctx->cnt_cval;
106 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; 106 now = kvm_phys_timer_read() - timer_ctx->cntvoff;
107 107
108 if (now < cval) { 108 if (now < cval) {
109 u64 ns; 109 u64 ns;
@@ -118,6 +118,35 @@ static u64 kvm_timer_compute_delta(struct kvm_vcpu *vcpu)
118 return 0; 118 return 0;
119} 119}
120 120
121static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
122{
123 return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
124 (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
125}
126
127/*
128 * Returns the earliest expiration time in ns among guest timers.
 129 * Note that it will return 0 if none of the timers can fire.
130 */
131static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
132{
133 u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX;
134 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
135 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
136
137 if (kvm_timer_irq_can_fire(vtimer))
138 min_virt = kvm_timer_compute_delta(vtimer);
139
140 if (kvm_timer_irq_can_fire(ptimer))
141 min_phys = kvm_timer_compute_delta(ptimer);
142
 143	 /* If none of the timers can fire, then return 0 */
144 if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX))
145 return 0;
146
147 return min(min_virt, min_phys);
148}
149
121static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) 150static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
122{ 151{
123 struct arch_timer_cpu *timer; 152 struct arch_timer_cpu *timer;
@@ -132,7 +161,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
132 * PoV (NTP on the host may have forced it to expire 161 * PoV (NTP on the host may have forced it to expire
133 * early). If we should have slept longer, restart it. 162 * early). If we should have slept longer, restart it.
134 */ 163 */
135 ns = kvm_timer_compute_delta(vcpu); 164 ns = kvm_timer_earliest_exp(vcpu);
136 if (unlikely(ns)) { 165 if (unlikely(ns)) {
137 hrtimer_forward_now(hrt, ns_to_ktime(ns)); 166 hrtimer_forward_now(hrt, ns_to_ktime(ns));
138 return HRTIMER_RESTART; 167 return HRTIMER_RESTART;
@@ -142,42 +171,33 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
142 return HRTIMER_NORESTART; 171 return HRTIMER_NORESTART;
143} 172}
144 173
145static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) 174bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
146{
147 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
148
149 return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
150 (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
151}
152
153bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
154{ 175{
155 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
156 u64 cval, now; 176 u64 cval, now;
157 177
158 if (!kvm_timer_irq_can_fire(vcpu)) 178 if (!kvm_timer_irq_can_fire(timer_ctx))
159 return false; 179 return false;
160 180
161 cval = timer->cntv_cval; 181 cval = timer_ctx->cnt_cval;
162 now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; 182 now = kvm_phys_timer_read() - timer_ctx->cntvoff;
163 183
164 return cval <= now; 184 return cval <= now;
165} 185}
166 186
167static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) 187static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
188 struct arch_timer_context *timer_ctx)
168{ 189{
169 int ret; 190 int ret;
170 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
171 191
172 BUG_ON(!vgic_initialized(vcpu->kvm)); 192 BUG_ON(!vgic_initialized(vcpu->kvm));
173 193
174 timer->active_cleared_last = false; 194 timer_ctx->active_cleared_last = false;
175 timer->irq.level = new_level; 195 timer_ctx->irq.level = new_level;
176 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->irq.irq, 196 trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
177 timer->irq.level); 197 timer_ctx->irq.level);
178 ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, 198
179 timer->irq.irq, 199 ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq,
180 timer->irq.level); 200 timer_ctx->irq.level);
181 WARN_ON(ret); 201 WARN_ON(ret);
182} 202}
183 203
@@ -188,22 +208,43 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
188static int kvm_timer_update_state(struct kvm_vcpu *vcpu) 208static int kvm_timer_update_state(struct kvm_vcpu *vcpu)
189{ 209{
190 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 210 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
211 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
212 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
191 213
192 /* 214 /*
193 * If userspace modified the timer registers via SET_ONE_REG before 215 * If userspace modified the timer registers via SET_ONE_REG before
194 * the vgic was initialized, we mustn't set the timer->irq.level value 216 * the vgic was initialized, we mustn't set the vtimer->irq.level value
195 * because the guest would never see the interrupt. Instead wait 217 * because the guest would never see the interrupt. Instead wait
196 * until we call this function from kvm_timer_flush_hwstate. 218 * until we call this function from kvm_timer_flush_hwstate.
197 */ 219 */
198 if (!vgic_initialized(vcpu->kvm) || !timer->enabled) 220 if (!vgic_initialized(vcpu->kvm) || !timer->enabled)
199 return -ENODEV; 221 return -ENODEV;
200 222
201 if (kvm_timer_should_fire(vcpu) != timer->irq.level) 223 if (kvm_timer_should_fire(vtimer) != vtimer->irq.level)
202 kvm_timer_update_irq(vcpu, !timer->irq.level); 224 kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer);
225
226 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
227 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
203 228
204 return 0; 229 return 0;
205} 230}
206 231
232/* Schedule the background timer for the emulated timer. */
233static void kvm_timer_emulate(struct kvm_vcpu *vcpu,
234 struct arch_timer_context *timer_ctx)
235{
236 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
237
238 if (kvm_timer_should_fire(timer_ctx))
239 return;
240
241 if (!kvm_timer_irq_can_fire(timer_ctx))
242 return;
243
244 /* The timer has not yet expired, schedule a background timer */
245 timer_arm(timer, kvm_timer_compute_delta(timer_ctx));
246}
247
207/* 248/*
208 * Schedule the background timer before calling kvm_vcpu_block, so that this 249 * Schedule the background timer before calling kvm_vcpu_block, so that this
209 * thread is removed from its waitqueue and made runnable when there's a timer 250 * thread is removed from its waitqueue and made runnable when there's a timer
@@ -212,26 +253,31 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu)
212void kvm_timer_schedule(struct kvm_vcpu *vcpu) 253void kvm_timer_schedule(struct kvm_vcpu *vcpu)
213{ 254{
214 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 255 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
256 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
257 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
215 258
216 BUG_ON(timer_is_armed(timer)); 259 BUG_ON(timer_is_armed(timer));
217 260
218 /* 261 /*
219 * No need to schedule a background timer if the guest timer has 262 * No need to schedule a background timer if any guest timer has
220 * already expired, because kvm_vcpu_block will return before putting 263 * already expired, because kvm_vcpu_block will return before putting
221 * the thread to sleep. 264 * the thread to sleep.
222 */ 265 */
223 if (kvm_timer_should_fire(vcpu)) 266 if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer))
224 return; 267 return;
225 268
226 /* 269 /*
227 * If the timer is not capable of raising interrupts (disabled or 270 * If both timers are not capable of raising interrupts (disabled or
228 * masked), then there's no more work for us to do. 271 * masked), then there's no more work for us to do.
229 */ 272 */
230 if (!kvm_timer_irq_can_fire(vcpu)) 273 if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer))
231 return; 274 return;
232 275
233 /* The timer has not yet expired, schedule a background timer */ 276 /*
234 timer_arm(timer, kvm_timer_compute_delta(vcpu)); 277 * The guest timers have not yet expired, schedule a background timer.
278 * Set the earliest expiration time among the guest timers.
279 */
280 timer_arm(timer, kvm_timer_earliest_exp(vcpu));
235} 281}
236 282
237void kvm_timer_unschedule(struct kvm_vcpu *vcpu) 283void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
@@ -249,13 +295,16 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
249 */ 295 */
250void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) 296void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
251{ 297{
252 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 298 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
253 bool phys_active; 299 bool phys_active;
254 int ret; 300 int ret;
255 301
256 if (kvm_timer_update_state(vcpu)) 302 if (kvm_timer_update_state(vcpu))
257 return; 303 return;
258 304
305 /* Set the background timer for the physical timer emulation. */
306 kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu));
307
259 /* 308 /*
260 * If we enter the guest with the virtual input level to the VGIC 309 * If we enter the guest with the virtual input level to the VGIC
261 * asserted, then we have already told the VGIC what we need to, and 310 * asserted, then we have already told the VGIC what we need to, and
@@ -273,8 +322,8 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
273 * to ensure that hardware interrupts from the timer triggers a guest 322 * to ensure that hardware interrupts from the timer triggers a guest
274 * exit. 323 * exit.
275 */ 324 */
276 phys_active = timer->irq.level || 325 phys_active = vtimer->irq.level ||
277 kvm_vgic_map_is_active(vcpu, timer->irq.irq); 326 kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
278 327
279 /* 328 /*
280 * We want to avoid hitting the (re)distributor as much as 329 * We want to avoid hitting the (re)distributor as much as
@@ -296,7 +345,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
296 * - cached value is "active clear" 345 * - cached value is "active clear"
297 * - value to be programmed is "active clear" 346 * - value to be programmed is "active clear"
298 */ 347 */
299 if (timer->active_cleared_last && !phys_active) 348 if (vtimer->active_cleared_last && !phys_active)
300 return; 349 return;
301 350
302 ret = irq_set_irqchip_state(host_vtimer_irq, 351 ret = irq_set_irqchip_state(host_vtimer_irq,
@@ -304,7 +353,7 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
304 phys_active); 353 phys_active);
305 WARN_ON(ret); 354 WARN_ON(ret);
306 355
307 timer->active_cleared_last = !phys_active; 356 vtimer->active_cleared_last = !phys_active;
308} 357}
309 358
310/** 359/**
@@ -318,7 +367,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
318{ 367{
319 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 368 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
320 369
321 BUG_ON(timer_is_armed(timer)); 370 /*
371 * This is to cancel the background timer for the physical timer
372 * emulation if it is set.
373 */
374 timer_disarm(timer);
322 375
323 /* 376 /*
324 * The guest could have modified the timer registers or the timer 377 * The guest could have modified the timer registers or the timer
@@ -328,9 +381,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
328} 381}
329 382
330int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, 383int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
331 const struct kvm_irq_level *irq) 384 const struct kvm_irq_level *virt_irq,
385 const struct kvm_irq_level *phys_irq)
332{ 386{
333 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 387 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
388 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
334 389
335 /* 390 /*
336 * The vcpu timer irq number cannot be determined in 391 * The vcpu timer irq number cannot be determined in
@@ -338,7 +393,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
338 * kvm_vcpu_set_target(). To handle this, we determine 393 * kvm_vcpu_set_target(). To handle this, we determine
339 * vcpu timer irq number when the vcpu is reset. 394 * vcpu timer irq number when the vcpu is reset.
340 */ 395 */
341 timer->irq.irq = irq->irq; 396 vtimer->irq.irq = virt_irq->irq;
397 ptimer->irq.irq = phys_irq->irq;
342 398
343 /* 399 /*
344 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 400 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -346,16 +402,40 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
346 * resets the timer to be disabled and unmasked and is compliant with 402 * resets the timer to be disabled and unmasked and is compliant with
347 * the ARMv7 architecture. 403 * the ARMv7 architecture.
348 */ 404 */
349 timer->cntv_ctl = 0; 405 vtimer->cnt_ctl = 0;
406 ptimer->cnt_ctl = 0;
350 kvm_timer_update_state(vcpu); 407 kvm_timer_update_state(vcpu);
351 408
352 return 0; 409 return 0;
353} 410}
354 411
412/* Make the updates of cntvoff for all vtimer contexts atomic */
413static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
414{
415 int i;
416 struct kvm *kvm = vcpu->kvm;
417 struct kvm_vcpu *tmp;
418
419 mutex_lock(&kvm->lock);
420 kvm_for_each_vcpu(i, tmp, kvm)
421 vcpu_vtimer(tmp)->cntvoff = cntvoff;
422
423 /*
424 * When called from the vcpu create path, the CPU being created is not
425 * included in the loop above, so we just set it here as well.
426 */
427 vcpu_vtimer(vcpu)->cntvoff = cntvoff;
428 mutex_unlock(&kvm->lock);
429}
430
355void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) 431void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
356{ 432{
357 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 433 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
358 434
435 /* Synchronize cntvoff across all vtimers of a VM. */
436 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
437 vcpu_ptimer(vcpu)->cntvoff = 0;
438
359 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); 439 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
360 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 440 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
361 timer->timer.function = kvm_timer_expire; 441 timer->timer.function = kvm_timer_expire;
@@ -368,17 +448,17 @@ static void kvm_timer_init_interrupt(void *info)
368 448
369int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) 449int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
370{ 450{
371 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 451 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
372 452
373 switch (regid) { 453 switch (regid) {
374 case KVM_REG_ARM_TIMER_CTL: 454 case KVM_REG_ARM_TIMER_CTL:
375 timer->cntv_ctl = value; 455 vtimer->cnt_ctl = value;
376 break; 456 break;
377 case KVM_REG_ARM_TIMER_CNT: 457 case KVM_REG_ARM_TIMER_CNT:
378 vcpu->kvm->arch.timer.cntvoff = kvm_phys_timer_read() - value; 458 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
379 break; 459 break;
380 case KVM_REG_ARM_TIMER_CVAL: 460 case KVM_REG_ARM_TIMER_CVAL:
381 timer->cntv_cval = value; 461 vtimer->cnt_cval = value;
382 break; 462 break;
383 default: 463 default:
384 return -1; 464 return -1;
@@ -390,15 +470,15 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
390 470
391u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) 471u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
392{ 472{
393 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 473 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
394 474
395 switch (regid) { 475 switch (regid) {
396 case KVM_REG_ARM_TIMER_CTL: 476 case KVM_REG_ARM_TIMER_CTL:
397 return timer->cntv_ctl; 477 return vtimer->cnt_ctl;
398 case KVM_REG_ARM_TIMER_CNT: 478 case KVM_REG_ARM_TIMER_CNT:
399 return kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; 479 return kvm_phys_timer_read() - vtimer->cntvoff;
400 case KVM_REG_ARM_TIMER_CVAL: 480 case KVM_REG_ARM_TIMER_CVAL:
401 return timer->cntv_cval; 481 return vtimer->cnt_cval;
402 } 482 }
403 return (u64)-1; 483 return (u64)-1;
404} 484}
@@ -462,14 +542,16 @@ int kvm_timer_hyp_init(void)
462void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) 542void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
463{ 543{
464 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 544 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
545 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
465 546
466 timer_disarm(timer); 547 timer_disarm(timer);
467 kvm_vgic_unmap_phys_irq(vcpu, timer->irq.irq); 548 kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
468} 549}
469 550
470int kvm_timer_enable(struct kvm_vcpu *vcpu) 551int kvm_timer_enable(struct kvm_vcpu *vcpu)
471{ 552{
472 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 553 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
554 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
473 struct irq_desc *desc; 555 struct irq_desc *desc;
474 struct irq_data *data; 556 struct irq_data *data;
475 int phys_irq; 557 int phys_irq;
@@ -497,7 +579,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
497 * Tell the VGIC that the virtual interrupt is tied to a 579 * Tell the VGIC that the virtual interrupt is tied to a
498 * physical interrupt. We do that once per VCPU. 580 * physical interrupt. We do that once per VCPU.
499 */ 581 */
500 ret = kvm_vgic_map_phys_irq(vcpu, timer->irq.irq, phys_irq); 582 ret = kvm_vgic_map_phys_irq(vcpu, vtimer->irq.irq, phys_irq);
501 if (ret) 583 if (ret)
502 return ret; 584 return ret;
503 585
@@ -506,11 +588,6 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
506 return 0; 588 return 0;
507} 589}
508 590
509void kvm_timer_init(struct kvm *kvm)
510{
511 kvm->arch.timer.cntvoff = kvm_phys_timer_read();
512}
513
514/* 591/*
515 * On VHE system, we only need to configure trap on physical timer and counter 592 * On VHE system, we only need to configure trap on physical timer and counter
516 * accesses in EL0 and EL1 once, not for every world switch. 593 * accesses in EL0 and EL1 once, not for every world switch.
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
index 63e28dd18bb0..4734915ab71f 100644
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ b/virt/kvm/arm/hyp/timer-sr.c
@@ -25,11 +25,12 @@
25void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) 25void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
26{ 26{
27 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 27 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
28 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
28 u64 val; 29 u64 val;
29 30
30 if (timer->enabled) { 31 if (timer->enabled) {
31 timer->cntv_ctl = read_sysreg_el0(cntv_ctl); 32 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
32 timer->cntv_cval = read_sysreg_el0(cntv_cval); 33 vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
33 } 34 }
34 35
35 /* Disable the virtual timer */ 36 /* Disable the virtual timer */
@@ -52,8 +53,8 @@ void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
52 53
53void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) 54void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
54{ 55{
55 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
56 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 56 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
57 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
57 u64 val; 58 u64 val;
58 59
59 /* Those bits are already configured at boot on VHE-system */ 60 /* Those bits are already configured at boot on VHE-system */
@@ -69,9 +70,9 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
69 } 70 }
70 71
71 if (timer->enabled) { 72 if (timer->enabled) {
72 write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2); 73 write_sysreg(vtimer->cntvoff, cntvoff_el2);
73 write_sysreg_el0(timer->cntv_cval, cntv_cval); 74 write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
74 isb(); 75 isb();
75 write_sysreg_el0(timer->cntv_ctl, cntv_ctl); 76 write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
76 } 77 }
77} 78}
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
new file mode 100644
index 000000000000..7072ab743332
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -0,0 +1,283 @@
1/*
2 * Copyright (C) 2016 Linaro
3 * Author: Christoffer Dall <christoffer.dall@linaro.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18#include <linux/cpu.h>
19#include <linux/debugfs.h>
20#include <linux/interrupt.h>
21#include <linux/kvm_host.h>
22#include <linux/seq_file.h>
23#include <kvm/arm_vgic.h>
24#include <asm/kvm_mmu.h>
25#include "vgic.h"
26
27/*
28 * Structure to control looping through the entire vgic state. We start at
29 * zero for each field and move upwards. So, if dist_id is 0 we print the
30 * distributor info. When dist_id is 1, we have already printed it and move
31 * on.
32 *
33 * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
34 * so on.
35 */
36struct vgic_state_iter {
37 int nr_cpus;
38 int nr_spis;
39 int dist_id;
40 int vcpu_id;
41 int intid;
42};
43
44static void iter_next(struct vgic_state_iter *iter)
45{
46 if (iter->dist_id == 0) {
47 iter->dist_id++;
48 return;
49 }
50
51 iter->intid++;
52 if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
53 ++iter->vcpu_id < iter->nr_cpus)
54 iter->intid = 0;
55}
56
57static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
58 loff_t pos)
59{
60 int nr_cpus = atomic_read(&kvm->online_vcpus);
61
62 memset(iter, 0, sizeof(*iter));
63
64 iter->nr_cpus = nr_cpus;
65 iter->nr_spis = kvm->arch.vgic.nr_spis;
66
67 /* Fast forward to the right position if needed */
68 while (pos--)
69 iter_next(iter);
70}
71
72static bool end_of_vgic(struct vgic_state_iter *iter)
73{
74 return iter->dist_id > 0 &&
75 iter->vcpu_id == iter->nr_cpus &&
76 (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
77}
78
79static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
80{
81 struct kvm *kvm = (struct kvm *)s->private;
82 struct vgic_state_iter *iter;
83
84 mutex_lock(&kvm->lock);
85 iter = kvm->arch.vgic.iter;
86 if (iter) {
87 iter = ERR_PTR(-EBUSY);
88 goto out;
89 }
90
91 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
92 if (!iter) {
93 iter = ERR_PTR(-ENOMEM);
94 goto out;
95 }
96
97 iter_init(kvm, iter, *pos);
98 kvm->arch.vgic.iter = iter;
99
100 if (end_of_vgic(iter))
101 iter = NULL;
102out:
103 mutex_unlock(&kvm->lock);
104 return iter;
105}
106
107static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
108{
109 struct kvm *kvm = (struct kvm *)s->private;
110 struct vgic_state_iter *iter = kvm->arch.vgic.iter;
111
112 ++*pos;
113 iter_next(iter);
114 if (end_of_vgic(iter))
115 iter = NULL;
116 return iter;
117}
118
119static void vgic_debug_stop(struct seq_file *s, void *v)
120{
121 struct kvm *kvm = (struct kvm *)s->private;
122 struct vgic_state_iter *iter;
123
124 /*
	 125	 * If the seq file wasn't properly opened, there's nothing to clean
	 126	 * up.
127 */
128 if (IS_ERR(v))
129 return;
130
131 mutex_lock(&kvm->lock);
132 iter = kvm->arch.vgic.iter;
133 kfree(iter);
134 kvm->arch.vgic.iter = NULL;
135 mutex_unlock(&kvm->lock);
136}
137
138static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
139{
140 seq_printf(s, "Distributor\n");
141 seq_printf(s, "===========\n");
142 seq_printf(s, "vgic_model:\t%s\n",
143 (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) ?
144 "GICv3" : "GICv2");
145 seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
146 seq_printf(s, "enabled:\t%d\n", dist->enabled);
147 seq_printf(s, "\n");
148
149 seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
150 seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
151}
152
153static void print_header(struct seq_file *s, struct vgic_irq *irq,
154 struct kvm_vcpu *vcpu)
155{
156 int id = 0;
157 char *hdr = "SPI ";
158
159 if (vcpu) {
160 hdr = "VCPU";
161 id = vcpu->vcpu_id;
162 }
163
164 seq_printf(s, "\n");
165 seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHC HWID TARGET SRC PRI VCPU_ID\n", hdr, id);
166 seq_printf(s, "---------------------------------------------------------------\n");
167}
168
169static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
170 struct kvm_vcpu *vcpu)
171{
172 char *type;
173 if (irq->intid < VGIC_NR_SGIS)
174 type = "SGI";
175 else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
176 type = "PPI";
177 else
178 type = "SPI";
179
	 180	if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
181 print_header(s, irq, vcpu);
182
183 seq_printf(s, " %s %4d "
184 " %2d "
185 "%d%d%d%d%d%d "
186 "%8d "
187 "%8x "
188 " %2x "
189 "%3d "
190 " %2d "
191 "\n",
192 type, irq->intid,
193 (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
194 irq->pending_latch,
195 irq->line_level,
196 irq->active,
197 irq->enabled,
198 irq->hw,
199 irq->config == VGIC_CONFIG_LEVEL,
200 irq->hwintid,
201 irq->mpidr,
202 irq->source,
203 irq->priority,
204 (irq->vcpu) ? irq->vcpu->vcpu_id : -1);
205
206}
207
208static int vgic_debug_show(struct seq_file *s, void *v)
209{
210 struct kvm *kvm = (struct kvm *)s->private;
211 struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
212 struct vgic_irq *irq;
213 struct kvm_vcpu *vcpu = NULL;
214
215 if (iter->dist_id == 0) {
216 print_dist_state(s, &kvm->arch.vgic);
217 return 0;
218 }
219
220 if (!kvm->arch.vgic.initialized)
221 return 0;
222
223 if (iter->vcpu_id < iter->nr_cpus) {
224 vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
225 irq = &vcpu->arch.vgic_cpu.private_irqs[iter->intid];
226 } else {
227 irq = &kvm->arch.vgic.spis[iter->intid - VGIC_NR_PRIVATE_IRQS];
228 }
229
230 spin_lock(&irq->irq_lock);
231 print_irq_state(s, irq, vcpu);
232 spin_unlock(&irq->irq_lock);
233
234 return 0;
235}
236
237static struct seq_operations vgic_debug_seq_ops = {
238 .start = vgic_debug_start,
239 .next = vgic_debug_next,
240 .stop = vgic_debug_stop,
241 .show = vgic_debug_show
242};
243
244static int debug_open(struct inode *inode, struct file *file)
245{
246 int ret;
247 ret = seq_open(file, &vgic_debug_seq_ops);
248 if (!ret) {
249 struct seq_file *seq;
250 /* seq_open will have modified file->private_data */
251 seq = file->private_data;
252 seq->private = inode->i_private;
253 }
254
255 return ret;
	 256}
257
258static struct file_operations vgic_debug_fops = {
259 .owner = THIS_MODULE,
260 .open = debug_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = seq_release
264};
265
266int vgic_debug_init(struct kvm *kvm)
267{
268 if (!kvm->debugfs_dentry)
269 return -ENOENT;
270
271 if (!debugfs_create_file("vgic-state", 0444,
272 kvm->debugfs_dentry,
273 kvm,
274 &vgic_debug_fops))
275 return -ENOMEM;
276
277 return 0;
278}
279
280int vgic_debug_destroy(struct kvm *kvm)
281{
282 return 0;
283}
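The new vgic-state file is a textbook use of the seq_file start/next/stop/show iterator: start() fast-forwards to *pos, next() advances one interrupt at a time, stop() releases the per-VM iterator. For reference, a minimal self-contained version of the same pattern; every demo_* name below is invented for illustration, only the seq_file and debugfs calls themselves are real:

	#include <linux/debugfs.h>
	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/seq_file.h>

	static int demo_vals[] = { 1, 2, 3 };

	static void *demo_start(struct seq_file *s, loff_t *pos)
	{
		/* Return the element at *pos, or NULL when iteration is done. */
		return (*pos < ARRAY_SIZE(demo_vals)) ? &demo_vals[*pos] : NULL;
	}

	static void *demo_next(struct seq_file *s, void *v, loff_t *pos)
	{
		++*pos;
		return demo_start(s, pos);
	}

	static void demo_stop(struct seq_file *s, void *v)
	{
		/* Nothing to release in this toy example. */
	}

	static int demo_show(struct seq_file *s, void *v)
	{
		seq_printf(s, "%d\n", *(int *)v);
		return 0;
	}

	static const struct seq_operations demo_seq_ops = {
		.start = demo_start,
		.next  = demo_next,
		.stop  = demo_stop,
		.show  = demo_show,
	};

	static int demo_open(struct inode *inode, struct file *file)
	{
		return seq_open(file, &demo_seq_ops);
	}

	static const struct file_operations demo_fops = {
		.owner   = THIS_MODULE,
		.open    = demo_open,
		.read    = seq_read,
		.llseek  = seq_lseek,
		.release = seq_release,
	};

	/* Hook it up under an existing debugfs directory. */
	static struct dentry *demo_register(struct dentry *dir)
	{
		return debugfs_create_file("demo-state", 0444, dir, NULL, &demo_fops);
	}
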
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index c737ea0a310a..276139a24e6f 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -259,6 +259,8 @@ int vgic_init(struct kvm *kvm)
259 if (ret) 259 if (ret)
260 goto out; 260 goto out;
261 261
262 vgic_debug_init(kvm);
263
262 dist->initialized = true; 264 dist->initialized = true;
263out: 265out:
264 return ret; 266 return ret;
@@ -288,6 +290,8 @@ static void __kvm_vgic_destroy(struct kvm *kvm)
288 struct kvm_vcpu *vcpu; 290 struct kvm_vcpu *vcpu;
289 int i; 291 int i;
290 292
293 vgic_debug_destroy(kvm);
294
291 kvm_vgic_dist_destroy(kvm); 295 kvm_vgic_dist_destroy(kvm);
292 296
293 kvm_for_each_vcpu(i, vcpu, kvm) 297 kvm_for_each_vcpu(i, vcpu, kvm)
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
index d918dcf26a5a..f138ed2e9c63 100644
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ b/virt/kvm/arm/vgic/vgic-irqfd.c
@@ -99,6 +99,9 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
99 if (!vgic_has_its(kvm)) 99 if (!vgic_has_its(kvm))
100 return -ENODEV; 100 return -ENODEV;
101 101
102 if (!level)
103 return -1;
104
102 return vgic_its_inject_msi(kvm, &msi); 105 return vgic_its_inject_msi(kvm, &msi);
103} 106}
104 107
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 8c2b3cdcb2c5..571b64a01c50 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -350,7 +350,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
350 350
351 irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); 351 irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
352 spin_lock(&irq->irq_lock); 352 spin_lock(&irq->irq_lock);
353 irq->pending = pendmask & (1U << bit_nr); 353 irq->pending_latch = pendmask & (1U << bit_nr);
354 vgic_queue_irq_unlock(vcpu->kvm, irq); 354 vgic_queue_irq_unlock(vcpu->kvm, irq);
355 vgic_put_irq(vcpu->kvm, irq); 355 vgic_put_irq(vcpu->kvm, irq);
356 } 356 }
@@ -465,7 +465,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
465 return -EBUSY; 465 return -EBUSY;
466 466
467 spin_lock(&itte->irq->irq_lock); 467 spin_lock(&itte->irq->irq_lock);
468 itte->irq->pending = true; 468 itte->irq->pending_latch = true;
469 vgic_queue_irq_unlock(kvm, itte->irq); 469 vgic_queue_irq_unlock(kvm, itte->irq);
470 470
471 return 0; 471 return 0;
@@ -913,7 +913,7 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
913 if (!itte) 913 if (!itte)
914 return E_ITS_CLEAR_UNMAPPED_INTERRUPT; 914 return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
915 915
916 itte->irq->pending = false; 916 itte->irq->pending_latch = false;
917 917
918 return 0; 918 return 0;
919} 919}
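Throughout the series the old irq->pending flag becomes irq->pending_latch, and whether an interrupt is actually pending is derived on demand from the latch and, for level-triggered interrupts, the line level. The irq_is_pending() helper used in the later hunks is presumably equivalent to something like this sketch (the real definition lives in the vgic private header, which is not shown in this diff):

	static inline bool irq_is_pending(struct vgic_irq *irq)
	{
		if (irq->config == VGIC_CONFIG_EDGE)
			return irq->pending_latch;

		return irq->pending_latch || irq->line_level;
	}
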
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index fbe87a63d250..d181d2baee9c 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -17,6 +17,7 @@
17#include <kvm/arm_vgic.h> 17#include <kvm/arm_vgic.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <asm/kvm_mmu.h> 19#include <asm/kvm_mmu.h>
20#include <asm/cputype.h>
20#include "vgic.h" 21#include "vgic.h"
21 22
22/* common helpers */ 23/* common helpers */
@@ -230,14 +231,8 @@ int kvm_register_vgic_device(unsigned long type)
230 return ret; 231 return ret;
231} 232}
232 233
233struct vgic_reg_attr { 234int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
234 struct kvm_vcpu *vcpu; 235 struct vgic_reg_attr *reg_attr)
235 gpa_t addr;
236};
237
238static int parse_vgic_v2_attr(struct kvm_device *dev,
239 struct kvm_device_attr *attr,
240 struct vgic_reg_attr *reg_attr)
241{ 236{
242 int cpuid; 237 int cpuid;
243 238
@@ -292,14 +287,14 @@ static bool lock_all_vcpus(struct kvm *kvm)
292} 287}
293 288
294/** 289/**
295 * vgic_attr_regs_access_v2 - allows user space to access VGIC v2 state 290 * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
296 * 291 *
297 * @dev: kvm device handle 292 * @dev: kvm device handle
298 * @attr: kvm device attribute 293 * @attr: kvm device attribute
299 * @reg: address the value is read or written 294 * @reg: address the value is read or written
300 * @is_write: true if userspace is writing a register 295 * @is_write: true if userspace is writing a register
301 */ 296 */
302static int vgic_attr_regs_access_v2(struct kvm_device *dev, 297static int vgic_v2_attr_regs_access(struct kvm_device *dev,
303 struct kvm_device_attr *attr, 298 struct kvm_device_attr *attr,
304 u32 *reg, bool is_write) 299 u32 *reg, bool is_write)
305{ 300{
@@ -308,7 +303,7 @@ static int vgic_attr_regs_access_v2(struct kvm_device *dev,
308 struct kvm_vcpu *vcpu; 303 struct kvm_vcpu *vcpu;
309 int ret; 304 int ret;
310 305
311 ret = parse_vgic_v2_attr(dev, attr, &reg_attr); 306 ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
312 if (ret) 307 if (ret)
313 return ret; 308 return ret;
314 309
@@ -362,7 +357,7 @@ static int vgic_v2_set_attr(struct kvm_device *dev,
362 if (get_user(reg, uaddr)) 357 if (get_user(reg, uaddr))
363 return -EFAULT; 358 return -EFAULT;
364 359
365 return vgic_attr_regs_access_v2(dev, attr, &reg, true); 360 return vgic_v2_attr_regs_access(dev, attr, &reg, true);
366 } 361 }
367 } 362 }
368 363
@@ -384,7 +379,7 @@ static int vgic_v2_get_attr(struct kvm_device *dev,
384 u32 __user *uaddr = (u32 __user *)(long)attr->addr; 379 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
385 u32 reg = 0; 380 u32 reg = 0;
386 381
387 ret = vgic_attr_regs_access_v2(dev, attr, &reg, false); 382 ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
388 if (ret) 383 if (ret)
389 return ret; 384 return ret;
390 return put_user(reg, uaddr); 385 return put_user(reg, uaddr);
@@ -428,16 +423,211 @@ struct kvm_device_ops kvm_arm_vgic_v2_ops = {
428 .has_attr = vgic_v2_has_attr, 423 .has_attr = vgic_v2_has_attr,
429}; 424};
430 425
426int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
427 struct vgic_reg_attr *reg_attr)
428{
429 unsigned long vgic_mpidr, mpidr_reg;
430
431 /*
432 * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
433 * attr might not hold MPIDR. Hence assume vcpu0.
434 */
435 if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
436 vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
437 KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
438
439 mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
440 reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
441 } else {
442 reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
443 }
444
445 if (!reg_attr->vcpu)
446 return -EINVAL;
447
448 reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
449
450 return 0;
451}
452
453/*
454 * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
455 *
456 * @dev: kvm device handle
457 * @attr: kvm device attribute
458 * @reg: address the value is read or written
459 * @is_write: true if userspace is writing a register
460 */
461static int vgic_v3_attr_regs_access(struct kvm_device *dev,
462 struct kvm_device_attr *attr,
463 u64 *reg, bool is_write)
464{
465 struct vgic_reg_attr reg_attr;
466 gpa_t addr;
467 struct kvm_vcpu *vcpu;
468 int ret;
469 u32 tmp32;
470
471 ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
472 if (ret)
473 return ret;
474
475 vcpu = reg_attr.vcpu;
476 addr = reg_attr.addr;
477
478 mutex_lock(&dev->kvm->lock);
479
480 if (unlikely(!vgic_initialized(dev->kvm))) {
481 ret = -EBUSY;
482 goto out;
483 }
484
485 if (!lock_all_vcpus(dev->kvm)) {
486 ret = -EBUSY;
487 goto out;
488 }
489
490 switch (attr->group) {
491 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
492 if (is_write)
493 tmp32 = *reg;
494
495 ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
496 if (!is_write)
497 *reg = tmp32;
498 break;
499 case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
500 if (is_write)
501 tmp32 = *reg;
502
503 ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
504 if (!is_write)
505 *reg = tmp32;
506 break;
507 case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
508 u64 regid;
509
510 regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
511 ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
512 regid, reg);
513 break;
514 }
515 case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
516 unsigned int info, intid;
517
518 info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
519 KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
520 if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
521 intid = attr->attr &
522 KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
523 ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
524 intid, reg);
525 } else {
526 ret = -EINVAL;
527 }
528 break;
529 }
530 default:
531 ret = -EINVAL;
532 break;
533 }
534
535 unlock_all_vcpus(dev->kvm);
536out:
537 mutex_unlock(&dev->kvm->lock);
538 return ret;
539}
540
431static int vgic_v3_set_attr(struct kvm_device *dev, 541static int vgic_v3_set_attr(struct kvm_device *dev,
432 struct kvm_device_attr *attr) 542 struct kvm_device_attr *attr)
433{ 543{
434 return vgic_set_common_attr(dev, attr); 544 int ret;
545
546 ret = vgic_set_common_attr(dev, attr);
547 if (ret != -ENXIO)
548 return ret;
549
550 switch (attr->group) {
551 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
552 case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
553 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
554 u32 tmp32;
555 u64 reg;
556
557 if (get_user(tmp32, uaddr))
558 return -EFAULT;
559
560 reg = tmp32;
561 return vgic_v3_attr_regs_access(dev, attr, &reg, true);
562 }
563 case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
564 u64 __user *uaddr = (u64 __user *)(long)attr->addr;
565 u64 reg;
566
567 if (get_user(reg, uaddr))
568 return -EFAULT;
569
570 return vgic_v3_attr_regs_access(dev, attr, &reg, true);
571 }
572 case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
573 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
574 u64 reg;
575 u32 tmp32;
576
577 if (get_user(tmp32, uaddr))
578 return -EFAULT;
579
580 reg = tmp32;
581 return vgic_v3_attr_regs_access(dev, attr, &reg, true);
582 }
583 }
584 return -ENXIO;
435} 585}
436 586
437static int vgic_v3_get_attr(struct kvm_device *dev, 587static int vgic_v3_get_attr(struct kvm_device *dev,
438 struct kvm_device_attr *attr) 588 struct kvm_device_attr *attr)
439{ 589{
440 return vgic_get_common_attr(dev, attr); 590 int ret;
591
592 ret = vgic_get_common_attr(dev, attr);
593 if (ret != -ENXIO)
594 return ret;
595
596 switch (attr->group) {
597 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
598 case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
599 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
600 u64 reg;
601 u32 tmp32;
602
603 ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
604 if (ret)
605 return ret;
606 tmp32 = reg;
607 return put_user(tmp32, uaddr);
608 }
609 case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
610 u64 __user *uaddr = (u64 __user *)(long)attr->addr;
611 u64 reg;
612
613 ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
614 if (ret)
615 return ret;
616 return put_user(reg, uaddr);
617 }
618 case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
619 u32 __user *uaddr = (u32 __user *)(long)attr->addr;
620 u64 reg;
621 u32 tmp32;
622
623 ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
624 if (ret)
625 return ret;
626 tmp32 = reg;
627 return put_user(tmp32, uaddr);
628 }
629 }
630 return -ENXIO;
441} 631}
442 632
443static int vgic_v3_has_attr(struct kvm_device *dev, 633static int vgic_v3_has_attr(struct kvm_device *dev,
@@ -451,8 +641,19 @@ static int vgic_v3_has_attr(struct kvm_device *dev,
451 return 0; 641 return 0;
452 } 642 }
453 break; 643 break;
644 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
645 case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
646 case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
647 return vgic_v3_has_attr_regs(dev, attr);
454 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: 648 case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
455 return 0; 649 return 0;
650 case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
651 if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
652 KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
653 VGIC_LEVEL_INFO_LINE_LEVEL)
654 return 0;
655 break;
656 }
456 case KVM_DEV_ARM_VGIC_GRP_CTRL: 657 case KVM_DEV_ARM_VGIC_GRP_CTRL:
457 switch (attr->attr) { 658 switch (attr->attr) {
458 case KVM_DEV_ARM_VGIC_CTRL_INIT: 659 case KVM_DEV_ARM_VGIC_CTRL_INIT:
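vgic_v3_parse_attr() expects the target vCPU's packed affinity value in the upper bits of attr->attr for the redistributor and sysreg groups, with the register offset in the low bits (the DIST_REGS group implicitly targets vcpu0). A rough userspace sketch of building such an attribute; the fd is hypothetical, error handling is omitted, and the exact affinity packing is the one documented in Documentation/virtual/kvm/devices/arm-vgic-v3.txt:

	#include <linux/kvm.h>
	#include <stdint.h>
	#include <sys/ioctl.h>

	/* Read one 32-bit redistributor register of the vCPU identified by 'mpidr'. */
	static int read_redist_reg(int vgic_fd, uint64_t mpidr, uint32_t offset,
				   uint32_t *val)
	{
		struct kvm_device_attr attr = {
			.group = KVM_DEV_ARM_VGIC_GRP_REDIST_REGS,
			.attr  = ((mpidr << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) &
				  KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) |
				 (offset & KVM_DEV_ARM_VGIC_OFFSET_MASK),
			.addr  = (uint64_t)(unsigned long)val,
		};

		return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
	}
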
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 78e34bc4d89b..a3ad7ff95c9b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -98,7 +98,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
98 irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); 98 irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
99 99
100 spin_lock(&irq->irq_lock); 100 spin_lock(&irq->irq_lock);
101 irq->pending = true; 101 irq->pending_latch = true;
102 irq->source |= 1U << source_vcpu->vcpu_id; 102 irq->source |= 1U << source_vcpu->vcpu_id;
103 103
104 vgic_queue_irq_unlock(source_vcpu->kvm, irq); 104 vgic_queue_irq_unlock(source_vcpu->kvm, irq);
@@ -182,7 +182,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
182 182
183 irq->source &= ~((val >> (i * 8)) & 0xff); 183 irq->source &= ~((val >> (i * 8)) & 0xff);
184 if (!irq->source) 184 if (!irq->source)
185 irq->pending = false; 185 irq->pending_latch = false;
186 186
187 spin_unlock(&irq->irq_lock); 187 spin_unlock(&irq->irq_lock);
188 vgic_put_irq(vcpu->kvm, irq); 188 vgic_put_irq(vcpu->kvm, irq);
@@ -204,7 +204,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
204 irq->source |= (val >> (i * 8)) & 0xff; 204 irq->source |= (val >> (i * 8)) & 0xff;
205 205
206 if (irq->source) { 206 if (irq->source) {
207 irq->pending = true; 207 irq->pending_latch = true;
208 vgic_queue_irq_unlock(vcpu->kvm, irq); 208 vgic_queue_irq_unlock(vcpu->kvm, irq);
209 } else { 209 } else {
210 spin_unlock(&irq->irq_lock); 210 spin_unlock(&irq->irq_lock);
@@ -213,22 +213,6 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
213 } 213 }
214} 214}
215 215
216static void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
217{
218 if (kvm_vgic_global_state.type == VGIC_V2)
219 vgic_v2_set_vmcr(vcpu, vmcr);
220 else
221 vgic_v3_set_vmcr(vcpu, vmcr);
222}
223
224static void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
225{
226 if (kvm_vgic_global_state.type == VGIC_V2)
227 vgic_v2_get_vmcr(vcpu, vmcr);
228 else
229 vgic_v3_get_vmcr(vcpu, vmcr);
230}
231
232#define GICC_ARCH_VERSION_V2 0x2 216#define GICC_ARCH_VERSION_V2 0x2
233 217
234/* These are for userland accesses only, there is no guest-facing emulation. */ 218/* These are for userland accesses only, there is no guest-facing emulation. */
@@ -369,21 +353,30 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
369 353
370int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) 354int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
371{ 355{
372 int nr_irqs = dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; 356 const struct vgic_register_region *region;
373 const struct vgic_register_region *regions; 357 struct vgic_io_device iodev;
358 struct vgic_reg_attr reg_attr;
359 struct kvm_vcpu *vcpu;
374 gpa_t addr; 360 gpa_t addr;
375 int nr_regions, i, len; 361 int ret;
362
363 ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
364 if (ret)
365 return ret;
376 366
377 addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; 367 vcpu = reg_attr.vcpu;
368 addr = reg_attr.addr;
378 369
379 switch (attr->group) { 370 switch (attr->group) {
380 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: 371 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
381 regions = vgic_v2_dist_registers; 372 iodev.regions = vgic_v2_dist_registers;
382 nr_regions = ARRAY_SIZE(vgic_v2_dist_registers); 373 iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
374 iodev.base_addr = 0;
383 break; 375 break;
384 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: 376 case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
385 regions = vgic_v2_cpu_registers; 377 iodev.regions = vgic_v2_cpu_registers;
386 nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers); 378 iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
379 iodev.base_addr = 0;
387 break; 380 break;
388 default: 381 default:
389 return -ENXIO; 382 return -ENXIO;
@@ -393,43 +386,11 @@ int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
393 if (addr & 3) 386 if (addr & 3)
394 return -ENXIO; 387 return -ENXIO;
395 388
396 for (i = 0; i < nr_regions; i++) { 389 region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
397 if (regions[i].bits_per_irq) 390 if (!region)
398 len = (regions[i].bits_per_irq * nr_irqs) / 8; 391 return -ENXIO;
399 else
400 len = regions[i].len;
401
402 if (regions[i].reg_offset <= addr &&
403 regions[i].reg_offset + len > addr)
404 return 0;
405 }
406
407 return -ENXIO;
408}
409
410/*
411 * When userland tries to access the VGIC register handlers, we need to
412 * create a usable struct vgic_io_device to be passed to the handlers and we
413 * have to set up a buffer similar to what would have happened if a guest MMIO
414 * access occurred, including doing endian conversions on BE systems.
415 */
416static int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
417 bool is_write, int offset, u32 *val)
418{
419 unsigned int len = 4;
420 u8 buf[4];
421 int ret;
422
423 if (is_write) {
424 vgic_data_host_to_mmio_bus(buf, len, *val);
425 ret = kvm_io_gic_ops.write(vcpu, &dev->dev, offset, len, buf);
426 } else {
427 ret = kvm_io_gic_ops.read(vcpu, &dev->dev, offset, len, buf);
428 if (!ret)
429 *val = vgic_data_mmio_bus_to_host(buf, len);
430 }
431 392
432 return ret; 393 return 0;
433} 394}
434 395
435int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, 396int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 50f42f0f8c4f..6afb3b484886 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -18,6 +18,8 @@
18#include <kvm/arm_vgic.h> 18#include <kvm/arm_vgic.h>
19 19
20#include <asm/kvm_emulate.h> 20#include <asm/kvm_emulate.h>
21#include <asm/kvm_arm.h>
22#include <asm/kvm_mmu.h>
21 23
22#include "vgic.h" 24#include "vgic.h"
23#include "vgic-mmio.h" 25#include "vgic-mmio.h"
@@ -207,6 +209,60 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
207 return 0; 209 return 0;
208} 210}
209 211
212static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
213 gpa_t addr, unsigned int len)
214{
215 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
216 u32 value = 0;
217 int i;
218
219 /*
	 220	 * The pending state of the interrupt is latched in the pending_latch field.
221 * Userspace will save and restore pending state and line_level
222 * separately.
223 * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt
224 * for handling of ISPENDR and ICPENDR.
225 */
226 for (i = 0; i < len * 8; i++) {
227 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
228
229 if (irq->pending_latch)
230 value |= (1U << i);
231
232 vgic_put_irq(vcpu->kvm, irq);
233 }
234
235 return value;
236}
237
238static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
239 gpa_t addr, unsigned int len,
240 unsigned long val)
241{
242 u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
243 int i;
244
245 for (i = 0; i < len * 8; i++) {
246 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
247
248 spin_lock(&irq->irq_lock);
249 if (test_bit(i, &val)) {
250 /*
	 251	 * pending_latch is set irrespective of the irq type
	 252	 * (level or edge), so userspace does not have to restore the
	 253	 * irq configuration before the pending state.
254 */
255 irq->pending_latch = true;
256 vgic_queue_irq_unlock(vcpu->kvm, irq);
257 } else {
258 irq->pending_latch = false;
259 spin_unlock(&irq->irq_lock);
260 }
261
262 vgic_put_irq(vcpu->kvm, irq);
263 }
264}
265
210/* We want to avoid outer shareable. */ 266/* We want to avoid outer shareable. */
211u64 vgic_sanitise_shareability(u64 field) 267u64 vgic_sanitise_shareability(u64 field)
212{ 268{
@@ -356,7 +412,7 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
356 * We take some special care here to fix the calculation of the register 412 * We take some special care here to fix the calculation of the register
357 * offset. 413 * offset.
358 */ 414 */
359#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, bpi, acc) \ 415#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
360 { \ 416 { \
361 .reg_offset = off, \ 417 .reg_offset = off, \
362 .bits_per_irq = bpi, \ 418 .bits_per_irq = bpi, \
@@ -371,47 +427,54 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
371 .access_flags = acc, \ 427 .access_flags = acc, \
372 .read = rd, \ 428 .read = rd, \
373 .write = wr, \ 429 .write = wr, \
430 .uaccess_read = ur, \
431 .uaccess_write = uw, \
374 } 432 }
375 433
376static const struct vgic_register_region vgic_v3_dist_registers[] = { 434static const struct vgic_register_region vgic_v3_dist_registers[] = {
377 REGISTER_DESC_WITH_LENGTH(GICD_CTLR, 435 REGISTER_DESC_WITH_LENGTH(GICD_CTLR,
378 vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16, 436 vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc, 16,
379 VGIC_ACCESS_32bit), 437 VGIC_ACCESS_32bit),
438 REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
439 vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
440 VGIC_ACCESS_32bit),
380 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR, 441 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
381 vgic_mmio_read_rao, vgic_mmio_write_wi, 1, 442 vgic_mmio_read_rao, vgic_mmio_write_wi, NULL, NULL, 1,
382 VGIC_ACCESS_32bit), 443 VGIC_ACCESS_32bit),
383 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, 444 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
384 vgic_mmio_read_enable, vgic_mmio_write_senable, 1, 445 vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
385 VGIC_ACCESS_32bit), 446 VGIC_ACCESS_32bit),
386 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, 447 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
387 vgic_mmio_read_enable, vgic_mmio_write_cenable, 1, 448 vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
388 VGIC_ACCESS_32bit), 449 VGIC_ACCESS_32bit),
389 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, 450 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
390 vgic_mmio_read_pending, vgic_mmio_write_spending, 1, 451 vgic_mmio_read_pending, vgic_mmio_write_spending,
452 vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
391 VGIC_ACCESS_32bit), 453 VGIC_ACCESS_32bit),
392 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR, 454 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
393 vgic_mmio_read_pending, vgic_mmio_write_cpending, 1, 455 vgic_mmio_read_pending, vgic_mmio_write_cpending,
456 vgic_mmio_read_raz, vgic_mmio_write_wi, 1,
394 VGIC_ACCESS_32bit), 457 VGIC_ACCESS_32bit),
395 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, 458 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
396 vgic_mmio_read_active, vgic_mmio_write_sactive, 1, 459 vgic_mmio_read_active, vgic_mmio_write_sactive, NULL, NULL, 1,
397 VGIC_ACCESS_32bit), 460 VGIC_ACCESS_32bit),
398 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, 461 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
399 vgic_mmio_read_active, vgic_mmio_write_cactive, 1, 462 vgic_mmio_read_active, vgic_mmio_write_cactive, NULL, NULL, 1,
400 VGIC_ACCESS_32bit), 463 VGIC_ACCESS_32bit),
401 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, 464 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
402 vgic_mmio_read_priority, vgic_mmio_write_priority, 8, 465 vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
403 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 466 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
404 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR, 467 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
405 vgic_mmio_read_raz, vgic_mmio_write_wi, 8, 468 vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
406 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), 469 VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
407 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR, 470 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
408 vgic_mmio_read_config, vgic_mmio_write_config, 2, 471 vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
409 VGIC_ACCESS_32bit), 472 VGIC_ACCESS_32bit),
410 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR, 473 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
411 vgic_mmio_read_raz, vgic_mmio_write_wi, 1, 474 vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
412 VGIC_ACCESS_32bit), 475 VGIC_ACCESS_32bit),
413 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER, 476 REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
414 vgic_mmio_read_irouter, vgic_mmio_write_irouter, 64, 477 vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
415 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), 478 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
416 REGISTER_DESC_WITH_LENGTH(GICD_IDREGS, 479 REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
417 vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, 480 vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
@@ -422,12 +485,18 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
422 REGISTER_DESC_WITH_LENGTH(GICR_CTLR, 485 REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
423 vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, 486 vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
424 VGIC_ACCESS_32bit), 487 VGIC_ACCESS_32bit),
488 REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
489 vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
490 VGIC_ACCESS_32bit),
425 REGISTER_DESC_WITH_LENGTH(GICR_IIDR, 491 REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
426 vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, 492 vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
427 VGIC_ACCESS_32bit), 493 VGIC_ACCESS_32bit),
428 REGISTER_DESC_WITH_LENGTH(GICR_TYPER, 494 REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
429 vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, 495 vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
430 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), 496 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
497 REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
498 vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
499 VGIC_ACCESS_32bit),
431 REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, 500 REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
432 vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, 501 vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
433 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), 502 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
@@ -449,11 +518,13 @@ static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
449 REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0, 518 REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0,
450 vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, 519 vgic_mmio_read_enable, vgic_mmio_write_cenable, 4,
451 VGIC_ACCESS_32bit), 520 VGIC_ACCESS_32bit),
452 REGISTER_DESC_WITH_LENGTH(GICR_ISPENDR0, 521 REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISPENDR0,
453 vgic_mmio_read_pending, vgic_mmio_write_spending, 4, 522 vgic_mmio_read_pending, vgic_mmio_write_spending,
523 vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
454 VGIC_ACCESS_32bit), 524 VGIC_ACCESS_32bit),
455 REGISTER_DESC_WITH_LENGTH(GICR_ICPENDR0, 525 REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0,
456 vgic_mmio_read_pending, vgic_mmio_write_cpending, 4, 526 vgic_mmio_read_pending, vgic_mmio_write_cpending,
527 vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
457 VGIC_ACCESS_32bit), 528 VGIC_ACCESS_32bit),
458 REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0, 529 REGISTER_DESC_WITH_LENGTH(GICR_ISACTIVER0,
459 vgic_mmio_read_active, vgic_mmio_write_sactive, 4, 530 vgic_mmio_read_active, vgic_mmio_write_sactive, 4,
@@ -546,6 +617,54 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
546 return ret; 617 return ret;
547} 618}
548 619
620int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
621{
622 const struct vgic_register_region *region;
623 struct vgic_io_device iodev;
624 struct vgic_reg_attr reg_attr;
625 struct kvm_vcpu *vcpu;
626 gpa_t addr;
627 int ret;
628
629 ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
630 if (ret)
631 return ret;
632
633 vcpu = reg_attr.vcpu;
634 addr = reg_attr.addr;
635
636 switch (attr->group) {
637 case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
638 iodev.regions = vgic_v3_dist_registers;
639 iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
640 iodev.base_addr = 0;
641 break;
	 642	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
643 iodev.regions = vgic_v3_rdbase_registers;
644 iodev.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
645 iodev.base_addr = 0;
646 break;
647 }
648 case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
649 u64 reg, id;
650
651 id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
652 return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
653 }
654 default:
655 return -ENXIO;
656 }
657
658 /* We only support aligned 32-bit accesses. */
659 if (addr & 3)
660 return -ENXIO;
661
662 region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
663 if (!region)
664 return -ENXIO;
665
666 return 0;
667}
549/* 668/*
550 * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI 669 * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
551 * generation register ICC_SGI1R_EL1) with a given VCPU. 670 * generation register ICC_SGI1R_EL1) with a given VCPU.
@@ -646,9 +765,55 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
646 irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); 765 irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
647 766
648 spin_lock(&irq->irq_lock); 767 spin_lock(&irq->irq_lock);
649 irq->pending = true; 768 irq->pending_latch = true;
650 769
651 vgic_queue_irq_unlock(vcpu->kvm, irq); 770 vgic_queue_irq_unlock(vcpu->kvm, irq);
652 vgic_put_irq(vcpu->kvm, irq); 771 vgic_put_irq(vcpu->kvm, irq);
653 } 772 }
654} 773}
774
775int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
776 int offset, u32 *val)
777{
778 struct vgic_io_device dev = {
779 .regions = vgic_v3_dist_registers,
780 .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
781 };
782
783 return vgic_uaccess(vcpu, &dev, is_write, offset, val);
784}
785
786int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
787 int offset, u32 *val)
788{
789 struct vgic_io_device rd_dev = {
790 .regions = vgic_v3_rdbase_registers,
791 .nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers),
792 };
793
794 struct vgic_io_device sgi_dev = {
795 .regions = vgic_v3_sgibase_registers,
796 .nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers),
797 };
798
799 /* SGI_base is the next 64K frame after RD_base */
800 if (offset >= SZ_64K)
801 return vgic_uaccess(vcpu, &sgi_dev, is_write, offset - SZ_64K,
802 val);
803 else
804 return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
805}
806
807int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
808 u32 intid, u64 *val)
809{
810 if (intid % 32)
811 return -EINVAL;
812
813 if (is_write)
814 vgic_write_irq_line_level_info(vcpu, intid, *val);
815 else
816 *val = vgic_read_irq_line_level_info(vcpu, intid);
817
818 return 0;
819}
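vgic_v3_redist_uaccess() relies on the GICv3 redistributor layout of two consecutive 64K frames, RD_base followed by SGI_base: any offset at or above SZ_64K is rebased onto the SGI register table before the usual region lookup runs. The rebasing in isolation (illustrative helper, not kernel code):

	#define FRAME_SIZE	0x10000UL	/* SZ_64K */

	/* e.g. a redistributor offset of 0x10080 is looked up as 0x0080 in the SGI frame. */
	static unsigned long redist_frame_offset(unsigned long offset, int *sgi_frame)
	{
		*sgi_frame = offset >= FRAME_SIZE;
		return *sgi_frame ? offset - FRAME_SIZE : offset;
	}
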
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index ebe1b9fa3c4d..3654b4c835ef 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -111,7 +111,7 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
111 for (i = 0; i < len * 8; i++) { 111 for (i = 0; i < len * 8; i++) {
112 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 112 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
113 113
114 if (irq->pending) 114 if (irq_is_pending(irq))
115 value |= (1U << i); 115 value |= (1U << i);
116 116
117 vgic_put_irq(vcpu->kvm, irq); 117 vgic_put_irq(vcpu->kvm, irq);
@@ -131,9 +131,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
131 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 131 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
132 132
133 spin_lock(&irq->irq_lock); 133 spin_lock(&irq->irq_lock);
134 irq->pending = true; 134 irq->pending_latch = true;
135 if (irq->config == VGIC_CONFIG_LEVEL)
136 irq->soft_pending = true;
137 135
138 vgic_queue_irq_unlock(vcpu->kvm, irq); 136 vgic_queue_irq_unlock(vcpu->kvm, irq);
139 vgic_put_irq(vcpu->kvm, irq); 137 vgic_put_irq(vcpu->kvm, irq);
@@ -152,12 +150,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
152 150
153 spin_lock(&irq->irq_lock); 151 spin_lock(&irq->irq_lock);
154 152
155 if (irq->config == VGIC_CONFIG_LEVEL) { 153 irq->pending_latch = false;
156 irq->soft_pending = false;
157 irq->pending = irq->line_level;
158 } else {
159 irq->pending = false;
160 }
161 154
162 spin_unlock(&irq->irq_lock); 155 spin_unlock(&irq->irq_lock);
163 vgic_put_irq(vcpu->kvm, irq); 156 vgic_put_irq(vcpu->kvm, irq);
@@ -359,18 +352,70 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
359 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); 352 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
360 spin_lock(&irq->irq_lock); 353 spin_lock(&irq->irq_lock);
361 354
362 if (test_bit(i * 2 + 1, &val)) { 355 if (test_bit(i * 2 + 1, &val))
363 irq->config = VGIC_CONFIG_EDGE; 356 irq->config = VGIC_CONFIG_EDGE;
364 } else { 357 else
365 irq->config = VGIC_CONFIG_LEVEL; 358 irq->config = VGIC_CONFIG_LEVEL;
366 irq->pending = irq->line_level | irq->soft_pending;
367 }
368 359
369 spin_unlock(&irq->irq_lock); 360 spin_unlock(&irq->irq_lock);
370 vgic_put_irq(vcpu->kvm, irq); 361 vgic_put_irq(vcpu->kvm, irq);
371 } 362 }
372} 363}
373 364
365u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
366{
367 int i;
368 u64 val = 0;
369 int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
370
371 for (i = 0; i < 32; i++) {
372 struct vgic_irq *irq;
373
374 if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
375 continue;
376
377 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
378 if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
379 val |= (1U << i);
380
381 vgic_put_irq(vcpu->kvm, irq);
382 }
383
384 return val;
385}
386
387void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
388 const u64 val)
389{
390 int i;
391 int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
392
393 for (i = 0; i < 32; i++) {
394 struct vgic_irq *irq;
395 bool new_level;
396
397 if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
398 continue;
399
400 irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
401
402 /*
	403	 * Line level is set irrespective of the irq type
	404	 * (level or edge), so userspace does not have to restore the
	405	 * irq configuration before the line level.
406 */
407 new_level = !!(val & (1U << i));
408 spin_lock(&irq->irq_lock);
409 irq->line_level = new_level;
410 if (new_level)
411 vgic_queue_irq_unlock(vcpu->kvm, irq);
412 else
413 spin_unlock(&irq->irq_lock);
414
415 vgic_put_irq(vcpu->kvm, irq);
416 }
417}
418
374static int match_region(const void *key, const void *elt) 419static int match_region(const void *key, const void *elt)
375{ 420{
376 const unsigned int offset = (unsigned long)key; 421 const unsigned int offset = (unsigned long)key;
@@ -394,6 +439,22 @@ vgic_find_mmio_region(const struct vgic_register_region *region, int nr_regions,
394 sizeof(region[0]), match_region); 439 sizeof(region[0]), match_region);
395} 440}
396 441
442void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
443{
444 if (kvm_vgic_global_state.type == VGIC_V2)
445 vgic_v2_set_vmcr(vcpu, vmcr);
446 else
447 vgic_v3_set_vmcr(vcpu, vmcr);
448}
449
450void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
451{
452 if (kvm_vgic_global_state.type == VGIC_V2)
453 vgic_v2_get_vmcr(vcpu, vmcr);
454 else
455 vgic_v3_get_vmcr(vcpu, vmcr);
456}
457
397/* 458/*
398 * kvm_mmio_read_buf() returns a value in a format where it can be converted 459 * kvm_mmio_read_buf() returns a value in a format where it can be converted
399 * to a byte array and be directly observed as the guest wanted it to appear 460 * to a byte array and be directly observed as the guest wanted it to appear
@@ -484,6 +545,74 @@ static bool check_region(const struct kvm *kvm,
484 return false; 545 return false;
485} 546}
486 547
548const struct vgic_register_region *
549vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
550 gpa_t addr, int len)
551{
552 const struct vgic_register_region *region;
553
554 region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
555 addr - iodev->base_addr);
556 if (!region || !check_region(vcpu->kvm, region, addr, len))
557 return NULL;
558
559 return region;
560}
561
562static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
563 gpa_t addr, u32 *val)
564{
565 struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
566 const struct vgic_register_region *region;
567 struct kvm_vcpu *r_vcpu;
568
569 region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
570 if (!region) {
571 *val = 0;
572 return 0;
573 }
574
575 r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
576 if (region->uaccess_read)
577 *val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
578 else
579 *val = region->read(r_vcpu, addr, sizeof(u32));
580
581 return 0;
582}
583
584static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
585 gpa_t addr, const u32 *val)
586{
587 struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
588 const struct vgic_register_region *region;
589 struct kvm_vcpu *r_vcpu;
590
591 region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
592 if (!region)
593 return 0;
594
595 r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
596 if (region->uaccess_write)
597 region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
598 else
599 region->write(r_vcpu, addr, sizeof(u32), *val);
600
601 return 0;
602}
603
604/*
605 * Userland access to VGIC registers.
606 */
607int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
608 bool is_write, int offset, u32 *val)
609{
610 if (is_write)
611 return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
612 else
613 return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
614}
615
487static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, 616static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
488 gpa_t addr, int len, void *val) 617 gpa_t addr, int len, void *val)
489{ 618{
@@ -491,9 +620,8 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
491 const struct vgic_register_region *region; 620 const struct vgic_register_region *region;
492 unsigned long data = 0; 621 unsigned long data = 0;
493 622
494 region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, 623 region = vgic_get_mmio_region(vcpu, iodev, addr, len);
495 addr - iodev->base_addr); 624 if (!region) {
496 if (!region || !check_region(vcpu->kvm, region, addr, len)) {
497 memset(val, 0, len); 625 memset(val, 0, len);
498 return 0; 626 return 0;
499 } 627 }
@@ -524,9 +652,8 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
524 const struct vgic_register_region *region; 652 const struct vgic_register_region *region;
525 unsigned long data = vgic_data_mmio_bus_to_host(val, len); 653 unsigned long data = vgic_data_mmio_bus_to_host(val, len);
526 654
527 region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, 655 region = vgic_get_mmio_region(vcpu, iodev, addr, len);
528 addr - iodev->base_addr); 656 if (!region)
529 if (!region || !check_region(vcpu->kvm, region, addr, len))
530 return 0; 657 return 0;
531 658
532 switch (iodev->iodev_type) { 659 switch (iodev->iodev_type) {
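The new line-level helpers pack the levels of 32 consecutive interrupts, starting at a 32-aligned INTID, into one value, skipping SGIs and out-of-range INTIDs; the matching KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO handler in vgic-kvm-device.c copies a 32-bit payload to and from userspace. A rough userspace sketch of reading one block of line levels (hypothetical fd, error handling omitted, macro names taken from the uapi headers this series introduces):

	#include <linux/kvm.h>
	#include <stdint.h>
	#include <sys/ioctl.h>

	/* Bit i of *levels describes interrupt (intid + i); intid must be a multiple of 32. */
	static int get_line_levels(int vgic_fd, uint32_t intid, uint32_t *levels)
	{
		struct kvm_device_attr attr = {
			.group = KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO,
			.attr  = ((uint64_t)VGIC_LEVEL_INFO_LINE_LEVEL <<
				  KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) |
				 (intid & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK),
			.addr  = (uint64_t)(unsigned long)levels,
		};

		return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
	}
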
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 84961b4e4422..98bb566b660a 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -34,6 +34,10 @@ struct vgic_register_region {
34 gpa_t addr, unsigned int len, 34 gpa_t addr, unsigned int len,
35 unsigned long val); 35 unsigned long val);
36 }; 36 };
37 unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
38 unsigned int len);
39 void (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
40 unsigned int len, unsigned long val);
37}; 41};
38 42
39extern struct kvm_io_device_ops kvm_io_gic_ops; 43extern struct kvm_io_device_ops kvm_io_gic_ops;
@@ -86,6 +90,18 @@ extern struct kvm_io_device_ops kvm_io_gic_ops;
86 .write = wr, \ 90 .write = wr, \
87 } 91 }
88 92
93#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
94 { \
95 .reg_offset = off, \
96 .bits_per_irq = 0, \
97 .len = length, \
98 .access_flags = acc, \
99 .read = rd, \
100 .write = wr, \
101 .uaccess_read = urd, \
102 .uaccess_write = uwr, \
103 }
104
89int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu, 105int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu,
90 struct vgic_register_region *reg_desc, 106 struct vgic_register_region *reg_desc,
91 struct vgic_io_device *region, 107 struct vgic_io_device *region,
@@ -158,6 +174,14 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
158 gpa_t addr, unsigned int len, 174 gpa_t addr, unsigned int len,
159 unsigned long val); 175 unsigned long val);
160 176
177int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
178 bool is_write, int offset, u32 *val);
179
180u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
181
182void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
183 const u64 val);
184
161unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); 185unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
162 186
163unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); 187unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 834137e7b83f..b834ecdf3225 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -104,7 +104,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
104 /* Edge is the only case where we preserve the pending bit */ 104 /* Edge is the only case where we preserve the pending bit */
105 if (irq->config == VGIC_CONFIG_EDGE && 105 if (irq->config == VGIC_CONFIG_EDGE &&
106 (val & GICH_LR_PENDING_BIT)) { 106 (val & GICH_LR_PENDING_BIT)) {
107 irq->pending = true; 107 irq->pending_latch = true;
108 108
109 if (vgic_irq_is_sgi(intid)) { 109 if (vgic_irq_is_sgi(intid)) {
110 u32 cpuid = val & GICH_LR_PHYSID_CPUID; 110 u32 cpuid = val & GICH_LR_PHYSID_CPUID;
@@ -120,9 +120,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
120 */ 120 */
121 if (irq->config == VGIC_CONFIG_LEVEL) { 121 if (irq->config == VGIC_CONFIG_LEVEL) {
122 if (!(val & GICH_LR_PENDING_BIT)) 122 if (!(val & GICH_LR_PENDING_BIT))
123 irq->soft_pending = false; 123 irq->pending_latch = false;
124
125 irq->pending = irq->line_level || irq->soft_pending;
126 } 124 }
127 125
128 spin_unlock(&irq->irq_lock); 126 spin_unlock(&irq->irq_lock);
@@ -145,11 +143,11 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
145{ 143{
146 u32 val = irq->intid; 144 u32 val = irq->intid;
147 145
148 if (irq->pending) { 146 if (irq_is_pending(irq)) {
149 val |= GICH_LR_PENDING_BIT; 147 val |= GICH_LR_PENDING_BIT;
150 148
151 if (irq->config == VGIC_CONFIG_EDGE) 149 if (irq->config == VGIC_CONFIG_EDGE)
152 irq->pending = false; 150 irq->pending_latch = false;
153 151
154 if (vgic_irq_is_sgi(irq->intid)) { 152 if (vgic_irq_is_sgi(irq->intid)) {
155 u32 src = ffs(irq->source); 153 u32 src = ffs(irq->source);
@@ -158,7 +156,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
158 val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; 156 val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
159 irq->source &= ~(1 << (src - 1)); 157 irq->source &= ~(1 << (src - 1));
160 if (irq->source) 158 if (irq->source)
161 irq->pending = true; 159 irq->pending_latch = true;
162 } 160 }
163 } 161 }
164 162
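For SGIs, vgic_v2_populate_lr() encodes one source CPU per list-register injection: ffs() picks the lowest pending source, that bit is cleared, and pending_latch is set again if other sources remain so the SGI gets injected once per source. The pick-and-clear step in isolation (illustrative helper):

	/* Returns the source CPU id to encode in the LR, or -1 if no source is pending. */
	static int take_one_sgi_source(u8 *source)
	{
		int src = ffs(*source);	/* 1-based index of the lowest set bit */

		if (!src)
			return -1;

		*source &= ~(1U << (src - 1));
		return src - 1;
	}

	/* e.g. source = 0b0101: the first call returns 0 and leaves 0b0100, so the
	 * interrupt is latched again and a later injection returns 2. */
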
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index e6b03fd8c374..edc6ee2dc852 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -94,7 +94,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
94 /* Edge is the only case where we preserve the pending bit */ 94 /* Edge is the only case where we preserve the pending bit */
95 if (irq->config == VGIC_CONFIG_EDGE && 95 if (irq->config == VGIC_CONFIG_EDGE &&
96 (val & ICH_LR_PENDING_BIT)) { 96 (val & ICH_LR_PENDING_BIT)) {
97 irq->pending = true; 97 irq->pending_latch = true;
98 98
99 if (vgic_irq_is_sgi(intid) && 99 if (vgic_irq_is_sgi(intid) &&
100 model == KVM_DEV_TYPE_ARM_VGIC_V2) { 100 model == KVM_DEV_TYPE_ARM_VGIC_V2) {
@@ -111,9 +111,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
111 */ 111 */
112 if (irq->config == VGIC_CONFIG_LEVEL) { 112 if (irq->config == VGIC_CONFIG_LEVEL) {
113 if (!(val & ICH_LR_PENDING_BIT)) 113 if (!(val & ICH_LR_PENDING_BIT))
114 irq->soft_pending = false; 114 irq->pending_latch = false;
115
116 irq->pending = irq->line_level || irq->soft_pending;
117 } 115 }
118 116
119 spin_unlock(&irq->irq_lock); 117 spin_unlock(&irq->irq_lock);
@@ -127,11 +125,11 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
127 u32 model = vcpu->kvm->arch.vgic.vgic_model; 125 u32 model = vcpu->kvm->arch.vgic.vgic_model;
128 u64 val = irq->intid; 126 u64 val = irq->intid;
129 127
130 if (irq->pending) { 128 if (irq_is_pending(irq)) {
131 val |= ICH_LR_PENDING_BIT; 129 val |= ICH_LR_PENDING_BIT;
132 130
133 if (irq->config == VGIC_CONFIG_EDGE) 131 if (irq->config == VGIC_CONFIG_EDGE)
134 irq->pending = false; 132 irq->pending_latch = false;
135 133
136 if (vgic_irq_is_sgi(irq->intid) && 134 if (vgic_irq_is_sgi(irq->intid) &&
137 model == KVM_DEV_TYPE_ARM_VGIC_V2) { 135 model == KVM_DEV_TYPE_ARM_VGIC_V2) {
@@ -141,7 +139,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
141 val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; 139 val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
142 irq->source &= ~(1 << (src - 1)); 140 irq->source &= ~(1 << (src - 1));
143 if (irq->source) 141 if (irq->source)
144 irq->pending = true; 142 irq->pending_latch = true;
145 } 143 }
146 } 144 }
147 145
@@ -177,10 +175,18 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
177{ 175{
178 u32 vmcr; 176 u32 vmcr;
179 177
180 vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; 178 /*
179 * Ignore the FIQen bit, because GIC emulation always implies
180 * SRE=1 which means the vFIQEn bit is also RES1.
181 */
182 vmcr = ((vmcrp->ctlr >> ICC_CTLR_EL1_EOImode_SHIFT) <<
183 ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
184 vmcr |= (vmcrp->ctlr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
181 vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; 185 vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
182 vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; 186 vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
183 vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; 187 vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
188 vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
189 vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
184 190
185 vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; 191 vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
186} 192}
@@ -189,10 +195,18 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
189{ 195{
190 u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; 196 u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
191 197
192 vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; 198 /*
199 * Ignore the FIQen bit, because GIC emulation always implies
200 * SRE=1 which means the vFIQEn bit is also RES1.
201 */
202 vmcrp->ctlr = ((vmcr >> ICH_VMCR_EOIM_SHIFT) <<
203 ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK;
204 vmcrp->ctlr |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
193 vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; 205 vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
194 vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; 206 vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
195 vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; 207 vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
208 vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
209 vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
196} 210}
197 211
198#define INITIAL_PENDBASER_VALUE \ 212#define INITIAL_PENDBASER_VALUE \
@@ -224,6 +238,13 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
224 vgic_v3->vgic_sre = 0; 238 vgic_v3->vgic_sre = 0;
225 } 239 }
226 240
241 vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
242 ICH_VTR_ID_BITS_MASK) >>
243 ICH_VTR_ID_BITS_SHIFT;
244 vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
245 ICH_VTR_PRI_BITS_MASK) >>
246 ICH_VTR_PRI_BITS_SHIFT) + 1;
247
227 /* Get the show on the road... */ 248 /* Get the show on the road... */
228 vgic_v3->vgic_hcr = ICH_HCR_EN; 249 vgic_v3->vgic_hcr = ICH_HCR_EN;
229} 250}
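
The two fields cached in vgic_v3_enable() above come straight out of ICH_VTR_EL2, the same register vgic_v3_probe() already decodes for the number of list registers. As a rough standalone illustration (plain userspace C, not kernel code), the sketch below decodes an example value the same way; the bit positions for IDbits and PRIbits are restated from the GICv3 architecture as assumptions, since this diff only references the symbolic masks.

#include <stdint.h>
#include <stdio.h>

/* Bit positions assumed from the GICv3 spec; the diff only uses the names. */
#define ICH_VTR_ID_BITS_SHIFT	23
#define ICH_VTR_ID_BITS_MASK	(0x7U << ICH_VTR_ID_BITS_SHIFT)
#define ICH_VTR_PRI_BITS_SHIFT	29
#define ICH_VTR_PRI_BITS_MASK	(0x7U << ICH_VTR_PRI_BITS_SHIFT)

int main(void)
{
	/* Example value: PRIbits field = 4 (i.e. 5 priority bits), 16 list registers. */
	uint32_t ich_vtr_el2 = (4U << ICH_VTR_PRI_BITS_SHIFT) | 0xf;

	/* Stored raw, as in vgic_v3_enable(); 0 encodes 16-bit INTIDs. */
	uint32_t num_id_bits = (ich_vtr_el2 & ICH_VTR_ID_BITS_MASK) >>
			       ICH_VTR_ID_BITS_SHIFT;
	/* The field is "bits minus one", hence the + 1. */
	uint32_t num_pri_bits = ((ich_vtr_el2 & ICH_VTR_PRI_BITS_MASK) >>
				 ICH_VTR_PRI_BITS_SHIFT) + 1;
	uint32_t nr_lr = (ich_vtr_el2 & 0xf) + 1;	/* as in vgic_v3_probe() */

	printf("IDbits field=%u, priority bits=%u, list registers=%u\n",
	       num_id_bits, num_pri_bits, nr_lr);
	return 0;
}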
@@ -322,6 +343,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
322 */ 343 */
323 kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; 344 kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
324 kvm_vgic_global_state.can_emulate_gicv2 = false; 345 kvm_vgic_global_state.can_emulate_gicv2 = false;
346 kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
325 347
326 if (!info->vcpu.start) { 348 if (!info->vcpu.start) {
327 kvm_info("GICv3: no GICV resource entry\n"); 349 kvm_info("GICv3: no GICV resource entry\n");
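
The vgic_v3_set_vmcr()/vgic_v3_get_vmcr() hunks above stop copying the guest's ctlr wholesale and instead move the EOImode and CBPR bits individually between the ICC_CTLR_EL1 layout and the ICH_VMCR layout (FIQEn is deliberately ignored, as the new comments explain). The standalone sketch below models just that repacking and checks that it round-trips; it is not kernel code, and the shift values are restated from the GICv3 spec as assumptions rather than taken from this diff.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed layouts: CBPR is ICC_CTLR_EL1[0], EOImode is ICC_CTLR_EL1[1],
 * while ICH_VMCR keeps CBPR at bit 4 and EOIM at bit 9. */
#define ICC_CTLR_EL1_EOImode_SHIFT	1
#define ICC_CTLR_EL1_EOImode_MASK	(1U << ICC_CTLR_EL1_EOImode_SHIFT)
#define ICH_VMCR_CBPR_SHIFT		4
#define ICH_VMCR_CBPR_MASK		(1U << ICH_VMCR_CBPR_SHIFT)
#define ICH_VMCR_EOIM_SHIFT		9
#define ICH_VMCR_EOIM_MASK		(1U << ICH_VMCR_EOIM_SHIFT)

static uint32_t ctlr_to_vmcr(uint32_t ctlr)
{
	uint32_t vmcr;

	/* EOImode: ICC_CTLR_EL1[1] -> ICH_VMCR[9], as in vgic_v3_set_vmcr(). */
	vmcr = ((ctlr >> ICC_CTLR_EL1_EOImode_SHIFT) << ICH_VMCR_EOIM_SHIFT) &
	       ICH_VMCR_EOIM_MASK;
	/* CBPR: ICC_CTLR_EL1[0] -> ICH_VMCR[4]. */
	vmcr |= (ctlr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
	return vmcr;
}

static uint32_t vmcr_to_ctlr(uint32_t vmcr)
{
	uint32_t ctlr;

	ctlr = ((vmcr >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT) &
	       ICC_CTLR_EL1_EOImode_MASK;
	ctlr |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
	return ctlr;
}

int main(void)
{
	/* All four combinations of CBPR/EOImode survive the round trip. */
	for (uint32_t ctlr = 0; ctlr < 4; ctlr++)
		assert(vmcr_to_ctlr(ctlr_to_vmcr(ctlr)) == ctlr);
	puts("ctlr <-> vmcr repacking round-trips");
	return 0;
}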
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 6440b56ec90e..654dfd40e449 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -160,7 +160,7 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
160 * If the distributor is disabled, pending interrupts shouldn't be 160 * If the distributor is disabled, pending interrupts shouldn't be
161 * forwarded. 161 * forwarded.
162 */ 162 */
163 if (irq->enabled && irq->pending) { 163 if (irq->enabled && irq_is_pending(irq)) {
164 if (unlikely(irq->target_vcpu && 164 if (unlikely(irq->target_vcpu &&
165 !irq->target_vcpu->kvm->arch.vgic.enabled)) 165 !irq->target_vcpu->kvm->arch.vgic.enabled))
166 return NULL; 166 return NULL;
@@ -204,8 +204,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 penda = irqa->enabled && irqa->pending; 207 penda = irqa->enabled && irq_is_pending(irqa);
208 pendb = irqb->enabled && irqb->pending; 208 pendb = irqb->enabled && irq_is_pending(irqb);
209 209
210 if (!penda || !pendb) { 210 if (!penda || !pendb) {
211 ret = (int)pendb - (int)penda; 211 ret = (int)pendb - (int)penda;
@@ -335,9 +335,22 @@ retry:
335 return true; 335 return true;
336} 336}
337 337
338static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, 338/**
339 unsigned int intid, bool level, 339 * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
340 bool mapped_irq) 340 * @kvm: The VM structure pointer
341 * @cpuid: The CPU for PPIs
342 * @intid: The INTID to inject a new state to.
343 * @level: Edge-triggered: true: to trigger the interrupt
344 * false: to ignore the call
345 * Level-sensitive true: raise the input signal
346 * false: lower the input signal
347 *
348 * The VGIC is not concerned with devices being active-LOW or active-HIGH for
349 * level-sensitive interrupts. You can think of the level parameter as 1
350 * being HIGH and 0 being LOW and all devices being active-HIGH.
351 */
352int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
353 bool level)
341{ 354{
342 struct kvm_vcpu *vcpu; 355 struct kvm_vcpu *vcpu;
343 struct vgic_irq *irq; 356 struct vgic_irq *irq;
@@ -357,11 +370,6 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
357 if (!irq) 370 if (!irq)
358 return -EINVAL; 371 return -EINVAL;
359 372
360 if (irq->hw != mapped_irq) {
361 vgic_put_irq(kvm, irq);
362 return -EINVAL;
363 }
364
365 spin_lock(&irq->irq_lock); 373 spin_lock(&irq->irq_lock);
366 374
367 if (!vgic_validate_injection(irq, level)) { 375 if (!vgic_validate_injection(irq, level)) {
@@ -371,12 +379,10 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
371 return 0; 379 return 0;
372 } 380 }
373 381
374 if (irq->config == VGIC_CONFIG_LEVEL) { 382 if (irq->config == VGIC_CONFIG_LEVEL)
375 irq->line_level = level; 383 irq->line_level = level;
376 irq->pending = level || irq->soft_pending; 384 else
377 } else { 385 irq->pending_latch = true;
378 irq->pending = true;
379 }
380 386
381 vgic_queue_irq_unlock(kvm, irq); 387 vgic_queue_irq_unlock(kvm, irq);
382 vgic_put_irq(kvm, irq); 388 vgic_put_irq(kvm, irq);
@@ -384,32 +390,6 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
384 return 0; 390 return 0;
385} 391}
386 392
387/**
388 * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
389 * @kvm: The VM structure pointer
390 * @cpuid: The CPU for PPIs
391 * @intid: The INTID to inject a new state to.
392 * @level: Edge-triggered: true: to trigger the interrupt
393 * false: to ignore the call
394 * Level-sensitive true: raise the input signal
395 * false: lower the input signal
396 *
397 * The VGIC is not concerned with devices being active-LOW or active-HIGH for
398 * level-sensitive interrupts. You can think of the level parameter as 1
399 * being HIGH and 0 being LOW and all devices being active-HIGH.
400 */
401int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
402 bool level)
403{
404 return vgic_update_irq_pending(kvm, cpuid, intid, level, false);
405}
406
407int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
408 bool level)
409{
410 return vgic_update_irq_pending(kvm, cpuid, intid, level, true);
411}
412
413int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) 393int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
414{ 394{
415 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); 395 struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
@@ -689,7 +669,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
689 669
690 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { 670 list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
691 spin_lock(&irq->irq_lock); 671 spin_lock(&irq->irq_lock);
692 pending = irq->pending && irq->enabled; 672 pending = irq_is_pending(irq) && irq->enabled;
693 spin_unlock(&irq->irq_lock); 673 spin_unlock(&irq->irq_lock);
694 674
695 if (pending) 675 if (pending)
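
The vgic.c changes above are where the interrupt-state rework becomes visible to callers: the old irq->pending field is gone, kvm_vgic_inject_irq() now either latches an edge trigger or records the current line level, and every reader goes through irq_is_pending() (defined in vgic.h further down), which ORs pending_latch with line_level for level-sensitive interrupts. The toy model below is plain userspace C with names that only loosely mirror the vgic structures; it is meant to show the intended semantics, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

enum toy_config { TOY_CONFIG_EDGE, TOY_CONFIG_LEVEL };

struct toy_irq {
	enum toy_config config;
	bool line_level;	/* current state of a level-sensitive input */
	bool pending_latch;	/* latched edge trigger / software pending */
};

/* Mirrors the level handling in kvm_vgic_inject_irq() after this series. */
static void toy_inject(struct toy_irq *irq, bool level)
{
	if (irq->config == TOY_CONFIG_LEVEL)
		irq->line_level = level;
	else if (level)
		irq->pending_latch = true;	/* level == false is ignored for edge */
}

/* Mirrors the irq_is_pending() helper added to vgic.h. */
static bool toy_is_pending(const struct toy_irq *irq)
{
	if (irq->config == TOY_CONFIG_EDGE)
		return irq->pending_latch;
	return irq->pending_latch || irq->line_level;
}

int main(void)
{
	struct toy_irq edge = { .config = TOY_CONFIG_EDGE };
	struct toy_irq lvl  = { .config = TOY_CONFIG_LEVEL };

	toy_inject(&edge, true);	/* trigger the edge interrupt */
	toy_inject(&lvl, true);		/* raise the level-sensitive line */
	printf("edge pending=%d level pending=%d\n",
	       toy_is_pending(&edge), toy_is_pending(&lvl));

	toy_inject(&lvl, false);	/* lower the line again */
	printf("level pending after lowering=%d\n", toy_is_pending(&lvl));
	return 0;
}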
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 859f65c6e056..db28f7cadab2 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -30,13 +30,79 @@
30 30
31#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) 31#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
32 32
33#define VGIC_AFFINITY_0_SHIFT 0
34#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
35#define VGIC_AFFINITY_1_SHIFT 8
36#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
37#define VGIC_AFFINITY_2_SHIFT 16
38#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
39#define VGIC_AFFINITY_3_SHIFT 24
40#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
41
42#define VGIC_AFFINITY_LEVEL(reg, level) \
43 ((((reg) & VGIC_AFFINITY_## level ##_MASK) \
44 >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
45
46/*
47 * Userspace encodes the affinity differently from the MPIDR;
48 * the macro below converts the vgic userspace format to the MPIDR register format.
49 */
50#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
51 VGIC_AFFINITY_LEVEL(val, 1) | \
52 VGIC_AFFINITY_LEVEL(val, 2) | \
53 VGIC_AFFINITY_LEVEL(val, 3))
54
55/*
56 * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt,
57 * below macros are defined for CPUREG encoding.
58 */
59#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000
60#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14
61#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800
62#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11
63#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780
64#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7
65#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078
66#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3
67#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007
68#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0
69
70#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
71 KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
72 KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
73 KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
74 KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
75
76static inline bool irq_is_pending(struct vgic_irq *irq)
77{
78 if (irq->config == VGIC_CONFIG_EDGE)
79 return irq->pending_latch;
80 else
81 return irq->pending_latch || irq->line_level;
82}
83
33struct vgic_vmcr { 84struct vgic_vmcr {
34 u32 ctlr; 85 u32 ctlr;
35 u32 abpr; 86 u32 abpr;
36 u32 bpr; 87 u32 bpr;
37 u32 pmr; 88 u32 pmr;
89 /* The members below are valid only for GICv3 */
90 u32 grpen0;
91 u32 grpen1;
92};
93
94struct vgic_reg_attr {
95 struct kvm_vcpu *vcpu;
96 gpa_t addr;
38}; 97};
39 98
99int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
100 struct vgic_reg_attr *reg_attr);
101int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
102 struct vgic_reg_attr *reg_attr);
103const struct vgic_register_region *
104vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
105 gpa_t addr, int len);
40struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, 106struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
41 u32 intid); 107 u32 intid);
42void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); 108void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
@@ -89,9 +155,24 @@ bool vgic_has_its(struct kvm *kvm);
89int kvm_vgic_register_its_device(void); 155int kvm_vgic_register_its_device(void);
90void vgic_enable_lpis(struct kvm_vcpu *vcpu); 156void vgic_enable_lpis(struct kvm_vcpu *vcpu);
91int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); 157int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
92 158int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
159int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
160 int offset, u32 *val);
161int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
162 int offset, u32 *val);
163int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
164 u64 id, u64 *val);
165int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
166 u64 *reg);
167int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
168 u32 intid, u64 *val);
93int kvm_register_vgic_device(unsigned long type); 169int kvm_register_vgic_device(unsigned long type);
170void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
171void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
94int vgic_lazy_init(struct kvm *kvm); 172int vgic_lazy_init(struct kvm *kvm);
95int vgic_init(struct kvm *kvm); 173int vgic_init(struct kvm *kvm);
96 174
175int vgic_debug_init(struct kvm *kvm);
176int vgic_debug_destroy(struct kvm *kvm);
177
97#endif 178#endif
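
Among the vgic.h additions above, VGIC_TO_MPIDR() is the one that benefits from a worked example: userspace hands the affinity packed into the low four bytes of a 32-bit value, whereas the architectural MPIDR keeps Aff3 up at bits [39:32]. The sketch below replays the conversion in plain userspace C; MPIDR_LEVEL_SHIFT() is restated here from the arm64 headers purely for illustration and should be treated as an assumption.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed arm64 helper: Aff0/1/2 sit at bits 0/8/16, Aff3 at bit 32. */
#define MPIDR_LEVEL_SHIFT(level)	((((1) << (level)) >> 1) << 3)

/* Userspace format from the diff: one byte per affinity level. */
#define AFF_SHIFT(level)	((level) * 8)
#define AFF_MASK(level)		(0xffULL << AFF_SHIFT(level))

#define AFFINITY_LEVEL(reg, level) \
	((((reg) & AFF_MASK(level)) >> AFF_SHIFT(level)) << MPIDR_LEVEL_SHIFT(level))

static uint64_t to_mpidr(uint32_t val)
{
	return AFFINITY_LEVEL(val, 0) | AFFINITY_LEVEL(val, 1) |
	       AFFINITY_LEVEL(val, 2) | AFFINITY_LEVEL(val, 3);
}

int main(void)
{
	/* Aff3=0x01, Aff2=0x02, Aff1=0x03, Aff0=0x04 in the userspace encoding. */
	uint32_t packed = 0x01020304;

	/* Expect Aff3 to move to bits [39:32]: 0x0100020304. */
	printf("MPIDR = 0x%010" PRIx64 "\n", to_mpidr(packed));
	return 0;
}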
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482612b4e496..cc4d6e0dd2a2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -506,11 +506,6 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
506 if (!slots) 506 if (!slots)
507 return NULL; 507 return NULL;
508 508
509 /*
510 * Init kvm generation close to the maximum to easily test the
511 * code of handling generation number wrap-around.
512 */
513 slots->generation = -150;
514 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 509 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
515 slots->id_to_index[i] = slots->memslots[i].id = i; 510 slots->id_to_index[i] = slots->memslots[i].id = i;
516 511
@@ -641,9 +636,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
641 636
642 r = -ENOMEM; 637 r = -ENOMEM;
643 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 638 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
644 kvm->memslots[i] = kvm_alloc_memslots(); 639 struct kvm_memslots *slots = kvm_alloc_memslots();
645 if (!kvm->memslots[i]) 640 if (!slots)
646 goto out_err_no_srcu; 641 goto out_err_no_srcu;
642 /*
643 * Generations must be different for each address space.
644 * Init kvm generation close to the maximum to easily test the
645 * code of handling generation number wrap-around.
646 */
647 slots->generation = i * 2 - 150;
648 rcu_assign_pointer(kvm->memslots[i], slots);
647 } 649 }
648 650
649 if (init_srcu_struct(&kvm->srcu)) 651 if (init_srcu_struct(&kvm->srcu))
@@ -870,8 +872,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
870 * Increment the new memslot generation a second time. This prevents 872 * Increment the new memslot generation a second time. This prevents
871 * vm exits that race with memslot updates from caching a memslot 873 * vm exits that race with memslot updates from caching a memslot
872 * generation that will (potentially) be valid forever. 874 * generation that will (potentially) be valid forever.
875 *
876 * Generations must be unique even across address spaces. We do not need
877 * a global counter for that; instead, the generation space is evenly split
878 * across address spaces. For example, with two address spaces, address
879 * space 0 will use generations 0, 4, 8, ... while address space 1 will
880 * use generations 2, 6, 10, 14, ...
873 */ 881 */
874 slots->generation++; 882 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
875 883
876 kvm_arch_memslots_updated(kvm, slots); 884 kvm_arch_memslots_updated(kvm, slots);
877 885
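
The generation arithmetic above is worth spelling out: kvm_alloc_memslots() no longer seeds the generation itself, kvm_create_vm() starts each address space at i * 2 - 150, and install_new_memslots() replaces its final "++" with "+= KVM_ADDRESS_SPACE_NUM * 2 - 1". Together with the earlier increment that the "a second time" comment refers to, every update advances a slot set's generation by 2 * KVM_ADDRESS_SPACE_NUM, so the address spaces stay in disjoint residue classes and a cached generation can never match a slot set from another space. The sketch below just replays that arithmetic in plain C; KVM_ADDRESS_SPACE_NUM = 2 is an assumption (the x86 configuration).

#include <stdio.h>

#define KVM_ADDRESS_SPACE_NUM 2	/* assumed; arch-dependent in the kernel */

int main(void)
{
	for (int as = 0; as < KVM_ADDRESS_SPACE_NUM; as++) {
		long gen = as * 2 - 150;	/* initial value from kvm_create_vm() */

		printf("address space %d:", as);
		for (int update = 0; update < 6; update++) {
			gen += 1;				/* bump while the update is in flight */
			gen += KVM_ADDRESS_SPACE_NUM * 2 - 1;	/* second bump from install_new_memslots() */
			printf(" %ld", gen);
		}
		/* Each address space stays in its own residue class mod 4. */
		printf("   (mod 4 = %ld)\n", ((gen % 4) + 4) % 4);
	}
	return 0;
}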
@@ -1094,37 +1102,31 @@ int kvm_get_dirty_log(struct kvm *kvm,
1094{ 1102{
1095 struct kvm_memslots *slots; 1103 struct kvm_memslots *slots;
1096 struct kvm_memory_slot *memslot; 1104 struct kvm_memory_slot *memslot;
1097 int r, i, as_id, id; 1105 int i, as_id, id;
1098 unsigned long n; 1106 unsigned long n;
1099 unsigned long any = 0; 1107 unsigned long any = 0;
1100 1108
1101 r = -EINVAL;
1102 as_id = log->slot >> 16; 1109 as_id = log->slot >> 16;
1103 id = (u16)log->slot; 1110 id = (u16)log->slot;
1104 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1111 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1105 goto out; 1112 return -EINVAL;
1106 1113
1107 slots = __kvm_memslots(kvm, as_id); 1114 slots = __kvm_memslots(kvm, as_id);
1108 memslot = id_to_memslot(slots, id); 1115 memslot = id_to_memslot(slots, id);
1109 r = -ENOENT;
1110 if (!memslot->dirty_bitmap) 1116 if (!memslot->dirty_bitmap)
1111 goto out; 1117 return -ENOENT;
1112 1118
1113 n = kvm_dirty_bitmap_bytes(memslot); 1119 n = kvm_dirty_bitmap_bytes(memslot);
1114 1120
1115 for (i = 0; !any && i < n/sizeof(long); ++i) 1121 for (i = 0; !any && i < n/sizeof(long); ++i)
1116 any = memslot->dirty_bitmap[i]; 1122 any = memslot->dirty_bitmap[i];
1117 1123
1118 r = -EFAULT;
1119 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1124 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1120 goto out; 1125 return -EFAULT;
1121 1126
1122 if (any) 1127 if (any)
1123 *is_dirty = 1; 1128 *is_dirty = 1;
1124 1129 return 0;
1125 r = 0;
1126out:
1127 return r;
1128} 1130}
1129EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1131EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1130 1132
@@ -1156,24 +1158,22 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1156{ 1158{
1157 struct kvm_memslots *slots; 1159 struct kvm_memslots *slots;
1158 struct kvm_memory_slot *memslot; 1160 struct kvm_memory_slot *memslot;
1159 int r, i, as_id, id; 1161 int i, as_id, id;
1160 unsigned long n; 1162 unsigned long n;
1161 unsigned long *dirty_bitmap; 1163 unsigned long *dirty_bitmap;
1162 unsigned long *dirty_bitmap_buffer; 1164 unsigned long *dirty_bitmap_buffer;
1163 1165
1164 r = -EINVAL;
1165 as_id = log->slot >> 16; 1166 as_id = log->slot >> 16;
1166 id = (u16)log->slot; 1167 id = (u16)log->slot;
1167 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1168 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1168 goto out; 1169 return -EINVAL;
1169 1170
1170 slots = __kvm_memslots(kvm, as_id); 1171 slots = __kvm_memslots(kvm, as_id);
1171 memslot = id_to_memslot(slots, id); 1172 memslot = id_to_memslot(slots, id);
1172 1173
1173 dirty_bitmap = memslot->dirty_bitmap; 1174 dirty_bitmap = memslot->dirty_bitmap;
1174 r = -ENOENT;
1175 if (!dirty_bitmap) 1175 if (!dirty_bitmap)
1176 goto out; 1176 return -ENOENT;
1177 1177
1178 n = kvm_dirty_bitmap_bytes(memslot); 1178 n = kvm_dirty_bitmap_bytes(memslot);
1179 1179
@@ -1202,14 +1202,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1202 } 1202 }
1203 1203
1204 spin_unlock(&kvm->mmu_lock); 1204 spin_unlock(&kvm->mmu_lock);
1205
1206 r = -EFAULT;
1207 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1205 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1208 goto out; 1206 return -EFAULT;
1209 1207 return 0;
1210 r = 0;
1211out:
1212 return r;
1213} 1208}
1214EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1209EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1215#endif 1210#endif
@@ -1937,10 +1932,10 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
1937} 1932}
1938EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1933EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
1939 1934
1940int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1935static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
1941 gpa_t gpa, unsigned long len) 1936 struct gfn_to_hva_cache *ghc,
1937 gpa_t gpa, unsigned long len)
1942{ 1938{
1943 struct kvm_memslots *slots = kvm_memslots(kvm);
1944 int offset = offset_in_page(gpa); 1939 int offset = offset_in_page(gpa);
1945 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1940 gfn_t start_gfn = gpa >> PAGE_SHIFT;
1946 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1941 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
@@ -1950,7 +1945,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1950 ghc->gpa = gpa; 1945 ghc->gpa = gpa;
1951 ghc->generation = slots->generation; 1946 ghc->generation = slots->generation;
1952 ghc->len = len; 1947 ghc->len = len;
1953 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1948 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
1954 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1949 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
1955 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1950 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
1956 ghc->hva += offset; 1951 ghc->hva += offset;
@@ -1960,7 +1955,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1960 * verify that the entire region is valid here. 1955 * verify that the entire region is valid here.
1961 */ 1956 */
1962 while (start_gfn <= end_gfn) { 1957 while (start_gfn <= end_gfn) {
1963 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1958 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
1964 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1959 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
1965 &nr_pages_avail); 1960 &nr_pages_avail);
1966 if (kvm_is_error_hva(ghc->hva)) 1961 if (kvm_is_error_hva(ghc->hva))
@@ -1972,22 +1967,29 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1972 } 1967 }
1973 return 0; 1968 return 0;
1974} 1969}
1975EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1976 1970
1977int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1971int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
1978 void *data, int offset, unsigned long len) 1972 gpa_t gpa, unsigned long len)
1979{ 1973{
1980 struct kvm_memslots *slots = kvm_memslots(kvm); 1974 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
1975 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
1976}
1977EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init);
1978
1979int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
1980 void *data, int offset, unsigned long len)
1981{
1982 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
1981 int r; 1983 int r;
1982 gpa_t gpa = ghc->gpa + offset; 1984 gpa_t gpa = ghc->gpa + offset;
1983 1985
1984 BUG_ON(len + offset > ghc->len); 1986 BUG_ON(len + offset > ghc->len);
1985 1987
1986 if (slots->generation != ghc->generation) 1988 if (slots->generation != ghc->generation)
1987 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1989 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
1988 1990
1989 if (unlikely(!ghc->memslot)) 1991 if (unlikely(!ghc->memslot))
1990 return kvm_write_guest(kvm, gpa, data, len); 1992 return kvm_vcpu_write_guest(vcpu, gpa, data, len);
1991 1993
1992 if (kvm_is_error_hva(ghc->hva)) 1994 if (kvm_is_error_hva(ghc->hva))
1993 return -EFAULT; 1995 return -EFAULT;
@@ -1999,28 +2001,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1999 2001
2000 return 0; 2002 return 0;
2001} 2003}
2002EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2004EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached);
2003 2005
2004int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2006int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
2005 void *data, unsigned long len) 2007 void *data, unsigned long len)
2006{ 2008{
2007 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2009 return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len);
2008} 2010}
2009EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2011EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached);
2010 2012
2011int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2013int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
2012 void *data, unsigned long len) 2014 void *data, unsigned long len)
2013{ 2015{
2014 struct kvm_memslots *slots = kvm_memslots(kvm); 2016 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2015 int r; 2017 int r;
2016 2018
2017 BUG_ON(len > ghc->len); 2019 BUG_ON(len > ghc->len);
2018 2020
2019 if (slots->generation != ghc->generation) 2021 if (slots->generation != ghc->generation)
2020 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 2022 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2021 2023
2022 if (unlikely(!ghc->memslot)) 2024 if (unlikely(!ghc->memslot))
2023 return kvm_read_guest(kvm, ghc->gpa, data, len); 2025 return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len);
2024 2026
2025 if (kvm_is_error_hva(ghc->hva)) 2027 if (kvm_is_error_hva(ghc->hva))
2026 return -EFAULT; 2028 return -EFAULT;
@@ -2031,7 +2033,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2031 2033
2032 return 0; 2034 return 0;
2033} 2035}
2034EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2036EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached);
2035 2037
2036int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2038int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
2037{ 2039{
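
The kvm_main.c hunks above turn the gfn->hva cache helpers into vCPU-based variants: __kvm_gfn_to_hva_cache_init() resolves the slot from an explicit kvm_memslots, and the exported entry points now take a kvm_vcpu and use kvm_vcpu_memslots(), so the cache follows the vCPU's current address space and is revalidated whenever slots->generation changes. A hedged sketch of how a caller might use the new API follows; my_feature_init(), my_feature_tick() and struct my_shared_page are hypothetical names invented for illustration, and only the kvm_vcpu_* calls and their signatures come from this diff.

#include <linux/kvm_host.h>
#include <linux/stddef.h>

/* Hypothetical guest-shared layout, for illustration only. */
struct my_shared_page {
	u64 counter;
};

/* Resolve the hva once; later accesses revalidate against slots->generation. */
static int my_feature_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
			   gpa_t gpa)
{
	return kvm_vcpu_gfn_to_hva_cache_init(vcpu, ghc, gpa,
					      sizeof(struct my_shared_page));
}

/* Write one field through the cache in the vCPU's current address space. */
static int my_feature_tick(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
			   u64 val)
{
	return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, &val,
			offsetof(struct my_shared_page, counter), sizeof(val));
}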
@@ -3133,10 +3135,9 @@ static long kvm_vm_compat_ioctl(struct file *filp,
3133 struct compat_kvm_dirty_log compat_log; 3135 struct compat_kvm_dirty_log compat_log;
3134 struct kvm_dirty_log log; 3136 struct kvm_dirty_log log;
3135 3137
3136 r = -EFAULT;
3137 if (copy_from_user(&compat_log, (void __user *)arg, 3138 if (copy_from_user(&compat_log, (void __user *)arg,
3138 sizeof(compat_log))) 3139 sizeof(compat_log)))
3139 goto out; 3140 return -EFAULT;
3140 log.slot = compat_log.slot; 3141 log.slot = compat_log.slot;
3141 log.padding1 = compat_log.padding1; 3142 log.padding1 = compat_log.padding1;
3142 log.padding2 = compat_log.padding2; 3143 log.padding2 = compat_log.padding2;
@@ -3148,8 +3149,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
3148 default: 3149 default:
3149 r = kvm_vm_ioctl(filp, ioctl, arg); 3150 r = kvm_vm_ioctl(filp, ioctl, arg);
3150 } 3151 }
3151
3152out:
3153 return r; 3152 return r;
3154} 3153}
3155#endif 3154#endif