author     Linus Torvalds <torvalds@linux-foundation.org>  2019-03-15 18:00:28 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-03-15 18:00:28 -0400
commit     636deed6c0bc137a7c4f4a97ae1fcf0ad75323da (patch)
tree       7bd27189b8e30e3c1466f7730831a08db65f8646
parent     aa2e3ac64ace127f403be85aa4d6015b859385f2 (diff)
parent     4a605bc08e98381d8df61c30a4acb2eac15eb7da (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini:
 "ARM:
   - some cleanups
   - direct physical timer assignment
   - cache sanitization for 32-bit guests

  s390:
   - interrupt cleanup
   - introduction of the Guest Information Block
   - preparation for processor subfunctions in cpu models

  PPC:
   - bug fixes and improvements, especially related to machine checks
     and protection keys

  x86:
   - many, many cleanups, including removing a bunch of MMU code for
     unnecessary optimizations
   - AVIC fixes

  Generic:
   - memcg accounting"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (147 commits)
  kvm: vmx: fix formatting of a comment
  KVM: doc: Document the life cycle of a VM and its resources
  MAINTAINERS: Add KVM selftests to existing KVM entry
  Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()"
  KVM: PPC: Book3S: Add count cache flush parameters to kvmppc_get_cpu_char()
  KVM: PPC: Fix compilation when KVM is not enabled
  KVM: Minor cleanups for kvm_main.c
  KVM: s390: add debug logging for cpu model subfunctions
  KVM: s390: implement subfunction processor calls
  arm64: KVM: Fix architecturally invalid reset value for FPEXC32_EL2
  KVM: arm/arm64: Remove unused timer variable
  KVM: PPC: Book3S: Improve KVM reference counting
  KVM: PPC: Book3S HV: Fix build failure without IOMMU support
  Revert "KVM: Eliminate extra function calls in kvm_get_dirty_log_protect()"
  x86: kvmguest: use TSC clocksource if invariant TSC is exposed
  KVM: Never start grow vCPU halt_poll_ns from value below halt_poll_ns_grow_start
  KVM: Expose the initial start value in grow_halt_poll_ns() as a module parameter
  KVM: grow_halt_poll_ns() should never shrink vCPU halt_poll_ns
  KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes()
  KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt | 17
-rw-r--r--  Documentation/virtual/kvm/halt-polling.txt | 37
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 41
-rw-r--r--  MAINTAINERS | 19
-rw-r--r--  arch/arm/include/asm/arch_gicv3.h | 4
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 8
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 53
-rw-r--r--  arch/arm/include/asm/kvm_hyp.h | 4
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 9
-rw-r--r--  arch/arm/kvm/Makefile | 5
-rw-r--r--  arch/arm/kvm/coproc.c | 23
-rw-r--r--  arch/arm/kvm/hyp/cp15-sr.c | 1
-rw-r--r--  arch/arm/kvm/hyp/hyp-entry.S | 2
-rw-r--r--  arch/arm/kvm/hyp/switch.c | 2
-rw-r--r--  arch/arm/kvm/hyp/tlb.c | 4
-rw-r--r--  arch/arm/kvm/interrupts.S | 4
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 12
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 48
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 3
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 13
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 7
-rw-r--r--  arch/arm64/kvm/Makefile | 4
-rw-r--r--  arch/arm64/kvm/debug.c | 2
-rw-r--r--  arch/arm64/kvm/hyp.S | 3
-rw-r--r--  arch/arm64/kvm/hyp/hyp-entry.S | 12
-rw-r--r--  arch/arm64/kvm/hyp/sysreg-sr.c | 1
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 168
-rw-r--r--  arch/mips/include/asm/kvm_host.h | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 5
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 14
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 2
-rw-r--r--  arch/powerpc/kvm/book3s.c | 13
-rw-r--r--  arch/powerpc/kvm/book3s_32_mmu.c | 1
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 14
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 18
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_radix.c | 15
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 18
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 33
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c | 14
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c | 7
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10
-rw-r--r--  arch/powerpc/kvm/book3s_rtas.c | 8
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 22
-rw-r--r--  arch/s390/include/asm/cio.h | 1
-rw-r--r--  arch/s390/include/asm/irq.h | 1
-rw-r--r--  arch/s390/include/asm/isc.h | 1
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 39
-rw-r--r--  arch/s390/kernel/irq.c | 1
-rw-r--r--  arch/s390/kvm/interrupt.c | 431
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 190
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 4
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 42
-rw-r--r--  arch/x86/include/asm/kvm_vcpu_regs.h | 25
-rw-r--r--  arch/x86/kernel/kvmclock.c | 20
-rw-r--r--  arch/x86/kvm/cpuid.c | 2
-rw-r--r--  arch/x86/kvm/hyperv.c | 2
-rw-r--r--  arch/x86/kvm/i8254.c | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 2
-rw-r--r--  arch/x86/kvm/ioapic.c | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 7
-rw-r--r--  arch/x86/kvm/mmu.c | 466
-rw-r--r--  arch/x86/kvm/mmu.h | 1
-rw-r--r--  arch/x86/kvm/mmutrace.h | 42
-rw-r--r--  arch/x86/kvm/page_track.c | 2
-rw-r--r--  arch/x86/kvm/svm.c | 120
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 129
-rw-r--r--  arch/x86/kvm/vmx/vmcs.h | 1
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 167
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 188
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 20
-rw-r--r--  arch/x86/kvm/x86.c | 32
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  drivers/clocksource/arm_arch_timer.c | 11
-rw-r--r--  drivers/s390/cio/chsc.c | 37
-rw-r--r--  drivers/s390/cio/chsc.h | 1
-rw-r--r--  include/clocksource/arm_arch_timer.h | 1
-rw-r--r--  include/kvm/arm_arch_timer.h | 68
-rw-r--r--  include/linux/kvm_host.h | 24
-rw-r--r--  tools/testing/selftests/kvm/.gitignore | 1
-rw-r--r--  tools/testing/selftests/kvm/Makefile | 1
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c | 95
-rw-r--r--  virt/kvm/arm/arch_timer.c | 608
-rw-r--r--  virt/kvm/arm/arm.c | 64
-rw-r--r--  virt/kvm/arm/hyp/vgic-v3-sr.c | 2
-rw-r--r--  virt/kvm/arm/mmu.c | 20
-rw-r--r--  virt/kvm/arm/trace.h | 107
-rw-r--r--  virt/kvm/arm/vgic/vgic-v3.c | 4
-rw-r--r--  virt/kvm/coalesced_mmio.c | 3
-rw-r--r--  virt/kvm/eventfd.c | 7
-rw-r--r--  virt/kvm/irqchip.c | 4
-rw-r--r--  virt/kvm/kvm_main.c | 103
-rw-r--r--  virt/kvm/vfio.c | 4
93 files changed, 2623 insertions, 1199 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 356156f5c52d..7de9eee73fcd 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -45,6 +45,23 @@ the API. The only supported use is one virtual machine per process,
 and one vcpu per thread.
 
 
+It is important to note that althought VM ioctls may only be issued from
+the process that created the VM, a VM's lifecycle is associated with its
+file descriptor, not its creator (process). In other words, the VM and
+its resources, *including the associated address space*, are not freed
+until the last reference to the VM's file descriptor has been released.
+For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will
+not be freed until both the parent (original) process and its child have
+put their references to the VM's file descriptor.
+
+Because a VM's resources are not freed until the last reference to its
+file descriptor is released, creating additional references to a VM via
+via fork(), dup(), etc... without careful consideration is strongly
+discouraged and may have unwanted side effects, e.g. memory allocated
+by and on behalf of the VM's process may not be freed/unaccounted when
+the VM is shut down.
+
+
 3. Extensions
 -------------
 
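
The lifecycle rule documented above is easy to demonstrate from userspace. The
sketch below is hypothetical and not part of the patch; it only assumes the
standard /dev/kvm device and the KVM_CREATE_VM ioctl, and shows that a VM
created before fork() stays alive until both the parent and the child have
closed their copies of the VM file descriptor.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return EXIT_FAILURE;
	}

	int vm = ioctl(kvm, KVM_CREATE_VM, 0);	/* type 0: default VM */
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return EXIT_FAILURE;
	}

	pid_t pid = fork();
	if (pid == 0) {
		/*
		 * Child: inherits a copy of the VM fd.  Even after the
		 * parent's close() below, the VM stays alive until this
		 * reference is dropped.
		 */
		sleep(1);
		close(vm);	/* last reference gone: VM is torn down here */
		_exit(0);
	}

	close(vm);		/* parent drops its reference immediately */
	waitpid(pid, NULL, 0);
	close(kvm);
	return 0;
}
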
diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virtual/kvm/halt-polling.txt
index 4a8418318769..4f791b128dd2 100644
--- a/Documentation/virtual/kvm/halt-polling.txt
+++ b/Documentation/virtual/kvm/halt-polling.txt
@@ -53,7 +53,8 @@ the global max polling interval then the polling interval can be increased in
 the hope that next time during the longer polling interval the wake up source
 will be received while the host is polling and the latency benefits will be
 received. The polling interval is grown in the function grow_halt_poll_ns() and
-is multiplied by the module parameter halt_poll_ns_grow.
+is multiplied by the module parameters halt_poll_ns_grow and
+halt_poll_ns_grow_start.
 
 In the event that the total block time was greater than the global max polling
 interval then the host will never poll for long enough (limited by the global
@@ -80,22 +81,30 @@ shrunk. These variables are defined in include/linux/kvm_host.h and as module
 parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the
 powerpc kvm-hv case.
 
 Module Parameter        | Description               | Default Value
 --------------------------------------------------------------------------------
-halt_poll_ns            | The global max polling interval | KVM_HALT_POLL_NS_DEFAULT
-                        | which defines the ceiling value |
-                        | of the polling interval for     | (per arch value)
-                        | each vcpu.                      |
+halt_poll_ns            | The global max polling    | KVM_HALT_POLL_NS_DEFAULT
+                        | interval which defines    |
+                        | the ceiling value of the  |
+                        | polling interval for      | (per arch value)
+                        | each vcpu.                |
 --------------------------------------------------------------------------------
-halt_poll_ns_grow       | The value by which the halt     | 2
-                        | polling interval is multiplied  |
-                        | in the grow_halt_poll_ns()      |
-                        | function.                       |
+halt_poll_ns_grow       | The value by which the    | 2
+                        | halt polling interval is  |
+                        | multiplied in the         |
+                        | grow_halt_poll_ns()       |
+                        | function.                 |
 --------------------------------------------------------------------------------
-halt_poll_ns_shrink     | The value by which the halt     | 0
-                        | polling interval is divided in  |
-                        | the shrink_halt_poll_ns()       |
-                        | function.                       |
+halt_poll_ns_grow_start | The initial value to grow | 10000
+                        | to from zero in the       |
+                        | grow_halt_poll_ns()       |
+                        | function.                 |
+--------------------------------------------------------------------------------
+halt_poll_ns_shrink     | The value by which the    | 0
+                        | halt polling interval is  |
+                        | divided in the            |
+                        | shrink_halt_poll_ns()     |
+                        | function.                 |
 --------------------------------------------------------------------------------
 
 These module parameters can be set from the debugfs files in:
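
The growth policy spelled out by the table reads naturally as a small helper.
The function below is a simplified sketch, not the kernel's grow_halt_poll_ns()
itself; the parameter names mirror the module parameters above, and the final
clamp to the global ceiling is an assumption drawn from the halt_poll_ns
description.

unsigned int grow_poll_ns(unsigned int current_ns,
			  unsigned int grow,		/* halt_poll_ns_grow */
			  unsigned int grow_start,	/* halt_poll_ns_grow_start */
			  unsigned int max_ns)		/* halt_poll_ns */
{
	unsigned int val;

	if (!grow)			/* growing disabled */
		return current_ns;

	val = current_ns * grow;
	if (val < grow_start)		/* never grow to a value below the start value */
		val = grow_start;
	if (val > max_ns)		/* respect the global ceiling (assumption) */
		val = max_ns;

	return val;
}
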
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index e507a9e0421e..f365102c80f5 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -224,10 +224,6 @@ Shadow pages contain the following information:
     A bitmap indicating which sptes in spt point (directly or indirectly) at
     pages that may be unsynchronized. Used to quickly locate all unsychronized
     pages reachable from a given page.
-  mmu_valid_gen:
-    Generation number of the page. It is compared with kvm->arch.mmu_valid_gen
-    during hash table lookup, and used to skip invalidated shadow pages (see
-    "Zapping all pages" below.)
   clear_spte_count:
     Only present on 32-bit hosts, where a 64-bit spte cannot be written
     atomically. The reader uses this while running out of the MMU lock
@@ -402,27 +398,6 @@ causes its disallow_lpage to be incremented, thus preventing instantiation of
 a large spte. The frames at the end of an unaligned memory slot have
 artificially inflated ->disallow_lpages so they can never be instantiated.
 
-Zapping all pages (page generation count)
-=========================================
-
-For the large memory guests, walking and zapping all pages is really slow
-(because there are a lot of pages), and also blocks memory accesses of
-all VCPUs because it needs to hold the MMU lock.
-
-To make it be more scalable, kvm maintains a global generation number
-which is stored in kvm->arch.mmu_valid_gen. Every shadow page stores
-the current global generation-number into sp->mmu_valid_gen when it
-is created. Pages with a mismatching generation number are "obsolete".
-
-When KVM need zap all shadow pages sptes, it just simply increases the global
-generation-number then reload root shadow pages on all vcpus. As the VCPUs
-create new shadow page tables, the old pages are not used because of the
-mismatching generation number.
-
-KVM then walks through all pages and zaps obsolete pages. While the zap
-operation needs to take the MMU lock, the lock can be released periodically
-so that the VCPUs can make progress.
-
 Fast invalidation of MMIO sptes
 ===============================
 
@@ -435,8 +410,7 @@ shadow pages, and is made more scalable with a similar technique.
 MMIO sptes have a few spare bits, which are used to store a
 generation number. The global generation number is stored in
 kvm_memslots(kvm)->generation, and increased whenever guest memory info
-changes. This generation number is distinct from the one described in
-the previous section.
+changes.
 
 When KVM finds an MMIO spte, it checks the generation number of the spte.
 If the generation number of the spte does not equal the global generation
@@ -452,13 +426,16 @@ stored into the MMIO spte. Thus, the MMIO spte might be created based on
 out-of-date information, but with an up-to-date generation number.
 
 To avoid this, the generation number is incremented again after synchronize_srcu
-returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a
 memslot update, while some SRCU readers might be using the old copy. We do not
 want to use an MMIO sptes created with an odd generation number, and we can do
-this without losing a bit in the MMIO spte. The low bit of the generation
-is not stored in MMIO spte, and presumed zero when it is extracted out of the
-spte. If KVM is unlucky and creates an MMIO spte while the low bit is 1,
-the next access to the spte will always be a cache miss.
+this without losing a bit in the MMIO spte. The "update in-progress" bit of the
+generation is not stored in MMIO spte, and is so is implicitly zero when the
+generation is extracted out of the spte. If KVM is unlucky and creates an MMIO
+spte while an update is in-progress, the next access to the spte will always be
+a cache miss. For example, a subsequent access during the update window will
+miss due to the in-progress flag diverging, while an access after the update
+window closes will have a higher generation number (as compared to the spte).
 
 
 Further reading
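
The check described in the updated text boils down to a masked comparison. The
helper below is an illustrative sketch with hypothetical names only (the real
logic lives in arch/x86/kvm/mmu.c and uses different helpers); it assumes, as
the text above states, that bit 63 of kvm_memslots(kvm)->generation is the
"update in-progress" flag and is never stored in the spte.

#include <stdbool.h>
#include <stdint.h>

#define MEMSLOTS_GEN_UPDATE_IN_PROGRESS	(1ULL << 63)	/* bit 63, per the text above */

bool mmio_spte_is_stale(uint64_t spte_generation, uint64_t memslots_generation)
{
	/*
	 * The in-progress bit is never stored in the spte, so strip it
	 * before comparing.  (Real MMIO sptes only have room for a few
	 * generation bits; that truncation is ignored in this sketch.)
	 */
	uint64_t gen = memslots_generation & ~MEMSLOTS_GEN_UPDATE_IN_PROGRESS;

	return spte_generation != gen;
}
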
diff --git a/MAINTAINERS b/MAINTAINERS
index c009ad17ae64..e17ebf70b548 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8461,6 +8461,7 @@ F: include/linux/kvm*
 F: include/kvm/iodev.h
 F: virt/kvm/*
 F: tools/kvm/
+F: tools/testing/selftests/kvm/
 
 KERNEL VIRTUAL MACHINE FOR AMD-V (KVM/amd)
 M: Joerg Roedel <joro@8bytes.org>
@@ -8470,29 +8471,25 @@ S: Maintained
 F: arch/x86/include/asm/svm.h
 F: arch/x86/kvm/svm.c
 
-KERNEL VIRTUAL MACHINE FOR ARM (KVM/arm)
+KERNEL VIRTUAL MACHINE FOR ARM/ARM64 (KVM/arm, KVM/arm64)
 M: Christoffer Dall <christoffer.dall@arm.com>
 M: Marc Zyngier <marc.zyngier@arm.com>
+R: James Morse <james.morse@arm.com>
+R: Julien Thierry <julien.thierry@arm.com>
+R: Suzuki K Pouloze <suzuki.poulose@arm.com>
 L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 L: kvmarm@lists.cs.columbia.edu
 W: http://systems.cs.columbia.edu/projects/kvm-arm
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
-S: Supported
+S: Maintained
 F: arch/arm/include/uapi/asm/kvm*
 F: arch/arm/include/asm/kvm*
 F: arch/arm/kvm/
-F: virt/kvm/arm/
-F: include/kvm/arm_*
-
-KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
-M: Christoffer Dall <christoffer.dall@arm.com>
-M: Marc Zyngier <marc.zyngier@arm.com>
-L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-L: kvmarm@lists.cs.columbia.edu
-S: Maintained
 F: arch/arm64/include/uapi/asm/kvm*
 F: arch/arm64/include/asm/kvm*
 F: arch/arm64/kvm/
+F: virt/kvm/arm/
+F: include/kvm/arm_*
 
 KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips)
 M: James Hogan <jhogan@kernel.org>
diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h
index f6f485f4744e..d15b8c99f1b3 100644
--- a/arch/arm/include/asm/arch_gicv3.h
+++ b/arch/arm/include/asm/arch_gicv3.h
@@ -55,7 +55,7 @@
55#define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1) 55#define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1)
56#define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2) 56#define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2)
57#define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3) 57#define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3)
58#define ICH_ELSR __ACCESS_CP15(c12, 4, c11, 5) 58#define ICH_ELRSR __ACCESS_CP15(c12, 4, c11, 5)
59#define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7) 59#define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7)
60 60
61#define __LR0(x) __ACCESS_CP15(c12, 4, c12, x) 61#define __LR0(x) __ACCESS_CP15(c12, 4, c12, x)
@@ -152,7 +152,7 @@ CPUIF_MAP(ICH_HCR, ICH_HCR_EL2)
152CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) 152CPUIF_MAP(ICH_VTR, ICH_VTR_EL2)
153CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) 153CPUIF_MAP(ICH_MISR, ICH_MISR_EL2)
154CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) 154CPUIF_MAP(ICH_EISR, ICH_EISR_EL2)
155CPUIF_MAP(ICH_ELSR, ICH_ELSR_EL2) 155CPUIF_MAP(ICH_ELRSR, ICH_ELRSR_EL2)
156CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) 156CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2)
157CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) 157CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2)
158CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) 158CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2)
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 77121b713bef..8927cae7c966 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -265,6 +265,14 @@ static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
265 } 265 }
266} 266}
267 267
268static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
269{
270 if (kvm_vcpu_trap_is_iabt(vcpu))
271 return false;
272
273 return kvm_vcpu_dabt_iswrite(vcpu);
274}
275
268static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) 276static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
269{ 277{
270 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; 278 return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 50e89869178a..770d73257ad9 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
26#include <asm/kvm_asm.h> 26#include <asm/kvm_asm.h>
27#include <asm/kvm_mmio.h> 27#include <asm/kvm_mmio.h>
28#include <asm/fpstate.h> 28#include <asm/fpstate.h>
29#include <asm/smp_plat.h>
29#include <kvm/arm_arch_timer.h> 30#include <kvm/arm_arch_timer.h>
30 31
31#define __KVM_HAVE_ARCH_INTC_INITIALIZED 32#define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -57,10 +58,13 @@ int __attribute_const__ kvm_target_cpu(void);
57int kvm_reset_vcpu(struct kvm_vcpu *vcpu); 58int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
58void kvm_reset_coprocs(struct kvm_vcpu *vcpu); 59void kvm_reset_coprocs(struct kvm_vcpu *vcpu);
59 60
60struct kvm_arch { 61struct kvm_vmid {
61 /* VTTBR value associated with below pgd and vmid */ 62 /* The VMID generation used for the virt. memory system */
62 u64 vttbr; 63 u64 vmid_gen;
64 u32 vmid;
65};
63 66
67struct kvm_arch {
64 /* The last vcpu id that ran on each physical CPU */ 68 /* The last vcpu id that ran on each physical CPU */
65 int __percpu *last_vcpu_ran; 69 int __percpu *last_vcpu_ran;
66 70
@@ -70,11 +74,11 @@ struct kvm_arch {
70 */ 74 */
71 75
72 /* The VMID generation used for the virt. memory system */ 76 /* The VMID generation used for the virt. memory system */
73 u64 vmid_gen; 77 struct kvm_vmid vmid;
74 u32 vmid;
75 78
76 /* Stage-2 page table */ 79 /* Stage-2 page table */
77 pgd_t *pgd; 80 pgd_t *pgd;
81 phys_addr_t pgd_phys;
78 82
79 /* Interrupt controller */ 83 /* Interrupt controller */
80 struct vgic_dist vgic; 84 struct vgic_dist vgic;
@@ -148,6 +152,13 @@ struct kvm_cpu_context {
148 152
149typedef struct kvm_cpu_context kvm_cpu_context_t; 153typedef struct kvm_cpu_context kvm_cpu_context_t;
150 154
155static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt,
156 int cpu)
157{
158 /* The host's MPIDR is immutable, so let's set it up at boot time */
159 cpu_ctxt->cp15[c0_MPIDR] = cpu_logical_map(cpu);
160}
161
151struct vcpu_reset_state { 162struct vcpu_reset_state {
152 unsigned long pc; 163 unsigned long pc;
153 unsigned long r0; 164 unsigned long r0;
@@ -224,7 +235,35 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
224int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); 235int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
225int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); 236int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
226int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); 237int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
227unsigned long kvm_call_hyp(void *hypfn, ...); 238
239unsigned long __kvm_call_hyp(void *hypfn, ...);
240
241/*
242 * The has_vhe() part doesn't get emitted, but is used for type-checking.
243 */
244#define kvm_call_hyp(f, ...) \
245 do { \
246 if (has_vhe()) { \
247 f(__VA_ARGS__); \
248 } else { \
249 __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \
250 } \
251 } while(0)
252
253#define kvm_call_hyp_ret(f, ...) \
254 ({ \
255 typeof(f(__VA_ARGS__)) ret; \
256 \
257 if (has_vhe()) { \
258 ret = f(__VA_ARGS__); \
259 } else { \
260 ret = __kvm_call_hyp(kvm_ksym_ref(f), \
261 ##__VA_ARGS__); \
262 } \
263 \
264 ret; \
265 })
266
228void force_vm_exit(const cpumask_t *mask); 267void force_vm_exit(const cpumask_t *mask);
229int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, 268int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
230 struct kvm_vcpu_events *events); 269 struct kvm_vcpu_events *events);
@@ -275,7 +314,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
275 * compliant with the PCS!). 314 * compliant with the PCS!).
276 */ 315 */
277 316
278 kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); 317 __kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
279} 318}
280 319
281static inline void __cpu_init_stage2(void) 320static inline void __cpu_init_stage2(void)
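
The kvm_call_hyp_ret() macro added in the hunk above relies on a GNU C
statement expression so that the same call site works for both the direct
(VHE) path and the HVC trampoline path. The stand-alone program below is a
hypothetical illustration of that pattern only, with made-up names
(direct_path, trampoline, call_ret); it is not kernel code.

#include <stdio.h>

static int direct_path;				/* stands in for has_vhe() */

static long trampoline(long (*fn)(long), long arg)	/* stands in for __kvm_call_hyp() */
{
	return fn(arg);
}

/*
 * Statement-expression macro, same shape as kvm_call_hyp_ret(): declare a
 * temporary of the callee's return type, pick the direct or trampoline
 * path, and yield the result as the value of the expression.
 */
#define call_ret(f, arg)				\
	({						\
		typeof(f(arg)) __ret;			\
		if (direct_path)			\
			__ret = f(arg);			\
		else					\
			__ret = trampoline(f, arg);	\
		__ret;					\
	})

static long square(long x) { return x * x; }

int main(void)
{
	direct_path = 1;
	printf("%ld\n", call_ret(square, 5));	/* direct call: prints 25 */
	direct_path = 0;
	printf("%ld\n", call_ret(square, 6));	/* via trampoline: prints 36 */
	return 0;
}
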
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index e93a0cac9add..87bcd18df8d5 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -40,6 +40,7 @@
40#define TTBR1 __ACCESS_CP15_64(1, c2) 40#define TTBR1 __ACCESS_CP15_64(1, c2)
41#define VTTBR __ACCESS_CP15_64(6, c2) 41#define VTTBR __ACCESS_CP15_64(6, c2)
42#define PAR __ACCESS_CP15_64(0, c7) 42#define PAR __ACCESS_CP15_64(0, c7)
43#define CNTP_CVAL __ACCESS_CP15_64(2, c14)
43#define CNTV_CVAL __ACCESS_CP15_64(3, c14) 44#define CNTV_CVAL __ACCESS_CP15_64(3, c14)
44#define CNTVOFF __ACCESS_CP15_64(4, c14) 45#define CNTVOFF __ACCESS_CP15_64(4, c14)
45 46
@@ -85,6 +86,7 @@
85#define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4) 86#define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4)
86#define HTPIDR __ACCESS_CP15(c13, 4, c0, 2) 87#define HTPIDR __ACCESS_CP15(c13, 4, c0, 2)
87#define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0) 88#define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0)
89#define CNTP_CTL __ACCESS_CP15(c14, 0, c2, 1)
88#define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1) 90#define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1)
89#define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0) 91#define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0)
90 92
@@ -94,6 +96,8 @@
94#define read_sysreg_el0(r) read_sysreg(r##_el0) 96#define read_sysreg_el0(r) read_sysreg(r##_el0)
95#define write_sysreg_el0(v, r) write_sysreg(v, r##_el0) 97#define write_sysreg_el0(v, r) write_sysreg(v, r##_el0)
96 98
99#define cntp_ctl_el0 CNTP_CTL
100#define cntp_cval_el0 CNTP_CVAL
97#define cntv_ctl_el0 CNTV_CTL 101#define cntv_ctl_el0 CNTV_CTL
98#define cntv_cval_el0 CNTV_CVAL 102#define cntv_cval_el0 CNTV_CVAL
99#define cntvoff_el2 CNTVOFF 103#define cntvoff_el2 CNTVOFF
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 3a875fc1b63c..2de96a180166 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -421,9 +421,14 @@ static inline int hyp_map_aux_data(void)
421 421
422static inline void kvm_set_ipa_limit(void) {} 422static inline void kvm_set_ipa_limit(void) {}
423 423
424static inline bool kvm_cpu_has_cnp(void) 424static __always_inline u64 kvm_get_vttbr(struct kvm *kvm)
425{ 425{
426 return false; 426 struct kvm_vmid *vmid = &kvm->arch.vmid;
427 u64 vmid_field, baddr;
428
429 baddr = kvm->arch.pgd_phys;
430 vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
431 return kvm_phys_to_vttbr(baddr) | vmid_field;
427} 432}
428 433
429#endif /* !__ASSEMBLY__ */ 434#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 48de846f2246..531e59f5be9c 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -8,9 +8,8 @@ ifeq ($(plus_virt),+virt)
8 plus_virt_def := -DREQUIRES_VIRT=1 8 plus_virt_def := -DREQUIRES_VIRT=1
9endif 9endif
10 10
11ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic 11ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic
12CFLAGS_arm.o := -I. $(plus_virt_def) 12CFLAGS_arm.o := $(plus_virt_def)
13CFLAGS_mmu.o := -I.
14 13
15AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) 14AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
16AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) 15AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index e8bd288fd5be..14915c78bd99 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -293,15 +293,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu,
293 const struct coproc_params *p, 293 const struct coproc_params *p,
294 const struct coproc_reg *r) 294 const struct coproc_reg *r)
295{ 295{
296 u64 now = kvm_phys_timer_read(); 296 u32 val;
297 u64 val;
298 297
299 if (p->is_write) { 298 if (p->is_write) {
300 val = *vcpu_reg(vcpu, p->Rt1); 299 val = *vcpu_reg(vcpu, p->Rt1);
301 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val + now); 300 kvm_arm_timer_write_sysreg(vcpu,
301 TIMER_PTIMER, TIMER_REG_TVAL, val);
302 } else { 302 } else {
303 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 303 val = kvm_arm_timer_read_sysreg(vcpu,
304 *vcpu_reg(vcpu, p->Rt1) = val - now; 304 TIMER_PTIMER, TIMER_REG_TVAL);
305 *vcpu_reg(vcpu, p->Rt1) = val;
305 } 306 }
306 307
307 return true; 308 return true;
@@ -315,9 +316,11 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
315 316
316 if (p->is_write) { 317 if (p->is_write) {
317 val = *vcpu_reg(vcpu, p->Rt1); 318 val = *vcpu_reg(vcpu, p->Rt1);
318 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, val); 319 kvm_arm_timer_write_sysreg(vcpu,
320 TIMER_PTIMER, TIMER_REG_CTL, val);
319 } else { 321 } else {
320 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL); 322 val = kvm_arm_timer_read_sysreg(vcpu,
323 TIMER_PTIMER, TIMER_REG_CTL);
321 *vcpu_reg(vcpu, p->Rt1) = val; 324 *vcpu_reg(vcpu, p->Rt1) = val;
322 } 325 }
323 326
@@ -333,9 +336,11 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
333 if (p->is_write) { 336 if (p->is_write) {
334 val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; 337 val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32;
335 val |= *vcpu_reg(vcpu, p->Rt1); 338 val |= *vcpu_reg(vcpu, p->Rt1);
336 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, val); 339 kvm_arm_timer_write_sysreg(vcpu,
340 TIMER_PTIMER, TIMER_REG_CVAL, val);
337 } else { 341 } else {
338 val = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 342 val = kvm_arm_timer_read_sysreg(vcpu,
343 TIMER_PTIMER, TIMER_REG_CVAL);
339 *vcpu_reg(vcpu, p->Rt1) = val; 344 *vcpu_reg(vcpu, p->Rt1) = val;
340 *vcpu_reg(vcpu, p->Rt2) = val >> 32; 345 *vcpu_reg(vcpu, p->Rt2) = val >> 32;
341 } 346 }
diff --git a/arch/arm/kvm/hyp/cp15-sr.c b/arch/arm/kvm/hyp/cp15-sr.c
index c4782812714c..8bf895ec6e04 100644
--- a/arch/arm/kvm/hyp/cp15-sr.c
+++ b/arch/arm/kvm/hyp/cp15-sr.c
@@ -27,7 +27,6 @@ static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx)
27 27
28void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) 28void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
29{ 29{
30 ctxt->cp15[c0_MPIDR] = read_sysreg(VMPIDR);
31 ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR); 30 ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR);
32 ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR); 31 ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR);
33 ctxt->cp15[c1_CPACR] = read_sysreg(CPACR); 32 ctxt->cp15[c1_CPACR] = read_sysreg(CPACR);
diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S
index aa3f9a9837ac..6ed3cf23fe89 100644
--- a/arch/arm/kvm/hyp/hyp-entry.S
+++ b/arch/arm/kvm/hyp/hyp-entry.S
@@ -176,7 +176,7 @@ THUMB( orr lr, lr, #PSR_T_BIT )
176 msr spsr_cxsf, lr 176 msr spsr_cxsf, lr
177 ldr lr, =panic 177 ldr lr, =panic
178 msr ELR_hyp, lr 178 msr ELR_hyp, lr
179 ldr lr, =kvm_call_hyp 179 ldr lr, =__kvm_call_hyp
180 clrex 180 clrex
181 eret 181 eret
182ENDPROC(__hyp_do_panic) 182ENDPROC(__hyp_do_panic)
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index acf1c37fa49c..3b058a5d7c5f 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -77,7 +77,7 @@ static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
77static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) 77static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
78{ 78{
79 struct kvm *kvm = kern_hyp_va(vcpu->kvm); 79 struct kvm *kvm = kern_hyp_va(vcpu->kvm);
80 write_sysreg(kvm->arch.vttbr, VTTBR); 80 write_sysreg(kvm_get_vttbr(kvm), VTTBR);
81 write_sysreg(vcpu->arch.midr, VPIDR); 81 write_sysreg(vcpu->arch.midr, VPIDR);
82} 82}
83 83
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c
index c0edd450e104..8e4afba73635 100644
--- a/arch/arm/kvm/hyp/tlb.c
+++ b/arch/arm/kvm/hyp/tlb.c
@@ -41,7 +41,7 @@ void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm)
41 41
42 /* Switch to requested VMID */ 42 /* Switch to requested VMID */
43 kvm = kern_hyp_va(kvm); 43 kvm = kern_hyp_va(kvm);
44 write_sysreg(kvm->arch.vttbr, VTTBR); 44 write_sysreg(kvm_get_vttbr(kvm), VTTBR);
45 isb(); 45 isb();
46 46
47 write_sysreg(0, TLBIALLIS); 47 write_sysreg(0, TLBIALLIS);
@@ -61,7 +61,7 @@ void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu)
61 struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); 61 struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm);
62 62
63 /* Switch to requested VMID */ 63 /* Switch to requested VMID */
64 write_sysreg(kvm->arch.vttbr, VTTBR); 64 write_sysreg(kvm_get_vttbr(kvm), VTTBR);
65 isb(); 65 isb();
66 66
67 write_sysreg(0, TLBIALL); 67 write_sysreg(0, TLBIALL);
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 80a1d6cd261c..a08e6419ebe9 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -42,7 +42,7 @@
42 * r12: caller save 42 * r12: caller save
43 * rest: callee save 43 * rest: callee save
44 */ 44 */
45ENTRY(kvm_call_hyp) 45ENTRY(__kvm_call_hyp)
46 hvc #0 46 hvc #0
47 bx lr 47 bx lr
48ENDPROC(kvm_call_hyp) 48ENDPROC(__kvm_call_hyp)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 506386a3edde..d3842791e1c4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -77,6 +77,10 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
77 */ 77 */
78 if (!vcpu_el1_is_32bit(vcpu)) 78 if (!vcpu_el1_is_32bit(vcpu))
79 vcpu->arch.hcr_el2 |= HCR_TID3; 79 vcpu->arch.hcr_el2 |= HCR_TID3;
80
81 if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
82 vcpu_el1_is_32bit(vcpu))
83 vcpu->arch.hcr_el2 |= HCR_TID2;
80} 84}
81 85
82static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) 86static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
@@ -331,6 +335,14 @@ static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
331 return ESR_ELx_SYS64_ISS_RT(esr); 335 return ESR_ELx_SYS64_ISS_RT(esr);
332} 336}
333 337
338static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
339{
340 if (kvm_vcpu_trap_is_iabt(vcpu))
341 return false;
342
343 return kvm_vcpu_dabt_iswrite(vcpu);
344}
345
334static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) 346static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
335{ 347{
336 return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; 348 return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 222af1d2c3e4..a01fe087e022 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -31,6 +31,7 @@
31#include <asm/kvm.h> 31#include <asm/kvm.h>
32#include <asm/kvm_asm.h> 32#include <asm/kvm_asm.h>
33#include <asm/kvm_mmio.h> 33#include <asm/kvm_mmio.h>
34#include <asm/smp_plat.h>
34#include <asm/thread_info.h> 35#include <asm/thread_info.h>
35 36
36#define __KVM_HAVE_ARCH_INTC_INITIALIZED 37#define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -58,16 +59,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
58int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); 59int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
59void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); 60void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
60 61
61struct kvm_arch { 62struct kvm_vmid {
62 /* The VMID generation used for the virt. memory system */ 63 /* The VMID generation used for the virt. memory system */
63 u64 vmid_gen; 64 u64 vmid_gen;
64 u32 vmid; 65 u32 vmid;
66};
67
68struct kvm_arch {
69 struct kvm_vmid vmid;
65 70
66 /* stage2 entry level table */ 71 /* stage2 entry level table */
67 pgd_t *pgd; 72 pgd_t *pgd;
73 phys_addr_t pgd_phys;
68 74
69 /* VTTBR value associated with above pgd and vmid */
70 u64 vttbr;
71 /* VTCR_EL2 value for this VM */ 75 /* VTCR_EL2 value for this VM */
72 u64 vtcr; 76 u64 vtcr;
73 77
@@ -382,7 +386,36 @@ void kvm_arm_halt_guest(struct kvm *kvm);
382void kvm_arm_resume_guest(struct kvm *kvm); 386void kvm_arm_resume_guest(struct kvm *kvm);
383 387
384u64 __kvm_call_hyp(void *hypfn, ...); 388u64 __kvm_call_hyp(void *hypfn, ...);
385#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) 389
390/*
391 * The couple of isb() below are there to guarantee the same behaviour
392 * on VHE as on !VHE, where the eret to EL1 acts as a context
393 * synchronization event.
394 */
395#define kvm_call_hyp(f, ...) \
396 do { \
397 if (has_vhe()) { \
398 f(__VA_ARGS__); \
399 isb(); \
400 } else { \
401 __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \
402 } \
403 } while(0)
404
405#define kvm_call_hyp_ret(f, ...) \
406 ({ \
407 typeof(f(__VA_ARGS__)) ret; \
408 \
409 if (has_vhe()) { \
410 ret = f(__VA_ARGS__); \
411 isb(); \
412 } else { \
413 ret = __kvm_call_hyp(kvm_ksym_ref(f), \
414 ##__VA_ARGS__); \
415 } \
416 \
417 ret; \
418 })
386 419
387void force_vm_exit(const cpumask_t *mask); 420void force_vm_exit(const cpumask_t *mask);
388void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); 421void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
@@ -401,6 +434,13 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
401 434
402DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); 435DECLARE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
403 436
437static inline void kvm_init_host_cpu_context(kvm_cpu_context_t *cpu_ctxt,
438 int cpu)
439{
440 /* The host's MPIDR is immutable, so let's set it up at boot time */
441 cpu_ctxt->sys_regs[MPIDR_EL1] = cpu_logical_map(cpu);
442}
443
404void __kvm_enable_ssbs(void); 444void __kvm_enable_ssbs(void);
405 445
406static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, 446static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index a80a7ef57325..4da765f2cca5 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -21,6 +21,7 @@
21#include <linux/compiler.h> 21#include <linux/compiler.h>
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <asm/alternative.h> 23#include <asm/alternative.h>
24#include <asm/kvm_mmu.h>
24#include <asm/sysreg.h> 25#include <asm/sysreg.h>
25 26
26#define __hyp_text __section(.hyp.text) notrace 27#define __hyp_text __section(.hyp.text) notrace
@@ -163,7 +164,7 @@ void __noreturn __hyp_do_panic(unsigned long, ...);
163static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) 164static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
164{ 165{
165 write_sysreg(kvm->arch.vtcr, vtcr_el2); 166 write_sysreg(kvm->arch.vtcr, vtcr_el2);
166 write_sysreg(kvm->arch.vttbr, vttbr_el2); 167 write_sysreg(kvm_get_vttbr(kvm), vttbr_el2);
167 168
168 /* 169 /*
169 * ARM erratum 1165522 requires the actual execution of the above 170 * ARM erratum 1165522 requires the actual execution of the above
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 8af4b1befa42..b0742a16c6c9 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -138,7 +138,8 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
138 }) 138 })
139 139
140/* 140/*
141 * We currently only support a 40bit IPA. 141 * We currently support using a VM-specified IPA size. For backward
142 * compatibility, the default IPA size is fixed to 40bits.
142 */ 143 */
143#define KVM_PHYS_SHIFT (40) 144#define KVM_PHYS_SHIFT (40)
144 145
@@ -591,9 +592,15 @@ static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
591 return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); 592 return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
592} 593}
593 594
594static inline bool kvm_cpu_has_cnp(void) 595static __always_inline u64 kvm_get_vttbr(struct kvm *kvm)
595{ 596{
596 return system_supports_cnp(); 597 struct kvm_vmid *vmid = &kvm->arch.vmid;
598 u64 vmid_field, baddr;
599 u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;
600
601 baddr = kvm->arch.pgd_phys;
602 vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
603 return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
597} 604}
598 605
599#endif /* __ASSEMBLY__ */ 606#endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 72dc4c011014..5b267dec6194 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -361,6 +361,7 @@
361 361
362#define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) 362#define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0)
363 363
364#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0)
364#define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) 365#define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1)
365#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) 366#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7)
366 367
@@ -392,6 +393,10 @@
392#define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) 393#define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1)
393#define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) 394#define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2)
394 395
396#define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0)
397#define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1)
398#define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0)
399
395#define __PMEV_op2(n) ((n) & 0x7) 400#define __PMEV_op2(n) ((n) & 0x7)
396#define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) 401#define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3))
397#define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n)) 402#define SYS_PMEVCNTRn_EL0(n) sys_reg(3, 3, 14, __CNTR_CRm(n), __PMEV_op2(n))
@@ -426,7 +431,7 @@
426#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) 431#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1)
427#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) 432#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2)
428#define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) 433#define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3)
429#define SYS_ICH_ELSR_EL2 sys_reg(3, 4, 12, 11, 5) 434#define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5)
430#define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) 435#define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7)
431 436
432#define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x) 437#define __SYS__LR0_EL2(x) sys_reg(3, 4, 12, 12, x)
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 0f2a135ba15b..690e033a91c0 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -3,9 +3,7 @@
3# Makefile for Kernel-based Virtual Machine module 3# Makefile for Kernel-based Virtual Machine module
4# 4#
5 5
6ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic 6ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic
7CFLAGS_arm.o := -I.
8CFLAGS_mmu.o := -I.
9 7
10KVM=../../../virt/kvm 8KVM=../../../virt/kvm
11 9
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index f39801e4136c..fd917d6d12af 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -76,7 +76,7 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
76 76
77void kvm_arm_init_debug(void) 77void kvm_arm_init_debug(void)
78{ 78{
79 __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2)); 79 __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2));
80} 80}
81 81
82/** 82/**
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 952f6cb9cf72..2845aa680841 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -40,9 +40,6 @@
40 * arch/arm64/kernel/hyp_stub.S. 40 * arch/arm64/kernel/hyp_stub.S.
41 */ 41 */
42ENTRY(__kvm_call_hyp) 42ENTRY(__kvm_call_hyp)
43alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
44 hvc #0 43 hvc #0
45 ret 44 ret
46alternative_else_nop_endif
47 b __vhe_hyp_call
48ENDPROC(__kvm_call_hyp) 45ENDPROC(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 73c1b483ec39..2b1e686772bf 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -43,18 +43,6 @@
43 ldr lr, [sp], #16 43 ldr lr, [sp], #16
44.endm 44.endm
45 45
46ENTRY(__vhe_hyp_call)
47 do_el2_call
48 /*
49 * We used to rely on having an exception return to get
50 * an implicit isb. In the E2H case, we don't have it anymore.
51 * rather than changing all the leaf functions, just do it here
52 * before returning to the rest of the kernel.
53 */
54 isb
55 ret
56ENDPROC(__vhe_hyp_call)
57
58el1_sync: // Guest trapped into EL2 46el1_sync: // Guest trapped into EL2
59 47
60 mrs x0, esr_el2 48 mrs x0, esr_el2
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index b426e2cf973c..c52a8451637c 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -53,7 +53,6 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
53 53
54static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) 54static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
55{ 55{
56 ctxt->sys_regs[MPIDR_EL1] = read_sysreg(vmpidr_el2);
57 ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1); 56 ctxt->sys_regs[CSSELR_EL1] = read_sysreg(csselr_el1);
58 ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr); 57 ctxt->sys_regs[SCTLR_EL1] = read_sysreg_el1(sctlr);
59 ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1); 58 ctxt->sys_regs[ACTLR_EL1] = read_sysreg(actlr_el1);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c936aa40c3f4..539feecda5b8 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -982,6 +982,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
982 return true; 982 return true;
983} 983}
984 984
985#define reg_to_encoding(x) \
986 sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \
987 (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2);
988
985/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ 989/* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
986#define DBG_BCR_BVR_WCR_WVR_EL1(n) \ 990#define DBG_BCR_BVR_WCR_WVR_EL1(n) \
987 { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ 991 { SYS_DESC(SYS_DBGBVRn_EL1(n)), \
@@ -1003,44 +1007,38 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1003 { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ 1007 { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \
1004 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } 1008 access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
1005 1009
1006static bool access_cntp_tval(struct kvm_vcpu *vcpu, 1010static bool access_arch_timer(struct kvm_vcpu *vcpu,
1007 struct sys_reg_params *p, 1011 struct sys_reg_params *p,
1008 const struct sys_reg_desc *r) 1012 const struct sys_reg_desc *r)
1009{ 1013{
1010 u64 now = kvm_phys_timer_read(); 1014 enum kvm_arch_timers tmr;
1011 u64 cval; 1015 enum kvm_arch_timer_regs treg;
1016 u64 reg = reg_to_encoding(r);
1012 1017
1013 if (p->is_write) { 1018 switch (reg) {
1014 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, 1019 case SYS_CNTP_TVAL_EL0:
1015 p->regval + now); 1020 case SYS_AARCH32_CNTP_TVAL:
1016 } else { 1021 tmr = TIMER_PTIMER;
1017 cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 1022 treg = TIMER_REG_TVAL;
1018 p->regval = cval - now; 1023 break;
1024 case SYS_CNTP_CTL_EL0:
1025 case SYS_AARCH32_CNTP_CTL:
1026 tmr = TIMER_PTIMER;
1027 treg = TIMER_REG_CTL;
1028 break;
1029 case SYS_CNTP_CVAL_EL0:
1030 case SYS_AARCH32_CNTP_CVAL:
1031 tmr = TIMER_PTIMER;
1032 treg = TIMER_REG_CVAL;
1033 break;
1034 default:
1035 BUG();
1019 } 1036 }
1020 1037
1021 return true;
1022}
1023
1024static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
1025 struct sys_reg_params *p,
1026 const struct sys_reg_desc *r)
1027{
1028 if (p->is_write)
1029 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
1030 else
1031 p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
1032
1033 return true;
1034}
1035
1036static bool access_cntp_cval(struct kvm_vcpu *vcpu,
1037 struct sys_reg_params *p,
1038 const struct sys_reg_desc *r)
1039{
1040 if (p->is_write) 1038 if (p->is_write)
1041 kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval); 1039 kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval);
1042 else 1040 else
1043 p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL); 1041 p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg);
1044 1042
1045 return true; 1043 return true;
1046} 1044}
@@ -1160,6 +1158,64 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
1160 return __set_id_reg(rd, uaddr, true); 1158 return __set_id_reg(rd, uaddr, true);
1161} 1159}
1162 1160
1161static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1162 const struct sys_reg_desc *r)
1163{
1164 if (p->is_write)
1165 return write_to_read_only(vcpu, p, r);
1166
1167 p->regval = read_sanitised_ftr_reg(SYS_CTR_EL0);
1168 return true;
1169}
1170
1171static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1172 const struct sys_reg_desc *r)
1173{
1174 if (p->is_write)
1175 return write_to_read_only(vcpu, p, r);
1176
1177 p->regval = read_sysreg(clidr_el1);
1178 return true;
1179}
1180
1181static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1182 const struct sys_reg_desc *r)
1183{
1184 if (p->is_write)
1185 vcpu_write_sys_reg(vcpu, p->regval, r->reg);
1186 else
1187 p->regval = vcpu_read_sys_reg(vcpu, r->reg);
1188 return true;
1189}
1190
1191static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
1192 const struct sys_reg_desc *r)
1193{
1194 u32 csselr;
1195
1196 if (p->is_write)
1197 return write_to_read_only(vcpu, p, r);
1198
1199 csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1);
1200 p->regval = get_ccsidr(csselr);
1201
1202 /*
1203 * Guests should not be doing cache operations by set/way at all, and
1204 * for this reason, we trap them and attempt to infer the intent, so
1205 * that we can flush the entire guest's address space at the appropriate
1206 * time.
1207 * To prevent this trapping from causing performance problems, let's
1208 * expose the geometry of all data and unified caches (which are
1209 * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way.
1210 * [If guests should attempt to infer aliasing properties from the
1211 * geometry (which is not permitted by the architecture), they would
1212 * only do so for virtually indexed caches.]
1213 */
1214 if (!(csselr & 1)) // data or unified cache
1215 p->regval &= ~GENMASK(27, 3);
1216 return true;
1217}
1218
1163/* sys_reg_desc initialiser for known cpufeature ID registers */ 1219/* sys_reg_desc initialiser for known cpufeature ID registers */
1164#define ID_SANITISED(name) { \ 1220#define ID_SANITISED(name) { \
1165 SYS_DESC(SYS_##name), \ 1221 SYS_DESC(SYS_##name), \
@@ -1377,7 +1433,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1377 1433
1378 { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, 1434 { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0},
1379 1435
1380 { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, 1436 { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
1437 { SYS_DESC(SYS_CLIDR_EL1), access_clidr },
1438 { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
1439 { SYS_DESC(SYS_CTR_EL0), access_ctr },
1381 1440
1382 { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, 1441 { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, },
1383 { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, 1442 { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
@@ -1400,9 +1459,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1400 { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, 1459 { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
1401 { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, 1460 { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
1402 1461
1403 { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, 1462 { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
1404 { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, 1463 { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
1405 { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, 1464 { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },
1406 1465
1407 /* PMEVCNTRn_EL0 */ 1466 /* PMEVCNTRn_EL0 */
1408 PMU_PMEVCNTR_EL0(0), 1467 PMU_PMEVCNTR_EL0(0),
@@ -1476,7 +1535,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
1476 1535
1477 { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, 1536 { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 },
1478 { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, 1537 { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 },
1479 { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, 1538 { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
1480}; 1539};
1481 1540
1482static bool trap_dbgidr(struct kvm_vcpu *vcpu, 1541static bool trap_dbgidr(struct kvm_vcpu *vcpu,
@@ -1677,6 +1736,7 @@ static const struct sys_reg_desc cp14_64_regs[] = {
1677 * register). 1736 * register).
1678 */ 1737 */
1679static const struct sys_reg_desc cp15_regs[] = { 1738static const struct sys_reg_desc cp15_regs[] = {
1739 { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr },
1680 { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR }, 1740 { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, c1_SCTLR },
1681 { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 }, 1741 { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
1682 { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 }, 1742 { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
@@ -1723,10 +1783,9 @@ static const struct sys_reg_desc cp15_regs[] = {
1723 1783
1724 { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, 1784 { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
1725 1785
1726 /* CNTP_TVAL */ 1786 /* Arch Tmers */
1727 { Op1( 0), CRn(14), CRm( 2), Op2( 0), access_cntp_tval }, 1787 { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer },
1728 /* CNTP_CTL */ 1788 { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer },
1729 { Op1( 0), CRn(14), CRm( 2), Op2( 1), access_cntp_ctl },
1730 1789
1731 /* PMEVCNTRn */ 1790 /* PMEVCNTRn */
1732 PMU_PMEVCNTR(0), 1791 PMU_PMEVCNTR(0),
@@ -1794,6 +1853,10 @@ static const struct sys_reg_desc cp15_regs[] = {
1794 PMU_PMEVTYPER(30), 1853 PMU_PMEVTYPER(30),
1795 /* PMCCFILTR */ 1854 /* PMCCFILTR */
1796 { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper }, 1855 { Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper },
1856
1857 { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
1858 { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },
1859 { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, c0_CSSELR },
1797}; 1860};
1798 1861
1799static const struct sys_reg_desc cp15_64_regs[] = { 1862static const struct sys_reg_desc cp15_64_regs[] = {
@@ -1803,7 +1866,7 @@ static const struct sys_reg_desc cp15_64_regs[] = {
1803 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 }, 1866 { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
1804 { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ 1867 { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
1805 { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ 1868 { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
1806 { Op1( 2), CRn( 0), CRm(14), Op2( 0), access_cntp_cval }, 1869 { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
1807}; 1870};
1808 1871
1809/* Target specific emulation tables */ 1872/* Target specific emulation tables */
@@ -1832,30 +1895,19 @@ static const struct sys_reg_desc *get_target_table(unsigned target,
1832 } 1895 }
1833} 1896}
1834 1897
1835#define reg_to_match_value(x) \
1836 ({ \
1837 unsigned long val; \
1838 val = (x)->Op0 << 14; \
1839 val |= (x)->Op1 << 11; \
1840 val |= (x)->CRn << 7; \
1841 val |= (x)->CRm << 3; \
1842 val |= (x)->Op2; \
1843 val; \
1844 })
1845
1846static int match_sys_reg(const void *key, const void *elt) 1898static int match_sys_reg(const void *key, const void *elt)
1847{ 1899{
1848 const unsigned long pval = (unsigned long)key; 1900 const unsigned long pval = (unsigned long)key;
1849 const struct sys_reg_desc *r = elt; 1901 const struct sys_reg_desc *r = elt;
1850 1902
1851 return pval - reg_to_match_value(r); 1903 return pval - reg_to_encoding(r);
1852} 1904}
1853 1905
1854static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, 1906static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
1855 const struct sys_reg_desc table[], 1907 const struct sys_reg_desc table[],
1856 unsigned int num) 1908 unsigned int num)
1857{ 1909{
1858 unsigned long pval = reg_to_match_value(params); 1910 unsigned long pval = reg_to_encoding(params);
1859 1911
1860 return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); 1912 return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
1861} 1913}
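
The removed reg_to_match_value() macro is folded into a reg_to_encoding() helper (presumably shared from a header not shown in this hunk); both pack the Op0/Op1/CRn/CRm/Op2 fields of a trapped system register into a single integer key (Op0<<14 | Op1<<11 | CRn<<7 | CRm<<3 | Op2, per the deleted macro) so the descriptor tables can be binary-searched. A minimal stand-alone sketch of that lookup scheme, with field widths taken from the macro above and a tiny illustrative table:

#include <stdio.h>
#include <stdlib.h>

struct reg_desc { unsigned op0, op1, crn, crm, op2; const char *name; };

/* Pack the five encoding fields into one comparable key, as the kernel does. */
static unsigned long reg_to_encoding(const struct reg_desc *r)
{
	return (r->op0 << 14) | (r->op1 << 11) | (r->crn << 7) |
	       (r->crm << 3) | r->op2;
}

static int match_reg(const void *key, const void *elt)
{
	return (int)((unsigned long)key - reg_to_encoding(elt));
}

int main(void)
{
	/* Table must be sorted by encoding, exactly like sys_reg_descs[]. */
	static const struct reg_desc table[] = {
		{ 3, 0,  1, 0, 0, "SCTLR_EL1" },
		{ 3, 0,  2, 0, 0, "TTBR0_EL1" },
		{ 3, 3, 14, 2, 1, "CNTP_CTL_EL0" },
	};
	struct reg_desc trapped = { 3, 3, 14, 2, 1, NULL };
	const struct reg_desc *hit;

	hit = bsearch((void *)reg_to_encoding(&trapped), table,
		      sizeof(table) / sizeof(table[0]), sizeof(table[0]),
		      match_reg);
	printf("%s\n", hit ? hit->name : "unhandled");
	return 0;
}
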
@@ -2218,11 +2270,15 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
2218 } 2270 }
2219 2271
2220FUNCTION_INVARIANT(midr_el1) 2272FUNCTION_INVARIANT(midr_el1)
2221FUNCTION_INVARIANT(ctr_el0)
2222FUNCTION_INVARIANT(revidr_el1) 2273FUNCTION_INVARIANT(revidr_el1)
2223FUNCTION_INVARIANT(clidr_el1) 2274FUNCTION_INVARIANT(clidr_el1)
2224FUNCTION_INVARIANT(aidr_el1) 2275FUNCTION_INVARIANT(aidr_el1)
2225 2276
2277static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
2278{
2279 ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
2280}
2281
2226/* ->val is filled in by kvm_sys_reg_table_init() */ 2282/* ->val is filled in by kvm_sys_reg_table_init() */
2227static struct sys_reg_desc invariant_sys_regs[] = { 2283static struct sys_reg_desc invariant_sys_regs[] = {
2228 { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, 2284 { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index d2abd98471e8..41204a49cf95 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
1134static inline void kvm_arch_sync_events(struct kvm *kvm) {} 1134static inline void kvm_arch_sync_events(struct kvm *kvm) {}
1135static inline void kvm_arch_free_memslot(struct kvm *kvm, 1135static inline void kvm_arch_free_memslot(struct kvm *kvm,
1136 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} 1136 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
1137static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} 1137static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
1138static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 1138static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
1139static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} 1139static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
1140static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} 1140static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0f98f00da2ea..e6b5bb012ccb 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -99,6 +99,8 @@ struct kvm_nested_guest;
99 99
100struct kvm_vm_stat { 100struct kvm_vm_stat {
101 ulong remote_tlb_flush; 101 ulong remote_tlb_flush;
102 ulong num_2M_pages;
103 ulong num_1G_pages;
102}; 104};
103 105
104struct kvm_vcpu_stat { 106struct kvm_vcpu_stat {
@@ -377,6 +379,7 @@ struct kvmppc_mmu {
377 void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs); 379 void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs);
378 u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr); 380 u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr);
379 u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr); 381 u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr);
382 int (*slbfee)(struct kvm_vcpu *vcpu, gva_t eaddr, ulong *ret_slb);
380 void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr); 383 void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr);
381 void (*slbia)(struct kvm_vcpu *vcpu); 384 void (*slbia)(struct kvm_vcpu *vcpu);
382 /* book3s */ 385 /* book3s */
@@ -837,7 +840,7 @@ struct kvm_vcpu_arch {
837static inline void kvm_arch_hardware_disable(void) {} 840static inline void kvm_arch_hardware_disable(void) {}
838static inline void kvm_arch_hardware_unsetup(void) {} 841static inline void kvm_arch_hardware_unsetup(void) {}
839static inline void kvm_arch_sync_events(struct kvm *kvm) {} 842static inline void kvm_arch_sync_events(struct kvm *kvm) {}
840static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} 843static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
841static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 844static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
842static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 845static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
843static inline void kvm_arch_exit(void) {} 846static inline void kvm_arch_exit(void) {}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a6c8548ed9fa..ac22b28ae78d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -36,6 +36,8 @@
36#endif 36#endif
37#ifdef CONFIG_KVM_BOOK3S_64_HANDLER 37#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
38#include <asm/paca.h> 38#include <asm/paca.h>
39#include <asm/xive.h>
40#include <asm/cpu_has_feature.h>
39#endif 41#endif
40 42
41/* 43/*
@@ -617,6 +619,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
617static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } 619static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
618#endif /* CONFIG_KVM_XIVE */ 620#endif /* CONFIG_KVM_XIVE */
619 621
622#if defined(CONFIG_PPC_POWERNV) && defined(CONFIG_KVM_BOOK3S_64_HANDLER)
623static inline bool xics_on_xive(void)
624{
625 return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE);
626}
627#else
628static inline bool xics_on_xive(void)
629{
630 return false;
631}
632#endif
633
620/* 634/*
621 * Prototypes for functions called only from assembler code. 635 * Prototypes for functions called only from assembler code.
622 * Having prototypes reduces sparse errors. 636 * Having prototypes reduces sparse errors.
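
The new xics_on_xive() wrapper narrows the old xive_enabled() checks: the native XIVE path is taken only when the host has a XIVE controller and runs in hypervisor mode (POWERNV), so a pseries or nested host that reaches XIVE only through hcalls falls back to the software XICS emulation. A small stand-alone sketch of that decision, with the two host predicates stubbed purely for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for the host feature tests (illustrative only). */
static bool xive_enabled(void) { return true;  } /* host has a XIVE controller */
static bool host_hv_mode(void) { return false; } /* e.g. a pseries (nested) host */

/* Mirrors the new helper: native XIVE exploitation needs both conditions. */
static bool xics_on_xive(void)
{
	return xive_enabled() && host_hv_mode();
}

int main(void)
{
	printf("guest XICS backend: %s\n",
	       xics_on_xive() ? "native XIVE" : "software XICS emulation");
	return 0;
}
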
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 8c876c166ef2..26ca425f4c2c 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -463,10 +463,12 @@ struct kvm_ppc_cpu_char {
463#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58) 463#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED (1ULL << 58)
464#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57) 464#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF (1ULL << 57)
465#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56) 465#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS (1ULL << 56)
466#define KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST (1ull << 54)
466 467
467#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63) 468#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY (1ULL << 63)
468#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62) 469#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR (1ULL << 62)
469#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61) 470#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ULL << 61)
471#define KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE (1ull << 58)
470 472
471/* Per-vcpu XICS interrupt controller state */ 473/* Per-vcpu XICS interrupt controller state */
472#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) 474#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 9a7dadbe1f17..10c5579d20ce 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -39,6 +39,7 @@
39#include "book3s.h" 39#include "book3s.h"
40#include "trace.h" 40#include "trace.h"
41 41
42#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
42#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 43#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
43 44
44/* #define EXIT_DEBUG */ 45/* #define EXIT_DEBUG */
@@ -71,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
71 { "pthru_all", VCPU_STAT(pthru_all) }, 72 { "pthru_all", VCPU_STAT(pthru_all) },
72 { "pthru_host", VCPU_STAT(pthru_host) }, 73 { "pthru_host", VCPU_STAT(pthru_host) },
73 { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, 74 { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) },
75 { "largepages_2M", VM_STAT(num_2M_pages) },
76 { "largepages_1G", VM_STAT(num_1G_pages) },
74 { NULL } 77 { NULL }
75}; 78};
76 79
@@ -642,7 +645,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
642 r = -ENXIO; 645 r = -ENXIO;
643 break; 646 break;
644 } 647 }
645 if (xive_enabled()) 648 if (xics_on_xive())
646 *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); 649 *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
647 else 650 else
648 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); 651 *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
@@ -715,7 +718,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
715 r = -ENXIO; 718 r = -ENXIO;
716 break; 719 break;
717 } 720 }
718 if (xive_enabled()) 721 if (xics_on_xive())
719 r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); 722 r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
720 else 723 else
721 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); 724 r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
@@ -991,7 +994,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
991int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, 994int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
992 bool line_status) 995 bool line_status)
993{ 996{
994 if (xive_enabled()) 997 if (xics_on_xive())
995 return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, 998 return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
996 line_status); 999 line_status);
997 else 1000 else
@@ -1044,7 +1047,7 @@ static int kvmppc_book3s_init(void)
1044 1047
1045#ifdef CONFIG_KVM_XICS 1048#ifdef CONFIG_KVM_XICS
1046#ifdef CONFIG_KVM_XIVE 1049#ifdef CONFIG_KVM_XIVE
1047 if (xive_enabled()) { 1050 if (xics_on_xive()) {
1048 kvmppc_xive_init_module(); 1051 kvmppc_xive_init_module();
1049 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); 1052 kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
1050 } else 1053 } else
@@ -1057,7 +1060,7 @@ static int kvmppc_book3s_init(void)
1057static void kvmppc_book3s_exit(void) 1060static void kvmppc_book3s_exit(void)
1058{ 1061{
1059#ifdef CONFIG_KVM_XICS 1062#ifdef CONFIG_KVM_XICS
1060 if (xive_enabled()) 1063 if (xics_on_xive())
1061 kvmppc_xive_exit_module(); 1064 kvmppc_xive_exit_module();
1062#endif 1065#endif
1063#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 1066#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 612169988a3d..6f789f674048 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -425,6 +425,7 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu)
425 mmu->slbmte = NULL; 425 mmu->slbmte = NULL;
426 mmu->slbmfee = NULL; 426 mmu->slbmfee = NULL;
427 mmu->slbmfev = NULL; 427 mmu->slbmfev = NULL;
428 mmu->slbfee = NULL;
428 mmu->slbie = NULL; 429 mmu->slbie = NULL;
429 mmu->slbia = NULL; 430 mmu->slbia = NULL;
430} 431}
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index c92dd25bed23..d4b967f0e8d4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -435,6 +435,19 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
435 kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); 435 kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT);
436} 436}
437 437
438static int kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr,
439 ulong *ret_slb)
440{
441 struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
442
443 if (slbe) {
444 *ret_slb = slbe->origv;
445 return 0;
446 }
447 *ret_slb = 0;
448 return -ENOENT;
449}
450
438static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) 451static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
439{ 452{
440 struct kvmppc_slb *slbe; 453 struct kvmppc_slb *slbe;
@@ -670,6 +683,7 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
670 mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; 683 mmu->slbmte = kvmppc_mmu_book3s_64_slbmte;
671 mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; 684 mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee;
672 mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; 685 mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev;
686 mmu->slbfee = kvmppc_mmu_book3s_64_slbfee;
673 mmu->slbie = kvmppc_mmu_book3s_64_slbie; 687 mmu->slbie = kvmppc_mmu_book3s_64_slbie;
674 mmu->slbia = kvmppc_mmu_book3s_64_slbia; 688 mmu->slbia = kvmppc_mmu_book3s_64_slbia;
675 mmu->xlate = kvmppc_mmu_book3s_64_xlate; 689 mmu->xlate = kvmppc_mmu_book3s_64_xlate;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bd2dcfbf00cd..be7bc070eae5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -442,6 +442,24 @@ int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
442 u32 last_inst; 442 u32 last_inst;
443 443
444 /* 444 /*
445 * Fast path - check if the guest physical address corresponds to a
446 * device on the FAST_MMIO_BUS, if so we can avoid loading the
 447 * instruction altogether; we can just handle it and return.
448 */
449 if (is_store) {
450 int idx, ret;
451
452 idx = srcu_read_lock(&vcpu->kvm->srcu);
453 ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
454 NULL);
455 srcu_read_unlock(&vcpu->kvm->srcu, idx);
456 if (!ret) {
457 kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
458 return RESUME_GUEST;
459 }
460 }
461
462 /*
445 * If we fail, we just return to the guest and try executing it again. 463 * If we fail, we just return to the guest and try executing it again.
446 */ 464 */
447 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != 465 if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 1b821c6efdef..f55ef071883f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -403,8 +403,13 @@ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
403 if (!memslot) 403 if (!memslot)
404 return; 404 return;
405 } 405 }
406 if (shift) 406 if (shift) { /* 1GB or 2MB page */
407 page_size = 1ul << shift; 407 page_size = 1ul << shift;
408 if (shift == PMD_SHIFT)
409 kvm->stat.num_2M_pages--;
410 else if (shift == PUD_SHIFT)
411 kvm->stat.num_1G_pages--;
412 }
408 413
409 gpa &= ~(page_size - 1); 414 gpa &= ~(page_size - 1);
410 hpa = old & PTE_RPN_MASK; 415 hpa = old & PTE_RPN_MASK;
@@ -878,6 +883,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
878 put_page(page); 883 put_page(page);
879 } 884 }
880 885
886 /* Increment number of large pages if we (successfully) inserted one */
887 if (!ret) {
888 if (level == 1)
889 kvm->stat.num_2M_pages++;
890 else if (level == 2)
891 kvm->stat.num_1G_pages++;
892 }
893
881 return ret; 894 return ret;
882} 895}
883 896
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 532ab79734c7..f02b04973710 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -133,7 +133,6 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
133 continue; 133 continue;
134 134
135 kref_put(&stit->kref, kvm_spapr_tce_liobn_put); 135 kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
136 return;
137 } 136 }
138 } 137 }
139 } 138 }
@@ -338,14 +337,15 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
338 } 337 }
339 } 338 }
340 339
340 kvm_get_kvm(kvm);
341 if (!ret) 341 if (!ret)
342 ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, 342 ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
343 stt, O_RDWR | O_CLOEXEC); 343 stt, O_RDWR | O_CLOEXEC);
344 344
345 if (ret >= 0) { 345 if (ret >= 0)
346 list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); 346 list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
347 kvm_get_kvm(kvm); 347 else
348 } 348 kvm_put_kvm(kvm);
349 349
350 mutex_unlock(&kvm->lock); 350 mutex_unlock(&kvm->lock);
351 351
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 8c7e933e942e..6ef7c5f00a49 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -47,6 +47,7 @@
47#define OP_31_XOP_SLBMFEV 851 47#define OP_31_XOP_SLBMFEV 851
48#define OP_31_XOP_EIOIO 854 48#define OP_31_XOP_EIOIO 854
49#define OP_31_XOP_SLBMFEE 915 49#define OP_31_XOP_SLBMFEE 915
50#define OP_31_XOP_SLBFEE 979
50 51
51#define OP_31_XOP_TBEGIN 654 52#define OP_31_XOP_TBEGIN 654
52#define OP_31_XOP_TABORT 910 53#define OP_31_XOP_TABORT 910
@@ -416,6 +417,23 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
416 417
417 vcpu->arch.mmu.slbia(vcpu); 418 vcpu->arch.mmu.slbia(vcpu);
418 break; 419 break;
420 case OP_31_XOP_SLBFEE:
421 if (!(inst & 1) || !vcpu->arch.mmu.slbfee) {
422 return EMULATE_FAIL;
423 } else {
424 ulong b, t;
425 ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK;
426
427 b = kvmppc_get_gpr(vcpu, rb);
428 if (!vcpu->arch.mmu.slbfee(vcpu, b, &t))
429 cr |= 2 << CR0_SHIFT;
430 kvmppc_set_gpr(vcpu, rt, t);
431 /* copy XER[SO] bit to CR0[SO] */
432 cr |= (vcpu->arch.regs.xer & 0x80000000) >>
433 (31 - CR0_SHIFT);
434 kvmppc_set_cr(vcpu, cr);
435 }
436 break;
419 case OP_31_XOP_SLBMFEE: 437 case OP_31_XOP_SLBMFEE:
420 if (!vcpu->arch.mmu.slbmfee) { 438 if (!vcpu->arch.mmu.slbmfee) {
421 emulated = EMULATE_FAIL; 439 emulated = EMULATE_FAIL;
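
The new OP_31_XOP_SLBFEE case sets CR0 much like the hardware slbfee. instruction: the EQ bit is set when a matching SLB entry is found, and XER[SO] is copied into CR0[SO]. A stand-alone sketch of that condition-register arithmetic; CR0 occupying the top nibble of CR (CR0_SHIFT of 28) is an assumption matching the usual PowerPC layout:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CR0_SHIFT 28
#define CR0_MASK  (0xfU << CR0_SHIFT)

/* New CR value after an emulated slbfee., as in the diff: EQ (value 2 within
 * the CR0 nibble) when the SLB lookup succeeded, and XER[SO] (bit 31)
 * mirrored into CR0[SO] (value 1 within the CR0 nibble). */
static uint32_t slbfee_cr(uint32_t cr, uint32_t xer, bool found)
{
	cr &= ~CR0_MASK;
	if (found)
		cr |= 2u << CR0_SHIFT;
	cr |= (xer & 0x80000000u) >> (31 - CR0_SHIFT);
	return cr;
}

int main(void)
{
	printf("found, SO clear: CR0 = %x\n", slbfee_cr(0, 0, true) >> CR0_SHIFT);
	printf("missed, SO set:  CR0 = %x\n",
	       slbfee_cr(0, 0x80000000u, false) >> CR0_SHIFT);
	return 0;
}
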
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a3d5318f5d1e..06964350b97a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
922 case H_IPOLL: 922 case H_IPOLL:
923 case H_XIRR_X: 923 case H_XIRR_X:
924 if (kvmppc_xics_enabled(vcpu)) { 924 if (kvmppc_xics_enabled(vcpu)) {
925 if (xive_enabled()) { 925 if (xics_on_xive()) {
926 ret = H_NOT_AVAILABLE; 926 ret = H_NOT_AVAILABLE;
927 return RESUME_GUEST; 927 return RESUME_GUEST;
928 } 928 }
@@ -937,6 +937,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
937 ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), 937 ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
938 kvmppc_get_gpr(vcpu, 5)); 938 kvmppc_get_gpr(vcpu, 5));
939 break; 939 break;
940#ifdef CONFIG_SPAPR_TCE_IOMMU
940 case H_GET_TCE: 941 case H_GET_TCE:
941 ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), 942 ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
942 kvmppc_get_gpr(vcpu, 5)); 943 kvmppc_get_gpr(vcpu, 5));
@@ -966,6 +967,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
966 if (ret == H_TOO_HARD) 967 if (ret == H_TOO_HARD)
967 return RESUME_HOST; 968 return RESUME_HOST;
968 break; 969 break;
970#endif
969 case H_RANDOM: 971 case H_RANDOM:
970 if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) 972 if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
971 ret = H_HARDWARE; 973 ret = H_HARDWARE;
@@ -1445,7 +1447,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
1445 case BOOK3S_INTERRUPT_HV_RM_HARD: 1447 case BOOK3S_INTERRUPT_HV_RM_HARD:
1446 vcpu->arch.trap = 0; 1448 vcpu->arch.trap = 0;
1447 r = RESUME_GUEST; 1449 r = RESUME_GUEST;
1448 if (!xive_enabled()) 1450 if (!xics_on_xive())
1449 kvmppc_xics_rm_complete(vcpu, 0); 1451 kvmppc_xics_rm_complete(vcpu, 0);
1450 break; 1452 break;
1451 default: 1453 default:
@@ -3648,11 +3650,12 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
3648 3650
3649static void grow_halt_poll_ns(struct kvmppc_vcore *vc) 3651static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
3650{ 3652{
3651 /* 10us base */ 3653 if (!halt_poll_ns_grow)
3652 if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) 3654 return;
3653 vc->halt_poll_ns = 10000; 3655
3654 else 3656 vc->halt_poll_ns *= halt_poll_ns_grow;
3655 vc->halt_poll_ns *= halt_poll_ns_grow; 3657 if (vc->halt_poll_ns < halt_poll_ns_grow_start)
3658 vc->halt_poll_ns = halt_poll_ns_grow_start;
3656} 3659}
3657 3660
3658static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) 3661static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
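
With this change the vcore-level helper mirrors the reworked generic grow_halt_poll_ns(): a zero halt_poll_ns_grow disables growth entirely, and a freshly growing value is clamped up to halt_poll_ns_grow_start instead of a hard-coded 10us base. A stand-alone simulation of successive grow steps; the defaults of grow=2 and grow_start=10000 ns are assumptions, not taken from this diff:

#include <stdio.h>

static unsigned int halt_poll_ns_grow = 2;            /* assumed default */
static unsigned int halt_poll_ns_grow_start = 10000;  /* assumed default, 10 us */

/* Same shape as the reworked grow_halt_poll_ns() in the diff. */
static void grow_halt_poll_ns(unsigned long *halt_poll_ns)
{
	if (!halt_poll_ns_grow)
		return;

	*halt_poll_ns *= halt_poll_ns_grow;
	if (*halt_poll_ns < halt_poll_ns_grow_start)
		*halt_poll_ns = halt_poll_ns_grow_start;
}

int main(void)
{
	unsigned long ns = 0;
	int i;

	for (i = 0; i < 4; i++) {
		grow_halt_poll_ns(&ns);
		printf("step %d: halt_poll_ns = %lu\n", i, ns);
	}
	return 0; /* prints 10000, 20000, 40000, 80000 */
}
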
@@ -3666,7 +3669,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
3666#ifdef CONFIG_KVM_XICS 3669#ifdef CONFIG_KVM_XICS
3667static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) 3670static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
3668{ 3671{
3669 if (!xive_enabled()) 3672 if (!xics_on_xive())
3670 return false; 3673 return false;
3671 return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < 3674 return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
3672 vcpu->arch.xive_saved_state.cppr; 3675 vcpu->arch.xive_saved_state.cppr;
@@ -4226,7 +4229,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
4226 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 4229 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
4227 srcu_read_unlock(&kvm->srcu, srcu_idx); 4230 srcu_read_unlock(&kvm->srcu, srcu_idx);
4228 } else if (r == RESUME_PASSTHROUGH) { 4231 } else if (r == RESUME_PASSTHROUGH) {
4229 if (WARN_ON(xive_enabled())) 4232 if (WARN_ON(xics_on_xive()))
4230 r = H_SUCCESS; 4233 r = H_SUCCESS;
4231 else 4234 else
4232 r = kvmppc_xics_rm_complete(vcpu, 0); 4235 r = kvmppc_xics_rm_complete(vcpu, 0);
@@ -4750,7 +4753,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
4750 * If xive is enabled, we route 0x500 interrupts directly 4753 * If xive is enabled, we route 0x500 interrupts directly
4751 * to the guest. 4754 * to the guest.
4752 */ 4755 */
4753 if (xive_enabled()) 4756 if (xics_on_xive())
4754 lpcr |= LPCR_LPES; 4757 lpcr |= LPCR_LPES;
4755 } 4758 }
4756 4759
@@ -4986,7 +4989,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
4986 if (i == pimap->n_mapped) 4989 if (i == pimap->n_mapped)
4987 pimap->n_mapped++; 4990 pimap->n_mapped++;
4988 4991
4989 if (xive_enabled()) 4992 if (xics_on_xive())
4990 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); 4993 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
4991 else 4994 else
4992 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); 4995 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
@@ -5027,7 +5030,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
5027 return -ENODEV; 5030 return -ENODEV;
5028 } 5031 }
5029 5032
5030 if (xive_enabled()) 5033 if (xics_on_xive())
5031 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); 5034 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
5032 else 5035 else
5033 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); 5036 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
@@ -5359,13 +5362,11 @@ static int kvm_init_subcore_bitmap(void)
5359 continue; 5362 continue;
5360 5363
5361 sibling_subcore_state = 5364 sibling_subcore_state =
5362 kmalloc_node(sizeof(struct sibling_subcore_state), 5365 kzalloc_node(sizeof(struct sibling_subcore_state),
5363 GFP_KERNEL, node); 5366 GFP_KERNEL, node);
5364 if (!sibling_subcore_state) 5367 if (!sibling_subcore_state)
5365 return -ENOMEM; 5368 return -ENOMEM;
5366 5369
5367 memset(sibling_subcore_state, 0,
5368 sizeof(struct sibling_subcore_state));
5369 5370
5370 for (j = 0; j < threads_per_core; j++) { 5371 for (j = 0; j < threads_per_core; j++) {
5371 int cpu = first_cpu + j; 5372 int cpu = first_cpu + j;
@@ -5406,7 +5407,7 @@ static int kvmppc_book3s_init_hv(void)
5406 * indirectly, via OPAL. 5407 * indirectly, via OPAL.
5407 */ 5408 */
5408#ifdef CONFIG_SMP 5409#ifdef CONFIG_SMP
5409 if (!xive_enabled() && !kvmhv_on_pseries() && 5410 if (!xics_on_xive() && !kvmhv_on_pseries() &&
5410 !local_paca->kvm_hstate.xics_phys) { 5411 !local_paca->kvm_hstate.xics_phys) {
5411 struct device_node *np; 5412 struct device_node *np;
5412 5413
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index a71e2fc00a4e..b0cf22477e87 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu)
257 } 257 }
258 258
259 /* We should never reach this */ 259 /* We should never reach this */
260 if (WARN_ON_ONCE(xive_enabled())) 260 if (WARN_ON_ONCE(xics_on_xive()))
261 return; 261 return;
262 262
263 /* Else poke the target with an IPI */ 263 /* Else poke the target with an IPI */
@@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
577{ 577{
578 if (!kvmppc_xics_enabled(vcpu)) 578 if (!kvmppc_xics_enabled(vcpu))
579 return H_TOO_HARD; 579 return H_TOO_HARD;
580 if (xive_enabled()) { 580 if (xics_on_xive()) {
581 if (is_rm()) 581 if (is_rm())
582 return xive_rm_h_xirr(vcpu); 582 return xive_rm_h_xirr(vcpu);
583 if (unlikely(!__xive_vm_h_xirr)) 583 if (unlikely(!__xive_vm_h_xirr))
@@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
592 if (!kvmppc_xics_enabled(vcpu)) 592 if (!kvmppc_xics_enabled(vcpu))
593 return H_TOO_HARD; 593 return H_TOO_HARD;
594 vcpu->arch.regs.gpr[5] = get_tb(); 594 vcpu->arch.regs.gpr[5] = get_tb();
595 if (xive_enabled()) { 595 if (xics_on_xive()) {
596 if (is_rm()) 596 if (is_rm())
597 return xive_rm_h_xirr(vcpu); 597 return xive_rm_h_xirr(vcpu);
598 if (unlikely(!__xive_vm_h_xirr)) 598 if (unlikely(!__xive_vm_h_xirr))
@@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
606{ 606{
607 if (!kvmppc_xics_enabled(vcpu)) 607 if (!kvmppc_xics_enabled(vcpu))
608 return H_TOO_HARD; 608 return H_TOO_HARD;
609 if (xive_enabled()) { 609 if (xics_on_xive()) {
610 if (is_rm()) 610 if (is_rm())
611 return xive_rm_h_ipoll(vcpu, server); 611 return xive_rm_h_ipoll(vcpu, server);
612 if (unlikely(!__xive_vm_h_ipoll)) 612 if (unlikely(!__xive_vm_h_ipoll))
@@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
621{ 621{
622 if (!kvmppc_xics_enabled(vcpu)) 622 if (!kvmppc_xics_enabled(vcpu))
623 return H_TOO_HARD; 623 return H_TOO_HARD;
624 if (xive_enabled()) { 624 if (xics_on_xive()) {
625 if (is_rm()) 625 if (is_rm())
626 return xive_rm_h_ipi(vcpu, server, mfrr); 626 return xive_rm_h_ipi(vcpu, server, mfrr);
627 if (unlikely(!__xive_vm_h_ipi)) 627 if (unlikely(!__xive_vm_h_ipi))
@@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
635{ 635{
636 if (!kvmppc_xics_enabled(vcpu)) 636 if (!kvmppc_xics_enabled(vcpu))
637 return H_TOO_HARD; 637 return H_TOO_HARD;
638 if (xive_enabled()) { 638 if (xics_on_xive()) {
639 if (is_rm()) 639 if (is_rm())
640 return xive_rm_h_cppr(vcpu, cppr); 640 return xive_rm_h_cppr(vcpu, cppr);
641 if (unlikely(!__xive_vm_h_cppr)) 641 if (unlikely(!__xive_vm_h_cppr))
@@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
649{ 649{
650 if (!kvmppc_xics_enabled(vcpu)) 650 if (!kvmppc_xics_enabled(vcpu))
651 return H_TOO_HARD; 651 return H_TOO_HARD;
652 if (xive_enabled()) { 652 if (xics_on_xive()) {
653 if (is_rm()) 653 if (is_rm())
654 return xive_rm_h_eoi(vcpu, xirr); 654 return xive_rm_h_eoi(vcpu, xirr);
655 if (unlikely(!__xive_vm_h_eoi)) 655 if (unlikely(!__xive_vm_h_eoi))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index b3f5786b20dc..3b9662a4207e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
144 return; 144 return;
145 } 145 }
146 146
147 if (xive_enabled() && kvmhv_on_pseries()) {
148 /* No XICS access or hypercalls available, too hard */
149 this_icp->rm_action |= XICS_RM_KICK_VCPU;
150 this_icp->rm_kick_target = vcpu;
151 return;
152 }
153
147 /* 154 /*
148 * Check if the core is loaded, 155 * Check if the core is loaded,
149 * if not, find an available host core to post to wake the VCPU, 156 * if not, find an available host core to post to wake the VCPU,
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 25043b50cb30..3a5e719ef032 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -2272,8 +2272,13 @@ hcall_real_table:
2272 .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table 2272 .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
2273 .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table 2273 .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
2274 .long DOTSYM(kvmppc_h_protect) - hcall_real_table 2274 .long DOTSYM(kvmppc_h_protect) - hcall_real_table
2275#ifdef CONFIG_SPAPR_TCE_IOMMU
2275 .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table 2276 .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table
2276 .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table 2277 .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table
2278#else
2279 .long 0 /* 0x1c */
2280 .long 0 /* 0x20 */
2281#endif
2277 .long 0 /* 0x24 - H_SET_SPRG0 */ 2282 .long 0 /* 0x24 - H_SET_SPRG0 */
2278 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table 2283 .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
2279 .long 0 /* 0x2c */ 2284 .long 0 /* 0x2c */
@@ -2351,8 +2356,13 @@ hcall_real_table:
2351 .long 0 /* 0x12c */ 2356 .long 0 /* 0x12c */
2352 .long 0 /* 0x130 */ 2357 .long 0 /* 0x130 */
2353 .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table 2358 .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
2359#ifdef CONFIG_SPAPR_TCE_IOMMU
2354 .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table 2360 .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table
2355 .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table 2361 .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
2362#else
2363 .long 0 /* 0x138 */
2364 .long 0 /* 0x13c */
2365#endif
2356 .long 0 /* 0x140 */ 2366 .long 0 /* 0x140 */
2357 .long 0 /* 0x144 */ 2367 .long 0 /* 0x144 */
2358 .long 0 /* 0x148 */ 2368 .long 0 /* 0x148 */
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 2d3b2b1cc272..4e178c4c1ea5 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
33 server = be32_to_cpu(args->args[1]); 33 server = be32_to_cpu(args->args[1]);
34 priority = be32_to_cpu(args->args[2]); 34 priority = be32_to_cpu(args->args[2]);
35 35
36 if (xive_enabled()) 36 if (xics_on_xive())
37 rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); 37 rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
38 else 38 else
39 rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); 39 rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
@@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
56 irq = be32_to_cpu(args->args[0]); 56 irq = be32_to_cpu(args->args[0]);
57 57
58 server = priority = 0; 58 server = priority = 0;
59 if (xive_enabled()) 59 if (xics_on_xive())
60 rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); 60 rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
61 else 61 else
62 rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); 62 rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
@@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
83 83
84 irq = be32_to_cpu(args->args[0]); 84 irq = be32_to_cpu(args->args[0]);
85 85
86 if (xive_enabled()) 86 if (xics_on_xive())
87 rc = kvmppc_xive_int_off(vcpu->kvm, irq); 87 rc = kvmppc_xive_int_off(vcpu->kvm, irq);
88 else 88 else
89 rc = kvmppc_xics_int_off(vcpu->kvm, irq); 89 rc = kvmppc_xics_int_off(vcpu->kvm, irq);
@@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
105 105
106 irq = be32_to_cpu(args->args[0]); 106 irq = be32_to_cpu(args->args[0]);
107 107
108 if (xive_enabled()) 108 if (xics_on_xive())
109 rc = kvmppc_xive_int_on(vcpu->kvm, irq); 109 rc = kvmppc_xive_int_on(vcpu->kvm, irq);
110 else 110 else
111 rc = kvmppc_xics_int_on(vcpu->kvm, irq); 111 rc = kvmppc_xics_int_on(vcpu->kvm, irq);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b90a7d154180..8885377ec3e0 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
748 kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); 748 kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
749 break; 749 break;
750 case KVMPPC_IRQ_XICS: 750 case KVMPPC_IRQ_XICS:
751 if (xive_enabled()) 751 if (xics_on_xive())
752 kvmppc_xive_cleanup_vcpu(vcpu); 752 kvmppc_xive_cleanup_vcpu(vcpu);
753 else 753 else
754 kvmppc_xics_free_icp(vcpu); 754 kvmppc_xics_free_icp(vcpu);
@@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
1931 r = -EPERM; 1931 r = -EPERM;
1932 dev = kvm_device_from_filp(f.file); 1932 dev = kvm_device_from_filp(f.file);
1933 if (dev) { 1933 if (dev) {
1934 if (xive_enabled()) 1934 if (xics_on_xive())
1935 r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); 1935 r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
1936 else 1936 else
1937 r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); 1937 r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
@@ -2189,10 +2189,12 @@ static int pseries_get_cpu_char(struct kvm_ppc_cpu_char *cp)
2189 KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | 2189 KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
2190 KVM_PPC_CPU_CHAR_BR_HINT_HONOURED | 2190 KVM_PPC_CPU_CHAR_BR_HINT_HONOURED |
2191 KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF | 2191 KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF |
2192 KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; 2192 KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
2193 KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
2193 cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | 2194 cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
2194 KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | 2195 KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
2195 KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; 2196 KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
2197 KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
2196 } 2198 }
2197 return 0; 2199 return 0;
2198} 2200}
@@ -2251,12 +2253,16 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
2251 if (have_fw_feat(fw_features, "enabled", 2253 if (have_fw_feat(fw_features, "enabled",
2252 "fw-count-cache-disabled")) 2254 "fw-count-cache-disabled"))
2253 cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; 2255 cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
2256 if (have_fw_feat(fw_features, "enabled",
2257 "fw-count-cache-flush-bcctr2,0,0"))
2258 cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
2254 cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 | 2259 cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
2255 KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED | 2260 KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
2256 KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 | 2261 KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
2257 KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 | 2262 KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
2258 KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV | 2263 KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
2259 KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS; 2264 KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
2265 KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
2260 2266
2261 if (have_fw_feat(fw_features, "enabled", 2267 if (have_fw_feat(fw_features, "enabled",
2262 "speculation-policy-favor-security")) 2268 "speculation-policy-favor-security"))
@@ -2267,9 +2273,13 @@ static int kvmppc_get_cpu_char(struct kvm_ppc_cpu_char *cp)
2267 if (!have_fw_feat(fw_features, "disabled", 2273 if (!have_fw_feat(fw_features, "disabled",
2268 "needs-spec-barrier-for-bound-checks")) 2274 "needs-spec-barrier-for-bound-checks"))
2269 cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; 2275 cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
2276 if (have_fw_feat(fw_features, "enabled",
2277 "needs-count-cache-flush-on-context-switch"))
2278 cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
2270 cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY | 2279 cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
2271 KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR | 2280 KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
2272 KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR; 2281 KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
2282 KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
2273 2283
2274 of_node_put(fw_features); 2284 of_node_put(fw_features);
2275 } 2285 }
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index 225667652069..1727180e8ca1 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -331,5 +331,6 @@ extern void css_schedule_reprobe(void);
331/* Function from drivers/s390/cio/chsc.c */ 331/* Function from drivers/s390/cio/chsc.c */
332int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); 332int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
333int chsc_sstpi(void *page, void *result, size_t size); 333int chsc_sstpi(void *page, void *result, size_t size);
334int chsc_sgib(u32 origin);
334 335
335#endif 336#endif
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 2f7f27e5493f..afaf5e3c57fd 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -62,6 +62,7 @@ enum interruption_class {
62 IRQIO_MSI, 62 IRQIO_MSI,
63 IRQIO_VIR, 63 IRQIO_VIR,
64 IRQIO_VAI, 64 IRQIO_VAI,
65 IRQIO_GAL,
65 NMI_NMI, 66 NMI_NMI,
66 CPU_RST, 67 CPU_RST,
67 NR_ARCH_IRQS 68 NR_ARCH_IRQS
diff --git a/arch/s390/include/asm/isc.h b/arch/s390/include/asm/isc.h
index 6cb9e2ed05b6..b2cc1ec78d06 100644
--- a/arch/s390/include/asm/isc.h
+++ b/arch/s390/include/asm/isc.h
@@ -21,6 +21,7 @@
21/* Adapter interrupts. */ 21/* Adapter interrupts. */
22#define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ 22#define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */
23#define PCI_ISC 2 /* PCI I/O subchannels */ 23#define PCI_ISC 2 /* PCI I/O subchannels */
24#define GAL_ISC 5 /* GIB alert */
24#define AP_ISC 6 /* adjunct processor (crypto) devices */ 25#define AP_ISC 6 /* adjunct processor (crypto) devices */
25 26
26/* Functions for registration of I/O interruption subclasses */ 27/* Functions for registration of I/O interruption subclasses */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d5d24889c3bc..c47e22bba87f 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -591,7 +591,6 @@ struct kvm_s390_float_interrupt {
591 struct kvm_s390_mchk_info mchk; 591 struct kvm_s390_mchk_info mchk;
592 struct kvm_s390_ext_info srv_signal; 592 struct kvm_s390_ext_info srv_signal;
593 int next_rr_cpu; 593 int next_rr_cpu;
594 unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
595 struct mutex ais_lock; 594 struct mutex ais_lock;
596 u8 simm; 595 u8 simm;
597 u8 nimm; 596 u8 nimm;
@@ -712,6 +711,7 @@ struct s390_io_adapter {
712struct kvm_s390_cpu_model { 711struct kvm_s390_cpu_model {
713 /* facility mask supported by kvm & hosting machine */ 712 /* facility mask supported by kvm & hosting machine */
714 __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64]; 713 __u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
714 struct kvm_s390_vm_cpu_subfunc subfuncs;
715 /* facility list requested by guest (in dma page) */ 715 /* facility list requested by guest (in dma page) */
716 __u64 *fac_list; 716 __u64 *fac_list;
717 u64 cpuid; 717 u64 cpuid;
@@ -782,9 +782,21 @@ struct kvm_s390_gisa {
782 u8 reserved03[11]; 782 u8 reserved03[11];
783 u32 airq_count; 783 u32 airq_count;
784 } g1; 784 } g1;
785 struct {
786 u64 word[4];
787 } u64;
785 }; 788 };
786}; 789};
787 790
791struct kvm_s390_gib {
792 u32 alert_list_origin;
793 u32 reserved01;
794 u8:5;
795 u8 nisc:3;
796 u8 reserved03[3];
797 u32 reserved04[5];
798};
799
788/* 800/*
789 * sie_page2 has to be allocated as DMA because fac_list, crycb and 801 * sie_page2 has to be allocated as DMA because fac_list, crycb and
790 * gisa need 31bit addresses in the sie control block. 802 * gisa need 31bit addresses in the sie control block.
@@ -793,7 +805,8 @@ struct sie_page2 {
793 __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */ 805 __u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64]; /* 0x0000 */
794 struct kvm_s390_crypto_cb crycb; /* 0x0800 */ 806 struct kvm_s390_crypto_cb crycb; /* 0x0800 */
795 struct kvm_s390_gisa gisa; /* 0x0900 */ 807 struct kvm_s390_gisa gisa; /* 0x0900 */
796 u8 reserved920[0x1000 - 0x920]; /* 0x0920 */ 808 struct kvm *kvm; /* 0x0920 */
809 u8 reserved928[0x1000 - 0x928]; /* 0x0928 */
797}; 810};
798 811
799struct kvm_s390_vsie { 812struct kvm_s390_vsie {
@@ -804,6 +817,20 @@ struct kvm_s390_vsie {
804 struct page *pages[KVM_MAX_VCPUS]; 817 struct page *pages[KVM_MAX_VCPUS];
805}; 818};
806 819
820struct kvm_s390_gisa_iam {
821 u8 mask;
822 spinlock_t ref_lock;
823 u32 ref_count[MAX_ISC + 1];
824};
825
826struct kvm_s390_gisa_interrupt {
827 struct kvm_s390_gisa *origin;
828 struct kvm_s390_gisa_iam alert;
829 struct hrtimer timer;
830 u64 expires;
831 DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS);
832};
833
807struct kvm_arch{ 834struct kvm_arch{
808 void *sca; 835 void *sca;
809 int use_esca; 836 int use_esca;
@@ -837,7 +864,8 @@ struct kvm_arch{
837 atomic64_t cmma_dirty_pages; 864 atomic64_t cmma_dirty_pages;
838 /* subset of available cpu features enabled by user space */ 865 /* subset of available cpu features enabled by user space */
839 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); 866 DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
840 struct kvm_s390_gisa *gisa; 867 DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
868 struct kvm_s390_gisa_interrupt gisa_int;
841}; 869};
842 870
843#define KVM_HVA_ERR_BAD (-1UL) 871#define KVM_HVA_ERR_BAD (-1UL)
@@ -871,6 +899,9 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm,
871extern int sie64a(struct kvm_s390_sie_block *, u64 *); 899extern int sie64a(struct kvm_s390_sie_block *, u64 *);
872extern char sie_exit; 900extern char sie_exit;
873 901
902extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
903extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
904
874static inline void kvm_arch_hardware_disable(void) {} 905static inline void kvm_arch_hardware_disable(void) {}
875static inline void kvm_arch_check_processor_compat(void *rtn) {} 906static inline void kvm_arch_check_processor_compat(void *rtn) {}
876static inline void kvm_arch_sync_events(struct kvm *kvm) {} 907static inline void kvm_arch_sync_events(struct kvm *kvm) {}
@@ -878,7 +909,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
878static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} 909static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
879static inline void kvm_arch_free_memslot(struct kvm *kvm, 910static inline void kvm_arch_free_memslot(struct kvm *kvm,
880 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} 911 struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
881static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} 912static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
882static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} 913static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
883static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 914static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
884 struct kvm_memory_slot *slot) {} 915 struct kvm_memory_slot *slot) {}
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 0e8d68bac82c..0cd5a5f96729 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -88,6 +88,7 @@ static const struct irq_class irqclass_sub_desc[] = {
88 {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, 88 {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" },
89 {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, 89 {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
90 {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, 90 {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"},
91 {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"},
91 {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, 92 {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"},
92 {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, 93 {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"},
93}; 94};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index fcb55b02990e..82162867f378 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -7,6 +7,9 @@
7 * Author(s): Carsten Otte <cotte@de.ibm.com> 7 * Author(s): Carsten Otte <cotte@de.ibm.com>
8 */ 8 */
9 9
10#define KMSG_COMPONENT "kvm-s390"
11#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
12
10#include <linux/interrupt.h> 13#include <linux/interrupt.h>
11#include <linux/kvm_host.h> 14#include <linux/kvm_host.h>
12#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
@@ -23,6 +26,7 @@
23#include <asm/gmap.h> 26#include <asm/gmap.h>
24#include <asm/switch_to.h> 27#include <asm/switch_to.h>
25#include <asm/nmi.h> 28#include <asm/nmi.h>
29#include <asm/airq.h>
26#include "kvm-s390.h" 30#include "kvm-s390.h"
27#include "gaccess.h" 31#include "gaccess.h"
28#include "trace-s390.h" 32#include "trace-s390.h"
@@ -31,6 +35,8 @@
31#define PFAULT_DONE 0x0680 35#define PFAULT_DONE 0x0680
32#define VIRTIO_PARAM 0x0d00 36#define VIRTIO_PARAM 0x0d00
33 37
38static struct kvm_s390_gib *gib;
39
34/* handle external calls via sigp interpretation facility */ 40/* handle external calls via sigp interpretation facility */
35static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id) 41static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
36{ 42{
@@ -217,22 +223,100 @@ static inline u8 int_word_to_isc(u32 int_word)
217 */ 223 */
218#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE) 224#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE)
219 225
220static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) 226/**
227 * gisa_set_iam - change the GISA interruption alert mask
228 *
229 * @gisa: gisa to operate on
230 * @iam: new IAM value to use
231 *
232 * Change the IAM atomically with the next alert address and the IPM
233 * of the GISA if the GISA is not part of the GIB alert list. All three
234 * fields are located in the first long word of the GISA.
235 *
236 * Returns: 0 on success
237 * -EBUSY in case the gisa is part of the alert list
238 */
239static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam)
240{
241 u64 word, _word;
242
243 do {
244 word = READ_ONCE(gisa->u64.word[0]);
245 if ((u64)gisa != word >> 32)
246 return -EBUSY;
247 _word = (word & ~0xffUL) | iam;
248 } while (cmpxchg(&gisa->u64.word[0], word, _word) != word);
249
250 return 0;
251}
252
253/**
254 * gisa_clear_ipm - clear the GISA interruption pending mask
255 *
256 * @gisa: gisa to operate on
257 *
258 * Clear the IPM atomically with the next alert address and the IAM
259 * of the GISA unconditionally. All three fields are located in the
260 * first long word of the GISA.
261 */
262static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa)
263{
264 u64 word, _word;
265
266 do {
267 word = READ_ONCE(gisa->u64.word[0]);
268 _word = word & ~(0xffUL << 24);
269 } while (cmpxchg(&gisa->u64.word[0], word, _word) != word);
270}
271
272/**
273 * gisa_get_ipm_or_restore_iam - return IPM or restore GISA IAM
274 *
275 * @gi: gisa interrupt struct to work on
276 *
277 * Atomically restores the interruption alert mask if none of the
 278 * relevant ISCs are pending and returns the IPM.
279 *
280 * Returns: the relevant pending ISCs
281 */
282static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi)
283{
284 u8 pending_mask, alert_mask;
285 u64 word, _word;
286
287 do {
288 word = READ_ONCE(gi->origin->u64.word[0]);
289 alert_mask = READ_ONCE(gi->alert.mask);
290 pending_mask = (u8)(word >> 24) & alert_mask;
291 if (pending_mask)
292 return pending_mask;
293 _word = (word & ~0xffUL) | alert_mask;
294 } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word);
295
296 return 0;
297}
298
299static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa)
300{
301 return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa;
302}
303
304static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
221{ 305{
222 set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); 306 set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
223} 307}
224 308
225static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa) 309static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa)
226{ 310{
227 return READ_ONCE(gisa->ipm); 311 return READ_ONCE(gisa->ipm);
228} 312}
229 313
230static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) 314static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
231{ 315{
232 clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); 316 clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
233} 317}
234 318
235static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) 319static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
236{ 320{
237 return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); 321 return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
238} 322}
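
All three new GISA helpers rely on the same pattern: the next-alert address, IPM and IAM live in the GISA's first 64-bit word, so it is read once and updated with a cmpxchg retry loop, keeping the update atomic against the machine placing the GISA on the GIB alert list concurrently. A user-space sketch of that read-modify-compare-and-swap loop using C11 atomics; the field layout below is simplified for illustration:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the first GISA word:
 * bits 63..32 next-alert address, bits 31..24 IPM, bits 7..0 IAM. */
static _Atomic uint64_t gisa_word0;

#define GISA_SELF_ADDR 0x12345678u /* illustrative "own address" value */

/* Install a new IAM only while the GISA is not on the alert list,
 * i.e. while next_alert still points back at the GISA itself. */
static int gisa_set_iam(uint8_t iam)
{
	uint64_t word, new_word;

	do {
		word = atomic_load(&gisa_word0);
		if ((uint32_t)(word >> 32) != GISA_SELF_ADDR)
			return -1; /* -EBUSY in the kernel */
		new_word = (word & ~0xffULL) | iam;
	} while (!atomic_compare_exchange_weak(&gisa_word0, &word, new_word));

	return 0;
}

int main(void)
{
	atomic_store(&gisa_word0, (uint64_t)GISA_SELF_ADDR << 32);
	printf("set_iam: %d, word0 = %016llx\n", gisa_set_iam(0x3f),
	       (unsigned long long)atomic_load(&gisa_word0));
	return 0;
}
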
@@ -245,8 +329,13 @@ static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu)
245 329
246static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) 330static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
247{ 331{
248 return pending_irqs_no_gisa(vcpu) | 332 struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int;
249 kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; 333 unsigned long pending_mask;
334
335 pending_mask = pending_irqs_no_gisa(vcpu);
336 if (gi->origin)
337 pending_mask |= gisa_get_ipm(gi->origin) << IRQ_PEND_IO_ISC_7;
338 return pending_mask;
250} 339}
251 340
252static inline int isc_to_irq_type(unsigned long isc) 341static inline int isc_to_irq_type(unsigned long isc)
@@ -318,13 +407,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
318static void __set_cpu_idle(struct kvm_vcpu *vcpu) 407static void __set_cpu_idle(struct kvm_vcpu *vcpu)
319{ 408{
320 kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); 409 kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
321 set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); 410 set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
322} 411}
323 412
324static void __unset_cpu_idle(struct kvm_vcpu *vcpu) 413static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
325{ 414{
326 kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); 415 kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
327 clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); 416 clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
328} 417}
329 418
330static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) 419static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -345,7 +434,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
345{ 434{
346 if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) 435 if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK))
347 return; 436 return;
348 else if (psw_ioint_disabled(vcpu)) 437 if (psw_ioint_disabled(vcpu))
349 kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); 438 kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT);
350 else 439 else
351 vcpu->arch.sie_block->lctl |= LCTL_CR6; 440 vcpu->arch.sie_block->lctl |= LCTL_CR6;
@@ -353,7 +442,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
353 442
354static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu) 443static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
355{ 444{
356 if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK)) 445 if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_EXT_MASK))
357 return; 446 return;
358 if (psw_extint_disabled(vcpu)) 447 if (psw_extint_disabled(vcpu))
359 kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT); 448 kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
@@ -363,7 +452,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
363 452
364static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu) 453static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
365{ 454{
366 if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK)) 455 if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_MCHK_MASK))
367 return; 456 return;
368 if (psw_mchk_disabled(vcpu)) 457 if (psw_mchk_disabled(vcpu))
369 vcpu->arch.sie_block->ictl |= ICTL_LPSW; 458 vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@@ -956,6 +1045,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
956{ 1045{
957 struct list_head *isc_list; 1046 struct list_head *isc_list;
958 struct kvm_s390_float_interrupt *fi; 1047 struct kvm_s390_float_interrupt *fi;
1048 struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int;
959 struct kvm_s390_interrupt_info *inti = NULL; 1049 struct kvm_s390_interrupt_info *inti = NULL;
960 struct kvm_s390_io_info io; 1050 struct kvm_s390_io_info io;
961 u32 isc; 1051 u32 isc;
@@ -998,8 +1088,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
998 goto out; 1088 goto out;
999 } 1089 }
1000 1090
1001 if (vcpu->kvm->arch.gisa && 1091 if (gi->origin && gisa_tac_ipm_gisc(gi->origin, isc)) {
1002 kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) {
1003 /* 1092 /*
1004 * in case an adapter interrupt was not delivered 1093 * in case an adapter interrupt was not delivered
1005 * in SIE context KVM will handle the delivery 1094 * in SIE context KVM will handle the delivery
@@ -1089,6 +1178,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
1089 1178
1090int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) 1179int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
1091{ 1180{
1181 struct kvm_s390_gisa_interrupt *gi = &vcpu->kvm->arch.gisa_int;
1092 u64 sltime; 1182 u64 sltime;
1093 1183
1094 vcpu->stat.exit_wait_state++; 1184 vcpu->stat.exit_wait_state++;
@@ -1102,6 +1192,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
1102 return -EOPNOTSUPP; /* disabled wait */ 1192 return -EOPNOTSUPP; /* disabled wait */
1103 } 1193 }
1104 1194
1195 if (gi->origin &&
1196 (gisa_get_ipm_or_restore_iam(gi) &
1197 vcpu->arch.sie_block->gcr[6] >> 24))
1198 return 0;
1199
1105 if (!ckc_interrupts_enabled(vcpu) && 1200 if (!ckc_interrupts_enabled(vcpu) &&
1106 !cpu_timer_interrupts_enabled(vcpu)) { 1201 !cpu_timer_interrupts_enabled(vcpu)) {
1107 VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer"); 1202 VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
@@ -1533,18 +1628,19 @@ static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm,
1533 1628
1534static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid) 1629static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid)
1535{ 1630{
1631 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
1536 unsigned long active_mask; 1632 unsigned long active_mask;
1537 int isc; 1633 int isc;
1538 1634
1539 if (schid) 1635 if (schid)
1540 goto out; 1636 goto out;
1541 if (!kvm->arch.gisa) 1637 if (!gi->origin)
1542 goto out; 1638 goto out;
1543 1639
1544 active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32; 1640 active_mask = (isc_mask & gisa_get_ipm(gi->origin) << 24) << 32;
1545 while (active_mask) { 1641 while (active_mask) {
1546 isc = __fls(active_mask) ^ (BITS_PER_LONG - 1); 1642 isc = __fls(active_mask) ^ (BITS_PER_LONG - 1);
1547 if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc)) 1643 if (gisa_tac_ipm_gisc(gi->origin, isc))
1548 return isc; 1644 return isc;
1549 clear_bit_inv(isc, &active_mask); 1645 clear_bit_inv(isc, &active_mask);
1550 } 1646 }
@@ -1567,6 +1663,7 @@ out:
1567struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, 1663struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
1568 u64 isc_mask, u32 schid) 1664 u64 isc_mask, u32 schid)
1569{ 1665{
1666 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
1570 struct kvm_s390_interrupt_info *inti, *tmp_inti; 1667 struct kvm_s390_interrupt_info *inti, *tmp_inti;
1571 int isc; 1668 int isc;
1572 1669
@@ -1584,7 +1681,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
1584 /* both types of interrupts present */ 1681 /* both types of interrupts present */
1585 if (int_word_to_isc(inti->io.io_int_word) <= isc) { 1682 if (int_word_to_isc(inti->io.io_int_word) <= isc) {
1586 /* classical IO int with higher priority */ 1683 /* classical IO int with higher priority */
1587 kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); 1684 gisa_set_ipm_gisc(gi->origin, isc);
1588 goto out; 1685 goto out;
1589 } 1686 }
1590gisa_out: 1687gisa_out:
@@ -1596,7 +1693,7 @@ gisa_out:
1596 kvm_s390_reinject_io_int(kvm, inti); 1693 kvm_s390_reinject_io_int(kvm, inti);
1597 inti = tmp_inti; 1694 inti = tmp_inti;
1598 } else 1695 } else
1599 kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); 1696 gisa_set_ipm_gisc(gi->origin, isc);
1600out: 1697out:
1601 return inti; 1698 return inti;
1602} 1699}
@@ -1685,6 +1782,7 @@ static int __inject_float_mchk(struct kvm *kvm,
1685 1782
1686static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) 1783static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1687{ 1784{
1785 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
1688 struct kvm_s390_float_interrupt *fi; 1786 struct kvm_s390_float_interrupt *fi;
1689 struct list_head *list; 1787 struct list_head *list;
1690 int isc; 1788 int isc;
@@ -1692,9 +1790,9 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1692 kvm->stat.inject_io++; 1790 kvm->stat.inject_io++;
1693 isc = int_word_to_isc(inti->io.io_int_word); 1791 isc = int_word_to_isc(inti->io.io_int_word);
1694 1792
1695 if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) { 1793 if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) {
1696 VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); 1794 VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
1697 kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc); 1795 gisa_set_ipm_gisc(gi->origin, isc);
1698 kfree(inti); 1796 kfree(inti);
1699 return 0; 1797 return 0;
1700 } 1798 }
@@ -1726,7 +1824,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
1726 */ 1824 */
1727static void __floating_irq_kick(struct kvm *kvm, u64 type) 1825static void __floating_irq_kick(struct kvm *kvm, u64 type)
1728{ 1826{
1729 struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
1730 struct kvm_vcpu *dst_vcpu; 1827 struct kvm_vcpu *dst_vcpu;
1731 int sigcpu, online_vcpus, nr_tries = 0; 1828 int sigcpu, online_vcpus, nr_tries = 0;
1732 1829
@@ -1735,11 +1832,11 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
1735 return; 1832 return;
1736 1833
1737 /* find idle VCPUs first, then round robin */ 1834 /* find idle VCPUs first, then round robin */
1738 sigcpu = find_first_bit(fi->idle_mask, online_vcpus); 1835 sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus);
1739 if (sigcpu == online_vcpus) { 1836 if (sigcpu == online_vcpus) {
1740 do { 1837 do {
1741 sigcpu = fi->next_rr_cpu; 1838 sigcpu = kvm->arch.float_int.next_rr_cpu++;
1742 fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus; 1839 kvm->arch.float_int.next_rr_cpu %= online_vcpus;
1743 /* avoid endless loops if all vcpus are stopped */ 1840 /* avoid endless loops if all vcpus are stopped */
1744 if (nr_tries++ >= online_vcpus) 1841 if (nr_tries++ >= online_vcpus)
1745 return; 1842 return;
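
The kick policy in the hunk above (prefer an idle vCPU, otherwise take a bounded round-robin walk that skips stopped vCPUs) condenses to the stand-alone sketch below; plain bool arrays stand in for kvm->arch.idle_mask and the per-vCPU stopped state, and every name is hypothetical:

#include <stdbool.h>
#include <stdio.h>

/*
 * Pick a vCPU to kick: any idle vCPU wins, otherwise walk round robin
 * over the online vCPUs, skipping stopped ones, for at most one lap.
 * Returns -1 when every vCPU is stopped.
 */
static int pick_kick_target(const bool *idle, const bool *stopped,
			    int online_vcpus, int *next_rr)
{
	int cpu, tries = 0;

	for (cpu = 0; cpu < online_vcpus; cpu++)
		if (idle[cpu])
			return cpu;

	do {
		cpu = *next_rr;
		*next_rr = (*next_rr + 1) % online_vcpus;
		if (tries++ >= online_vcpus)
			return -1;	/* avoid an endless loop */
	} while (stopped[cpu]);

	return cpu;
}

int main(void)
{
	bool idle[3] = { false, false, true };
	bool stopped[3] = { false, false, false };
	int rr = 0;

	printf("kick vcpu %d\n", pick_kick_target(idle, stopped, 3, &rr)); /* 2 */
	return 0;
}
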
@@ -1753,7 +1850,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
1753 kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); 1850 kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT);
1754 break; 1851 break;
1755 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: 1852 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
1756 if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa)) 1853 if (!(type & KVM_S390_INT_IO_AI_MASK &&
1854 kvm->arch.gisa_int.origin))
1757 kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); 1855 kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
1758 break; 1856 break;
1759 default: 1857 default:
@@ -2003,6 +2101,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
2003 2101
2004static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) 2102static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
2005{ 2103{
2104 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
2006 struct kvm_s390_interrupt_info *inti; 2105 struct kvm_s390_interrupt_info *inti;
2007 struct kvm_s390_float_interrupt *fi; 2106 struct kvm_s390_float_interrupt *fi;
2008 struct kvm_s390_irq *buf; 2107 struct kvm_s390_irq *buf;
@@ -2026,15 +2125,14 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
2026 2125
2027 max_irqs = len / sizeof(struct kvm_s390_irq); 2126 max_irqs = len / sizeof(struct kvm_s390_irq);
2028 2127
2029 if (kvm->arch.gisa && 2128 if (gi->origin && gisa_get_ipm(gi->origin)) {
2030 kvm_s390_gisa_get_ipm(kvm->arch.gisa)) {
2031 for (i = 0; i <= MAX_ISC; i++) { 2129 for (i = 0; i <= MAX_ISC; i++) {
2032 if (n == max_irqs) { 2130 if (n == max_irqs) {
2033 /* signal userspace to try again */ 2131 /* signal userspace to try again */
2034 ret = -ENOMEM; 2132 ret = -ENOMEM;
2035 goto out_nolock; 2133 goto out_nolock;
2036 } 2134 }
2037 if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) { 2135 if (gisa_tac_ipm_gisc(gi->origin, i)) {
2038 irq = (struct kvm_s390_irq *) &buf[n]; 2136 irq = (struct kvm_s390_irq *) &buf[n];
2039 irq->type = KVM_S390_INT_IO(1, 0, 0, 0); 2137 irq->type = KVM_S390_INT_IO(1, 0, 0, 0);
2040 irq->u.io.io_int_word = isc_to_int_word(i); 2138 irq->u.io.io_int_word = isc_to_int_word(i);
@@ -2831,7 +2929,7 @@ static void store_local_irq(struct kvm_s390_local_interrupt *li,
2831int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) 2929int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
2832{ 2930{
2833 int scn; 2931 int scn;
2834 unsigned long sigp_emerg_pending[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 2932 DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
2835 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 2933 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
2836 unsigned long pending_irqs; 2934 unsigned long pending_irqs;
2837 struct kvm_s390_irq irq; 2935 struct kvm_s390_irq irq;
@@ -2884,27 +2982,278 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
2884 return n; 2982 return n;
2885} 2983}
2886 2984
2887void kvm_s390_gisa_clear(struct kvm *kvm) 2985static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask)
2888{ 2986{
2889 if (kvm->arch.gisa) { 2987 int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus);
2890 memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa)); 2988 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
2891 kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa; 2989 struct kvm_vcpu *vcpu;
2892 VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa); 2990
2991 for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) {
2992 vcpu = kvm_get_vcpu(kvm, vcpu_id);
2993 if (psw_ioint_disabled(vcpu))
2994 continue;
2995 deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24);
2996 if (deliverable_mask) {
 2997 /* recently kicked but not yet running */
2998 if (test_and_set_bit(vcpu_id, gi->kicked_mask))
2999 return;
3000 kvm_s390_vcpu_wakeup(vcpu);
3001 return;
3002 }
2893 } 3003 }
2894} 3004}
2895 3005
3006static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
3007{
3008 struct kvm_s390_gisa_interrupt *gi =
3009 container_of(timer, struct kvm_s390_gisa_interrupt, timer);
3010 struct kvm *kvm =
3011 container_of(gi->origin, struct sie_page2, gisa)->kvm;
3012 u8 pending_mask;
3013
3014 pending_mask = gisa_get_ipm_or_restore_iam(gi);
3015 if (pending_mask) {
3016 __airqs_kick_single_vcpu(kvm, pending_mask);
3017 hrtimer_forward_now(timer, ns_to_ktime(gi->expires));
3018 return HRTIMER_RESTART;
 3019 }
3020
3021 return HRTIMER_NORESTART;
3022}
3023
3024#define NULL_GISA_ADDR 0x00000000UL
3025#define NONE_GISA_ADDR 0x00000001UL
3026#define GISA_ADDR_MASK 0xfffff000UL
3027
3028static void process_gib_alert_list(void)
3029{
3030 struct kvm_s390_gisa_interrupt *gi;
3031 struct kvm_s390_gisa *gisa;
3032 struct kvm *kvm;
3033 u32 final, origin = 0UL;
3034
3035 do {
3036 /*
3037 * If the NONE_GISA_ADDR is still stored in the alert list
3038 * origin, we will leave the outer loop. No further GISA has
3039 * been added to the alert list by millicode while processing
3040 * the current alert list.
3041 */
3042 final = (origin & NONE_GISA_ADDR);
3043 /*
3044 * Cut off the alert list and store the NONE_GISA_ADDR in the
3045 * alert list origin to avoid further GAL interruptions.
 3046 * A new alert list can be built up by millicode in parallel
 3047 * for guests not on the just cut-off alert list. When in the
3048 * final loop, store the NULL_GISA_ADDR instead. This will re-
3049 * enable GAL interruptions on the host again.
3050 */
3051 origin = xchg(&gib->alert_list_origin,
3052 (!final) ? NONE_GISA_ADDR : NULL_GISA_ADDR);
3053 /*
3054 * Loop through the just cut-off alert list and start the
3055 * gisa timers to kick idle vcpus to consume the pending
3056 * interruptions asap.
3057 */
3058 while (origin & GISA_ADDR_MASK) {
3059 gisa = (struct kvm_s390_gisa *)(u64)origin;
3060 origin = gisa->next_alert;
3061 gisa->next_alert = (u32)(u64)gisa;
3062 kvm = container_of(gisa, struct sie_page2, gisa)->kvm;
3063 gi = &kvm->arch.gisa_int;
3064 if (hrtimer_active(&gi->timer))
3065 hrtimer_cancel(&gi->timer);
3066 hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
3067 }
3068 } while (!final);
3069
3070}
3071
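
The comment block in process_gib_alert_list() is easier to follow with the control flow isolated. Below is a small user-space model of the same two-pass drain, assuming the same sentinel convention (NONE parks alerting while a list is being processed, NULL re-arms it) and a C11 atomic exchange in place of the kernel's xchg(); the node layout and all names are simplified stand-ins rather than the kernel structures:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NULL_ADDR 0x0UL		/* empty list, alerting enabled    */
#define NONE_ADDR 0x1UL		/* empty list, alerting suppressed */
#define ADDR_MASK (~0x3UL)	/* real node addresses are aligned */

struct node { uintptr_t next; int id; };

static _Atomic uintptr_t alert_list_origin = NULL_ADDR;

static void process_alert_list(void)
{
	uintptr_t final, origin = 0;

	do {
		/* The outer loop only terminates once NONE was observed. */
		final = origin & NONE_ADDR;
		/*
		 * Cut the list off. While draining, park NONE in the origin
		 * so no new alert is raised; on the final pass park NULL to
		 * re-arm alerting.
		 */
		origin = atomic_exchange(&alert_list_origin,
					 final ? NULL_ADDR : NONE_ADDR);

		while (origin & ADDR_MASK) {
			struct node *n = (struct node *)origin;

			origin = n->next;
			n->next = (uintptr_t)n;	/* mark as "not on a list" */
			printf("draining node %d\n", n->id);
		}
	} while (!final);
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 };

	/* producer side: push b, then a (the list head lives in the origin) */
	b.next = NULL_ADDR;
	a.next = (uintptr_t)&b;
	atomic_store(&alert_list_origin, (uintptr_t)&a);

	process_alert_list();
	return 0;
}

The loop only exits after NONE has been seen as the list terminator, which closes the window in which new entries could have been queued while alerting was parked.
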
3072void kvm_s390_gisa_clear(struct kvm *kvm)
3073{
3074 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
3075
3076 if (!gi->origin)
3077 return;
3078 gisa_clear_ipm(gi->origin);
3079 VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin);
3080}
3081
2896void kvm_s390_gisa_init(struct kvm *kvm) 3082void kvm_s390_gisa_init(struct kvm *kvm)
2897{ 3083{
2898 if (css_general_characteristics.aiv) { 3084 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
2899 kvm->arch.gisa = &kvm->arch.sie_page2->gisa; 3085
2900 VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa); 3086 if (!css_general_characteristics.aiv)
2901 kvm_s390_gisa_clear(kvm); 3087 return;
2902 } 3088 gi->origin = &kvm->arch.sie_page2->gisa;
3089 gi->alert.mask = 0;
3090 spin_lock_init(&gi->alert.ref_lock);
3091 gi->expires = 50 * 1000; /* 50 usec */
3092 hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3093 gi->timer.function = gisa_vcpu_kicker;
3094 memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
3095 gi->origin->next_alert = (u32)(u64)gi->origin;
3096 VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
2903} 3097}
2904 3098
2905void kvm_s390_gisa_destroy(struct kvm *kvm) 3099void kvm_s390_gisa_destroy(struct kvm *kvm)
2906{ 3100{
2907 if (!kvm->arch.gisa) 3101 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
3102
3103 if (!gi->origin)
3104 return;
3105 if (gi->alert.mask)
3106 KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x",
3107 kvm, gi->alert.mask);
3108 while (gisa_in_alert_list(gi->origin))
3109 cpu_relax();
3110 hrtimer_cancel(&gi->timer);
3111 gi->origin = NULL;
3112}
3113
3114/**
3115 * kvm_s390_gisc_register - register a guest ISC
3116 *
3117 * @kvm: the kernel vm to work with
3118 * @gisc: the guest interruption sub class to register
3119 *
 3120 * The function extends the vm specific alert mask in use.
 3121 * The effective IAM mask in the GISA is updated as well
 3122 * in case the GISA is not part of the GIB alert list.
 3123 * It will be updated, at the latest, when the IAM gets restored
3124 * by gisa_get_ipm_or_restore_iam().
3125 *
3126 * Returns: the nonspecific ISC (NISC) the gib alert mechanism
3127 * has registered with the channel subsystem.
3128 * -ENODEV in case the vm uses no GISA
3129 * -ERANGE in case the guest ISC is invalid
3130 */
3131int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc)
3132{
3133 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
3134
3135 if (!gi->origin)
3136 return -ENODEV;
3137 if (gisc > MAX_ISC)
3138 return -ERANGE;
3139
3140 spin_lock(&gi->alert.ref_lock);
3141 gi->alert.ref_count[gisc]++;
3142 if (gi->alert.ref_count[gisc] == 1) {
3143 gi->alert.mask |= 0x80 >> gisc;
3144 gisa_set_iam(gi->origin, gi->alert.mask);
3145 }
3146 spin_unlock(&gi->alert.ref_lock);
3147
3148 return gib->nisc;
3149}
3150EXPORT_SYMBOL_GPL(kvm_s390_gisc_register);
3151
3152/**
3153 * kvm_s390_gisc_unregister - unregister a guest ISC
3154 *
3155 * @kvm: the kernel vm to work with
 3156 * @gisc: the guest interruption sub class to unregister
3157 *
 3158 * The function reduces the vm specific alert mask in use.
 3159 * The effective IAM mask in the GISA is updated as well
 3160 * in case the GISA is not part of the GIB alert list.
 3161 * It will be updated, at the latest, when the IAM gets restored
3162 * by gisa_get_ipm_or_restore_iam().
3163 *
 3164 * Returns: 0 in case of success
 3165 *          -ENODEV in case the vm uses no GISA
 3166 *          -ERANGE in case the guest ISC is invalid
 3167 *          -EINVAL in case the guest ISC is not registered
 3168 *
3169 */
3170int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc)
3171{
3172 struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int;
3173 int rc = 0;
3174
3175 if (!gi->origin)
3176 return -ENODEV;
3177 if (gisc > MAX_ISC)
3178 return -ERANGE;
3179
3180 spin_lock(&gi->alert.ref_lock);
3181 if (gi->alert.ref_count[gisc] == 0) {
3182 rc = -EINVAL;
3183 goto out;
3184 }
3185 gi->alert.ref_count[gisc]--;
3186 if (gi->alert.ref_count[gisc] == 0) {
3187 gi->alert.mask &= ~(0x80 >> gisc);
3188 gisa_set_iam(gi->origin, gi->alert.mask);
3189 }
3190out:
3191 spin_unlock(&gi->alert.ref_lock);
3192
3193 return rc;
3194}
3195EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
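
kvm_s390_gisc_register() and kvm_s390_gisc_unregister() boil down to refcounted IAM bookkeeping: the first user of an ISC switches the 0x80 >> gisc bit on, the last one switches it off, and gisa_set_iam() pushes the changed mask into the GISA. A user-space model of just that bookkeeping (locking and the GISA update left out, names hypothetical):

#include <stdint.h>
#include <stdio.h>

#define MAX_ISC 7

struct alert {
	uint8_t mask;			/* IAM image: ISC 0 = bit 0x80 */
	int ref_count[MAX_ISC + 1];
};

static int gisc_register(struct alert *a, unsigned int gisc)
{
	if (gisc > MAX_ISC)
		return -1;
	if (a->ref_count[gisc]++ == 0)
		a->mask |= 0x80 >> gisc;	/* first user sets the IAM bit */
	return 0;
}

static int gisc_unregister(struct alert *a, unsigned int gisc)
{
	if (gisc > MAX_ISC || a->ref_count[gisc] == 0)
		return -1;
	if (--a->ref_count[gisc] == 0)
		a->mask &= ~(0x80 >> gisc);	/* last user clears it again */
	return 0;
}

int main(void)
{
	struct alert a = { 0 };

	gisc_register(&a, 3);
	gisc_register(&a, 3);			/* second user of ISC 3 */
	gisc_unregister(&a, 3);
	printf("mask after one unregister: 0x%02x\n", a.mask);	/* still 0x10 */
	gisc_unregister(&a, 3);
	printf("mask after both:          0x%02x\n", a.mask);	/* 0x00 */
	return 0;
}
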
3196
3197static void gib_alert_irq_handler(struct airq_struct *airq)
3198{
3199 inc_irq_stat(IRQIO_GAL);
3200 process_gib_alert_list();
3201}
3202
3203static struct airq_struct gib_alert_irq = {
3204 .handler = gib_alert_irq_handler,
3205 .lsi_ptr = &gib_alert_irq.lsi_mask,
3206};
3207
3208void kvm_s390_gib_destroy(void)
3209{
3210 if (!gib)
2908 return; 3211 return;
2909 kvm->arch.gisa = NULL; 3212 chsc_sgib(0);
3213 unregister_adapter_interrupt(&gib_alert_irq);
3214 free_page((unsigned long)gib);
3215 gib = NULL;
3216}
3217
3218int kvm_s390_gib_init(u8 nisc)
3219{
3220 int rc = 0;
3221
3222 if (!css_general_characteristics.aiv) {
3223 KVM_EVENT(3, "%s", "gib not initialized, no AIV facility");
3224 goto out;
3225 }
3226
3227 gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
3228 if (!gib) {
3229 rc = -ENOMEM;
3230 goto out;
3231 }
3232
3233 gib_alert_irq.isc = nisc;
3234 if (register_adapter_interrupt(&gib_alert_irq)) {
3235 pr_err("Registering the GIB alert interruption handler failed\n");
3236 rc = -EIO;
3237 goto out_free_gib;
3238 }
3239
3240 gib->nisc = nisc;
3241 if (chsc_sgib((u32)(u64)gib)) {
3242 pr_err("Associating the GIB with the AIV facility failed\n");
3243 free_page((unsigned long)gib);
3244 gib = NULL;
3245 rc = -EIO;
3246 goto out_unreg_gal;
3247 }
3248
3249 KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
3250 goto out;
3251
3252out_unreg_gal:
3253 unregister_adapter_interrupt(&gib_alert_irq);
3254out_free_gib:
3255 free_page((unsigned long)gib);
3256 gib = NULL;
3257out:
3258 return rc;
2910} 3259}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7f4bc58a53b9..4638303ba6a8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -432,11 +432,18 @@ int kvm_arch_init(void *opaque)
432 /* Register floating interrupt controller interface. */ 432 /* Register floating interrupt controller interface. */
433 rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); 433 rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
434 if (rc) { 434 if (rc) {
435 pr_err("Failed to register FLIC rc=%d\n", rc); 435 pr_err("A FLIC registration call failed with rc=%d\n", rc);
436 goto out_debug_unreg; 436 goto out_debug_unreg;
437 } 437 }
438
439 rc = kvm_s390_gib_init(GAL_ISC);
440 if (rc)
441 goto out_gib_destroy;
442
438 return 0; 443 return 0;
439 444
445out_gib_destroy:
446 kvm_s390_gib_destroy();
440out_debug_unreg: 447out_debug_unreg:
441 debug_unregister(kvm_s390_dbf); 448 debug_unregister(kvm_s390_dbf);
442 return rc; 449 return rc;
@@ -444,6 +451,7 @@ out_debug_unreg:
444 451
445void kvm_arch_exit(void) 452void kvm_arch_exit(void)
446{ 453{
454 kvm_s390_gib_destroy();
447 debug_unregister(kvm_s390_dbf); 455 debug_unregister(kvm_s390_dbf);
448} 456}
449 457
@@ -1258,11 +1266,65 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
1258static int kvm_s390_set_processor_subfunc(struct kvm *kvm, 1266static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
1259 struct kvm_device_attr *attr) 1267 struct kvm_device_attr *attr)
1260{ 1268{
1261 /* 1269 mutex_lock(&kvm->lock);
1262 * Once supported by kernel + hw, we have to store the subfunctions 1270 if (kvm->created_vcpus) {
1263 * in kvm->arch and remember that user space configured them. 1271 mutex_unlock(&kvm->lock);
1264 */ 1272 return -EBUSY;
1265 return -ENXIO; 1273 }
1274
1275 if (copy_from_user(&kvm->arch.model.subfuncs, (void __user *)attr->addr,
1276 sizeof(struct kvm_s390_vm_cpu_subfunc))) {
1277 mutex_unlock(&kvm->lock);
1278 return -EFAULT;
1279 }
1280 mutex_unlock(&kvm->lock);
1281
1282 VM_EVENT(kvm, 3, "SET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1283 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0],
1284 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1],
1285 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2],
1286 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]);
1287 VM_EVENT(kvm, 3, "SET: guest PTFF subfunc 0x%16.16lx.%16.16lx",
1288 ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0],
1289 ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]);
1290 VM_EVENT(kvm, 3, "SET: guest KMAC subfunc 0x%16.16lx.%16.16lx",
1291 ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0],
1292 ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]);
1293 VM_EVENT(kvm, 3, "SET: guest KMC subfunc 0x%16.16lx.%16.16lx",
1294 ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0],
1295 ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]);
1296 VM_EVENT(kvm, 3, "SET: guest KM subfunc 0x%16.16lx.%16.16lx",
1297 ((unsigned long *) &kvm->arch.model.subfuncs.km)[0],
1298 ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]);
1299 VM_EVENT(kvm, 3, "SET: guest KIMD subfunc 0x%16.16lx.%16.16lx",
1300 ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0],
1301 ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]);
1302 VM_EVENT(kvm, 3, "SET: guest KLMD subfunc 0x%16.16lx.%16.16lx",
1303 ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0],
1304 ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]);
1305 VM_EVENT(kvm, 3, "SET: guest PCKMO subfunc 0x%16.16lx.%16.16lx",
1306 ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0],
1307 ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]);
1308 VM_EVENT(kvm, 3, "SET: guest KMCTR subfunc 0x%16.16lx.%16.16lx",
1309 ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0],
1310 ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]);
1311 VM_EVENT(kvm, 3, "SET: guest KMF subfunc 0x%16.16lx.%16.16lx",
1312 ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0],
1313 ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]);
1314 VM_EVENT(kvm, 3, "SET: guest KMO subfunc 0x%16.16lx.%16.16lx",
1315 ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0],
1316 ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]);
1317 VM_EVENT(kvm, 3, "SET: guest PCC subfunc 0x%16.16lx.%16.16lx",
1318 ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0],
1319 ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]);
1320 VM_EVENT(kvm, 3, "SET: guest PPNO subfunc 0x%16.16lx.%16.16lx",
1321 ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0],
1322 ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]);
1323 VM_EVENT(kvm, 3, "SET: guest KMA subfunc 0x%16.16lx.%16.16lx",
1324 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0],
1325 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]);
1326
1327 return 0;
1266} 1328}
1267 1329
1268static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) 1330static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
@@ -1381,12 +1443,56 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
1381static int kvm_s390_get_processor_subfunc(struct kvm *kvm, 1443static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
1382 struct kvm_device_attr *attr) 1444 struct kvm_device_attr *attr)
1383{ 1445{
1384 /* 1446 if (copy_to_user((void __user *)attr->addr, &kvm->arch.model.subfuncs,
1385 * Once we can actually configure subfunctions (kernel + hw support), 1447 sizeof(struct kvm_s390_vm_cpu_subfunc)))
1386 * we have to check if they were already set by user space, if so copy 1448 return -EFAULT;
1387 * them from kvm->arch. 1449
1388 */ 1450 VM_EVENT(kvm, 3, "GET: guest PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1389 return -ENXIO; 1451 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[0],
1452 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[1],
1453 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[2],
1454 ((unsigned long *) &kvm->arch.model.subfuncs.plo)[3]);
1455 VM_EVENT(kvm, 3, "GET: guest PTFF subfunc 0x%16.16lx.%16.16lx",
1456 ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[0],
1457 ((unsigned long *) &kvm->arch.model.subfuncs.ptff)[1]);
1458 VM_EVENT(kvm, 3, "GET: guest KMAC subfunc 0x%16.16lx.%16.16lx",
1459 ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[0],
1460 ((unsigned long *) &kvm->arch.model.subfuncs.kmac)[1]);
1461 VM_EVENT(kvm, 3, "GET: guest KMC subfunc 0x%16.16lx.%16.16lx",
1462 ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[0],
1463 ((unsigned long *) &kvm->arch.model.subfuncs.kmc)[1]);
1464 VM_EVENT(kvm, 3, "GET: guest KM subfunc 0x%16.16lx.%16.16lx",
1465 ((unsigned long *) &kvm->arch.model.subfuncs.km)[0],
1466 ((unsigned long *) &kvm->arch.model.subfuncs.km)[1]);
1467 VM_EVENT(kvm, 3, "GET: guest KIMD subfunc 0x%16.16lx.%16.16lx",
1468 ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[0],
1469 ((unsigned long *) &kvm->arch.model.subfuncs.kimd)[1]);
1470 VM_EVENT(kvm, 3, "GET: guest KLMD subfunc 0x%16.16lx.%16.16lx",
1471 ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[0],
1472 ((unsigned long *) &kvm->arch.model.subfuncs.klmd)[1]);
1473 VM_EVENT(kvm, 3, "GET: guest PCKMO subfunc 0x%16.16lx.%16.16lx",
1474 ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[0],
1475 ((unsigned long *) &kvm->arch.model.subfuncs.pckmo)[1]);
1476 VM_EVENT(kvm, 3, "GET: guest KMCTR subfunc 0x%16.16lx.%16.16lx",
1477 ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[0],
1478 ((unsigned long *) &kvm->arch.model.subfuncs.kmctr)[1]);
1479 VM_EVENT(kvm, 3, "GET: guest KMF subfunc 0x%16.16lx.%16.16lx",
1480 ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[0],
1481 ((unsigned long *) &kvm->arch.model.subfuncs.kmf)[1]);
1482 VM_EVENT(kvm, 3, "GET: guest KMO subfunc 0x%16.16lx.%16.16lx",
1483 ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[0],
1484 ((unsigned long *) &kvm->arch.model.subfuncs.kmo)[1]);
1485 VM_EVENT(kvm, 3, "GET: guest PCC subfunc 0x%16.16lx.%16.16lx",
1486 ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[0],
1487 ((unsigned long *) &kvm->arch.model.subfuncs.pcc)[1]);
1488 VM_EVENT(kvm, 3, "GET: guest PPNO subfunc 0x%16.16lx.%16.16lx",
1489 ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[0],
1490 ((unsigned long *) &kvm->arch.model.subfuncs.ppno)[1]);
1491 VM_EVENT(kvm, 3, "GET: guest KMA subfunc 0x%16.16lx.%16.16lx",
1492 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[0],
1493 ((unsigned long *) &kvm->arch.model.subfuncs.kma)[1]);
1494
1495 return 0;
1390} 1496}
1391 1497
1392static int kvm_s390_get_machine_subfunc(struct kvm *kvm, 1498static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
@@ -1395,8 +1501,55 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
1395 if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, 1501 if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
1396 sizeof(struct kvm_s390_vm_cpu_subfunc))) 1502 sizeof(struct kvm_s390_vm_cpu_subfunc)))
1397 return -EFAULT; 1503 return -EFAULT;
1504
1505 VM_EVENT(kvm, 3, "GET: host PLO subfunc 0x%16.16lx.%16.16lx.%16.16lx.%16.16lx",
1506 ((unsigned long *) &kvm_s390_available_subfunc.plo)[0],
1507 ((unsigned long *) &kvm_s390_available_subfunc.plo)[1],
1508 ((unsigned long *) &kvm_s390_available_subfunc.plo)[2],
1509 ((unsigned long *) &kvm_s390_available_subfunc.plo)[3]);
1510 VM_EVENT(kvm, 3, "GET: host PTFF subfunc 0x%16.16lx.%16.16lx",
1511 ((unsigned long *) &kvm_s390_available_subfunc.ptff)[0],
1512 ((unsigned long *) &kvm_s390_available_subfunc.ptff)[1]);
1513 VM_EVENT(kvm, 3, "GET: host KMAC subfunc 0x%16.16lx.%16.16lx",
1514 ((unsigned long *) &kvm_s390_available_subfunc.kmac)[0],
1515 ((unsigned long *) &kvm_s390_available_subfunc.kmac)[1]);
1516 VM_EVENT(kvm, 3, "GET: host KMC subfunc 0x%16.16lx.%16.16lx",
1517 ((unsigned long *) &kvm_s390_available_subfunc.kmc)[0],
1518 ((unsigned long *) &kvm_s390_available_subfunc.kmc)[1]);
1519 VM_EVENT(kvm, 3, "GET: host KM subfunc 0x%16.16lx.%16.16lx",
1520 ((unsigned long *) &kvm_s390_available_subfunc.km)[0],
1521 ((unsigned long *) &kvm_s390_available_subfunc.km)[1]);
1522 VM_EVENT(kvm, 3, "GET: host KIMD subfunc 0x%16.16lx.%16.16lx",
1523 ((unsigned long *) &kvm_s390_available_subfunc.kimd)[0],
1524 ((unsigned long *) &kvm_s390_available_subfunc.kimd)[1]);
1525 VM_EVENT(kvm, 3, "GET: host KLMD subfunc 0x%16.16lx.%16.16lx",
1526 ((unsigned long *) &kvm_s390_available_subfunc.klmd)[0],
1527 ((unsigned long *) &kvm_s390_available_subfunc.klmd)[1]);
1528 VM_EVENT(kvm, 3, "GET: host PCKMO subfunc 0x%16.16lx.%16.16lx",
1529 ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[0],
1530 ((unsigned long *) &kvm_s390_available_subfunc.pckmo)[1]);
1531 VM_EVENT(kvm, 3, "GET: host KMCTR subfunc 0x%16.16lx.%16.16lx",
1532 ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[0],
1533 ((unsigned long *) &kvm_s390_available_subfunc.kmctr)[1]);
1534 VM_EVENT(kvm, 3, "GET: host KMF subfunc 0x%16.16lx.%16.16lx",
1535 ((unsigned long *) &kvm_s390_available_subfunc.kmf)[0],
1536 ((unsigned long *) &kvm_s390_available_subfunc.kmf)[1]);
1537 VM_EVENT(kvm, 3, "GET: host KMO subfunc 0x%16.16lx.%16.16lx",
1538 ((unsigned long *) &kvm_s390_available_subfunc.kmo)[0],
1539 ((unsigned long *) &kvm_s390_available_subfunc.kmo)[1]);
1540 VM_EVENT(kvm, 3, "GET: host PCC subfunc 0x%16.16lx.%16.16lx",
1541 ((unsigned long *) &kvm_s390_available_subfunc.pcc)[0],
1542 ((unsigned long *) &kvm_s390_available_subfunc.pcc)[1]);
1543 VM_EVENT(kvm, 3, "GET: host PPNO subfunc 0x%16.16lx.%16.16lx",
1544 ((unsigned long *) &kvm_s390_available_subfunc.ppno)[0],
1545 ((unsigned long *) &kvm_s390_available_subfunc.ppno)[1]);
1546 VM_EVENT(kvm, 3, "GET: host KMA subfunc 0x%16.16lx.%16.16lx",
1547 ((unsigned long *) &kvm_s390_available_subfunc.kma)[0],
1548 ((unsigned long *) &kvm_s390_available_subfunc.kma)[1]);
1549
1398 return 0; 1550 return 0;
1399} 1551}
1552
1400static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) 1553static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
1401{ 1554{
1402 int ret = -ENXIO; 1555 int ret = -ENXIO;
@@ -1514,10 +1667,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1514 case KVM_S390_VM_CPU_PROCESSOR_FEAT: 1667 case KVM_S390_VM_CPU_PROCESSOR_FEAT:
1515 case KVM_S390_VM_CPU_MACHINE_FEAT: 1668 case KVM_S390_VM_CPU_MACHINE_FEAT:
1516 case KVM_S390_VM_CPU_MACHINE_SUBFUNC: 1669 case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
1670 case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
1517 ret = 0; 1671 ret = 0;
1518 break; 1672 break;
1519 /* configuring subfunctions is not supported yet */
1520 case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
1521 default: 1673 default:
1522 ret = -ENXIO; 1674 ret = -ENXIO;
1523 break; 1675 break;
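
With KVM_S390_VM_CPU_PROCESSOR_SUBFUNC now accepted by both the GET and SET paths, userspace can mirror the host subfunctions into the guest model before any vCPU exists (the SET handler above returns -EBUSY afterwards). A rough sketch of the expected ioctl sequence; the attribute and structure names come from the s390 uapi headers, while the surrounding VM setup and error handling are assumed:

#include <sys/ioctl.h>
#include <linux/kvm.h>		/* pulls in the s390 asm/kvm.h definitions */

static int set_guest_subfuncs(int vm_fd)
{
	struct kvm_s390_vm_cpu_subfunc subfunc;
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_MODEL,
		.attr  = KVM_S390_VM_CPU_MACHINE_SUBFUNC,
		.addr  = (__u64)(unsigned long)&subfunc,
	};

	/* start from what the host machine offers ... */
	if (ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr))
		return -1;

	/* ... optionally clear bits here to hide individual functions ... */

	/* ... and install the result as the guest's processor subfunctions */
	attr.attr = KVM_S390_VM_CPU_PROCESSOR_SUBFUNC;
	return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}
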
@@ -2209,6 +2361,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2209 if (!kvm->arch.sie_page2) 2361 if (!kvm->arch.sie_page2)
2210 goto out_err; 2362 goto out_err;
2211 2363
2364 kvm->arch.sie_page2->kvm = kvm;
2212 kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; 2365 kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
2213 2366
2214 for (i = 0; i < kvm_s390_fac_size(); i++) { 2367 for (i = 0; i < kvm_s390_fac_size(); i++) {
@@ -2218,6 +2371,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
2218 kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & 2371 kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] &
2219 kvm_s390_fac_base[i]; 2372 kvm_s390_fac_base[i];
2220 } 2373 }
2374 kvm->arch.model.subfuncs = kvm_s390_available_subfunc;
2221 2375
2222 /* we are always in czam mode - even on pre z14 machines */ 2376 /* we are always in czam mode - even on pre z14 machines */
2223 set_kvm_facility(kvm->arch.model.fac_mask, 138); 2377 set_kvm_facility(kvm->arch.model.fac_mask, 138);
@@ -2812,7 +2966,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2812 2966
2813 vcpu->arch.sie_block->icpua = id; 2967 vcpu->arch.sie_block->icpua = id;
2814 spin_lock_init(&vcpu->arch.local_int.lock); 2968 spin_lock_init(&vcpu->arch.local_int.lock);
2815 vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa; 2969 vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin;
2816 if (vcpu->arch.sie_block->gd && sclp.has_gisaf) 2970 if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
2817 vcpu->arch.sie_block->gd |= GISA_FORMAT1; 2971 vcpu->arch.sie_block->gd |= GISA_FORMAT1;
2818 seqcount_init(&vcpu->arch.cputm_seqcount); 2972 seqcount_init(&vcpu->arch.cputm_seqcount);
@@ -3458,6 +3612,8 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
3458 kvm_s390_patch_guest_per_regs(vcpu); 3612 kvm_s390_patch_guest_per_regs(vcpu);
3459 } 3613 }
3460 3614
3615 clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask);
3616
3461 vcpu->arch.sie_block->icptcode = 0; 3617 vcpu->arch.sie_block->icptcode = 0;
3462 cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); 3618 cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
3463 VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags); 3619 VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
@@ -4293,12 +4449,12 @@ static int __init kvm_s390_init(void)
4293 int i; 4449 int i;
4294 4450
4295 if (!sclp.has_sief2) { 4451 if (!sclp.has_sief2) {
4296 pr_info("SIE not available\n"); 4452 pr_info("SIE is not available\n");
4297 return -ENODEV; 4453 return -ENODEV;
4298 } 4454 }
4299 4455
4300 if (nested && hpage) { 4456 if (nested && hpage) {
4301 pr_info("nested (vSIE) and hpage (huge page backing) can currently not be activated concurrently"); 4457 pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n");
4302 return -EINVAL; 4458 return -EINVAL;
4303 } 4459 }
4304 4460
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 1f6e36cdce0d..6d9448dbd052 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -67,7 +67,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
67 67
68static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) 68static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
69{ 69{
70 return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask); 70 return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask);
71} 71}
72 72
73static inline int kvm_is_ucontrol(struct kvm *kvm) 73static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -381,6 +381,8 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
381void kvm_s390_gisa_init(struct kvm *kvm); 381void kvm_s390_gisa_init(struct kvm *kvm);
382void kvm_s390_gisa_clear(struct kvm *kvm); 382void kvm_s390_gisa_clear(struct kvm *kvm);
383void kvm_s390_gisa_destroy(struct kvm *kvm); 383void kvm_s390_gisa_destroy(struct kvm *kvm);
384int kvm_s390_gib_init(u8 nisc);
385void kvm_s390_gib_destroy(void);
384 386
385/* implemented in guestdbg.c */ 387/* implemented in guestdbg.c */
386void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu); 388void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 180373360e34..a5db4475e72d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
35#include <asm/msr-index.h> 35#include <asm/msr-index.h>
36#include <asm/asm.h> 36#include <asm/asm.h>
37#include <asm/kvm_page_track.h> 37#include <asm/kvm_page_track.h>
38#include <asm/kvm_vcpu_regs.h>
38#include <asm/hyperv-tlfs.h> 39#include <asm/hyperv-tlfs.h>
39 40
40#define KVM_MAX_VCPUS 288 41#define KVM_MAX_VCPUS 288
@@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
137#define ASYNC_PF_PER_VCPU 64 138#define ASYNC_PF_PER_VCPU 64
138 139
139enum kvm_reg { 140enum kvm_reg {
140 VCPU_REGS_RAX = 0, 141 VCPU_REGS_RAX = __VCPU_REGS_RAX,
141 VCPU_REGS_RCX = 1, 142 VCPU_REGS_RCX = __VCPU_REGS_RCX,
142 VCPU_REGS_RDX = 2, 143 VCPU_REGS_RDX = __VCPU_REGS_RDX,
143 VCPU_REGS_RBX = 3, 144 VCPU_REGS_RBX = __VCPU_REGS_RBX,
144 VCPU_REGS_RSP = 4, 145 VCPU_REGS_RSP = __VCPU_REGS_RSP,
145 VCPU_REGS_RBP = 5, 146 VCPU_REGS_RBP = __VCPU_REGS_RBP,
146 VCPU_REGS_RSI = 6, 147 VCPU_REGS_RSI = __VCPU_REGS_RSI,
147 VCPU_REGS_RDI = 7, 148 VCPU_REGS_RDI = __VCPU_REGS_RDI,
148#ifdef CONFIG_X86_64 149#ifdef CONFIG_X86_64
149 VCPU_REGS_R8 = 8, 150 VCPU_REGS_R8 = __VCPU_REGS_R8,
150 VCPU_REGS_R9 = 9, 151 VCPU_REGS_R9 = __VCPU_REGS_R9,
151 VCPU_REGS_R10 = 10, 152 VCPU_REGS_R10 = __VCPU_REGS_R10,
152 VCPU_REGS_R11 = 11, 153 VCPU_REGS_R11 = __VCPU_REGS_R11,
153 VCPU_REGS_R12 = 12, 154 VCPU_REGS_R12 = __VCPU_REGS_R12,
154 VCPU_REGS_R13 = 13, 155 VCPU_REGS_R13 = __VCPU_REGS_R13,
155 VCPU_REGS_R14 = 14, 156 VCPU_REGS_R14 = __VCPU_REGS_R14,
156 VCPU_REGS_R15 = 15, 157 VCPU_REGS_R15 = __VCPU_REGS_R15,
157#endif 158#endif
158 VCPU_REGS_RIP, 159 VCPU_REGS_RIP,
159 NR_VCPU_REGS 160 NR_VCPU_REGS
@@ -319,6 +320,7 @@ struct kvm_mmu_page {
319 struct list_head link; 320 struct list_head link;
320 struct hlist_node hash_link; 321 struct hlist_node hash_link;
321 bool unsync; 322 bool unsync;
323 bool mmio_cached;
322 324
323 /* 325 /*
324 * The following two entries are used to key the shadow page in the 326 * The following two entries are used to key the shadow page in the
@@ -333,10 +335,6 @@ struct kvm_mmu_page {
333 int root_count; /* Currently serving as active root */ 335 int root_count; /* Currently serving as active root */
334 unsigned int unsync_children; 336 unsigned int unsync_children;
335 struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ 337 struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
336
337 /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
338 unsigned long mmu_valid_gen;
339
340 DECLARE_BITMAP(unsync_child_bitmap, 512); 338 DECLARE_BITMAP(unsync_child_bitmap, 512);
341 339
342#ifdef CONFIG_X86_32 340#ifdef CONFIG_X86_32
@@ -848,13 +846,11 @@ struct kvm_arch {
848 unsigned int n_requested_mmu_pages; 846 unsigned int n_requested_mmu_pages;
849 unsigned int n_max_mmu_pages; 847 unsigned int n_max_mmu_pages;
850 unsigned int indirect_shadow_pages; 848 unsigned int indirect_shadow_pages;
851 unsigned long mmu_valid_gen;
852 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 849 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
853 /* 850 /*
854 * Hash table of struct kvm_mmu_page. 851 * Hash table of struct kvm_mmu_page.
855 */ 852 */
856 struct list_head active_mmu_pages; 853 struct list_head active_mmu_pages;
857 struct list_head zapped_obsolete_pages;
858 struct kvm_page_track_notifier_node mmu_sp_tracker; 854 struct kvm_page_track_notifier_node mmu_sp_tracker;
859 struct kvm_page_track_notifier_head track_notifier_head; 855 struct kvm_page_track_notifier_head track_notifier_head;
860 856
@@ -1255,7 +1251,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1255 struct kvm_memory_slot *slot, 1251 struct kvm_memory_slot *slot,
1256 gfn_t gfn_offset, unsigned long mask); 1252 gfn_t gfn_offset, unsigned long mask);
1257void kvm_mmu_zap_all(struct kvm *kvm); 1253void kvm_mmu_zap_all(struct kvm *kvm);
1258void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); 1254void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
1259unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 1255unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
1260void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 1256void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
1261 1257
diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h
new file mode 100644
index 000000000000..1af2cb59233b
--- /dev/null
+++ b/arch/x86/include/asm/kvm_vcpu_regs.h
@@ -0,0 +1,25 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _ASM_X86_KVM_VCPU_REGS_H
3#define _ASM_X86_KVM_VCPU_REGS_H
4
5#define __VCPU_REGS_RAX 0
6#define __VCPU_REGS_RCX 1
7#define __VCPU_REGS_RDX 2
8#define __VCPU_REGS_RBX 3
9#define __VCPU_REGS_RSP 4
10#define __VCPU_REGS_RBP 5
11#define __VCPU_REGS_RSI 6
12#define __VCPU_REGS_RDI 7
13
14#ifdef CONFIG_X86_64
15#define __VCPU_REGS_R8 8
16#define __VCPU_REGS_R9 9
17#define __VCPU_REGS_R10 10
18#define __VCPU_REGS_R11 11
19#define __VCPU_REGS_R12 12
20#define __VCPU_REGS_R13 13
21#define __VCPU_REGS_R14 14
22#define __VCPU_REGS_R15 15
23#endif
24
25#endif /* _ASM_X86_KVM_VCPU_REGS_H */
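
The point of the new header is that assembly code and the C enum in kvm_host.h now share one definition of the GPR indices instead of duplicating magic numbers. A hypothetical compile-time check (not part of the patch) makes the invariant the enum relies on explicit:

#include <linux/kvm_host.h>	/* indirectly pulls in asm/kvm_vcpu_regs.h */

/* The C enum must keep mirroring the assembler-visible indices. */
_Static_assert(VCPU_REGS_RAX == __VCPU_REGS_RAX, "GPR index drift");
_Static_assert(VCPU_REGS_RDI == __VCPU_REGS_RDI, "GPR index drift");
#ifdef CONFIG_X86_64
_Static_assert(VCPU_REGS_R15 == __VCPU_REGS_R15, "GPR index drift");
#endif
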
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e811d4d1c824..904494b924c1 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void)
104 104
105static inline void kvm_sched_clock_init(bool stable) 105static inline void kvm_sched_clock_init(bool stable)
106{ 106{
107 if (!stable) { 107 if (!stable)
108 pv_ops.time.sched_clock = kvm_clock_read;
109 clear_sched_clock_stable(); 108 clear_sched_clock_stable();
110 return;
111 }
112
113 kvm_sched_clock_offset = kvm_clock_read(); 109 kvm_sched_clock_offset = kvm_clock_read();
114 pv_ops.time.sched_clock = kvm_sched_clock_read; 110 pv_ops.time.sched_clock = kvm_sched_clock_read;
115 111
@@ -355,6 +351,20 @@ void __init kvmclock_init(void)
355 machine_ops.crash_shutdown = kvm_crash_shutdown; 351 machine_ops.crash_shutdown = kvm_crash_shutdown;
356#endif 352#endif
357 kvm_get_preset_lpj(); 353 kvm_get_preset_lpj();
354
355 /*
 356 * X86_FEATURE_NONSTOP_TSC means the TSC runs at a constant rate
 357 * across P- and T-states and does not stop in deep C-states.
 358 *
 359 * An invariant TSC exposed by the host means kvmclock is not
 360 * necessary: the TSC can be used as the clocksource instead.
361 *
362 */
363 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
364 boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
365 !check_tsc_unstable())
366 kvm_clock.rating = 299;
367
358 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); 368 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
359 pv_info.name = "KVM"; 369 pv_info.name = "KVM";
360} 370}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c07958b59f50..fd3951638ae4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -405,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
405 F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | 405 F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
406 F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | 406 F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
407 F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | 407 F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
408 F(CLDEMOTE); 408 F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
409 409
410 /* cpuid 7.0.edx*/ 410 /* cpuid 7.0.edx*/
411 const u32 kvm_cpuid_7_0_edx_x86_features = 411 const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 89d20ed1d2e8..27c43525a05f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
1729 1729
1730 mutex_lock(&hv->hv_lock); 1730 mutex_lock(&hv->hv_lock);
1731 ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, 1731 ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
1732 GFP_KERNEL); 1732 GFP_KERNEL_ACCOUNT);
1733 mutex_unlock(&hv->hv_lock); 1733 mutex_unlock(&hv->hv_lock);
1734 1734
1735 if (ret >= 0) 1735 if (ret >= 0)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895b1fc..4a6dc54cc12b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
653 pid_t pid_nr; 653 pid_t pid_nr;
654 int ret; 654 int ret;
655 655
656 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 656 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
657 if (!pit) 657 if (!pit)
658 return NULL; 658 return NULL;
659 659
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bdcd4139eca9..8b38bb4868a6 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
583 struct kvm_pic *s; 583 struct kvm_pic *s;
584 int ret; 584 int ret;
585 585
586 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 586 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
587 if (!s) 587 if (!s)
588 return -ENOMEM; 588 return -ENOMEM;
589 spin_lock_init(&s->lock); 589 spin_lock_init(&s->lock);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 4e822ad363f3..1add1bc881e2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
622 struct kvm_ioapic *ioapic; 622 struct kvm_ioapic *ioapic;
623 int ret; 623 int ret;
624 624
625 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 625 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
626 if (!ioapic) 626 if (!ioapic)
627 return -ENOMEM; 627 return -ENOMEM;
628 spin_lock_init(&ioapic->lock); 628 spin_lock_init(&ioapic->lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4b6c2da7265c..991fdf7fc17f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
181 max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); 181 max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
182 182
183 new = kvzalloc(sizeof(struct kvm_apic_map) + 183 new = kvzalloc(sizeof(struct kvm_apic_map) +
184 sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); 184 sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
185 GFP_KERNEL_ACCOUNT);
185 186
186 if (!new) 187 if (!new)
187 goto out; 188 goto out;
@@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2259 ASSERT(vcpu != NULL); 2260 ASSERT(vcpu != NULL);
2260 apic_debug("apic_init %d\n", vcpu->vcpu_id); 2261 apic_debug("apic_init %d\n", vcpu->vcpu_id);
2261 2262
2262 apic = kzalloc(sizeof(*apic), GFP_KERNEL); 2263 apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
2263 if (!apic) 2264 if (!apic)
2264 goto nomem; 2265 goto nomem;
2265 2266
2266 vcpu->arch.apic = apic; 2267 vcpu->arch.apic = apic;
2267 2268
2268 apic->regs = (void *)get_zeroed_page(GFP_KERNEL); 2269 apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
2269 if (!apic->regs) { 2270 if (!apic->regs) {
2270 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 2271 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
2271 vcpu->vcpu_id); 2272 vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2d1d230d5b8..7837ab001d80 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -109,9 +109,11 @@ module_param(dbg, bool, 0644);
109 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 109 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
110 110
111 111
112#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) 112#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
113#define PT64_DIR_BASE_ADDR_MASK \ 113#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
114 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 114#else
115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
116#endif
115#define PT64_LVL_ADDR_MASK(level) \ 117#define PT64_LVL_ADDR_MASK(level) \
116 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 118 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
117 * PT64_LEVEL_BITS))) - 1)) 119 * PT64_LEVEL_BITS))) - 1))
@@ -330,53 +332,56 @@ static inline bool is_access_track_spte(u64 spte)
330} 332}
331 333
332/* 334/*
333 * the low bit of the generation number is always presumed to be zero. 335 * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
334 * This disables mmio caching during memslot updates. The concept is 336 * the memslots generation and is derived as follows:
335 * similar to a seqcount but instead of retrying the access we just punt
336 * and ignore the cache.
337 * 337 *
338 * spte bits 3-11 are used as bits 1-9 of the generation number, 338 * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
339 * the bits 52-61 are used as bits 10-19 of the generation number. 339 * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
340 *
341 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
342 * the MMIO generation number, as doing so would require stealing a bit from
343 * the "real" generation number and thus effectively halve the maximum number
344 * of MMIO generations that can be handled before encountering a wrap (which
345 * requires a full MMU zap). The flag is instead explicitly queried when
346 * checking for MMIO spte cache hits.
340 */ 347 */
341#define MMIO_SPTE_GEN_LOW_SHIFT 2 348#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
342#define MMIO_SPTE_GEN_HIGH_SHIFT 52
343 349
344#define MMIO_GEN_SHIFT 20 350#define MMIO_SPTE_GEN_LOW_START 3
345#define MMIO_GEN_LOW_SHIFT 10 351#define MMIO_SPTE_GEN_LOW_END 11
346#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) 352#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
347#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) 353 MMIO_SPTE_GEN_LOW_START)
348 354
349static u64 generation_mmio_spte_mask(unsigned int gen) 355#define MMIO_SPTE_GEN_HIGH_START 52
356#define MMIO_SPTE_GEN_HIGH_END 61
357#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
358 MMIO_SPTE_GEN_HIGH_START)
359static u64 generation_mmio_spte_mask(u64 gen)
350{ 360{
351 u64 mask; 361 u64 mask;
352 362
353 WARN_ON(gen & ~MMIO_GEN_MASK); 363 WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
354 364
355 mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; 365 mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
356 mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; 366 mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
357 return mask; 367 return mask;
358} 368}
359 369
360static unsigned int get_mmio_spte_generation(u64 spte) 370static u64 get_mmio_spte_generation(u64 spte)
361{ 371{
362 unsigned int gen; 372 u64 gen;
363 373
364 spte &= ~shadow_mmio_mask; 374 spte &= ~shadow_mmio_mask;
365 375
366 gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; 376 gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
367 gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; 377 gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
368 return gen; 378 return gen;
369} 379}
370 380
371static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
372{
373 return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
374}
375
376static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, 381static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
377 unsigned access) 382 unsigned access)
378{ 383{
379 unsigned int gen = kvm_current_mmio_generation(vcpu); 384 u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
380 u64 mask = generation_mmio_spte_mask(gen); 385 u64 mask = generation_mmio_spte_mask(gen);
381 u64 gpa = gfn << PAGE_SHIFT; 386 u64 gpa = gfn << PAGE_SHIFT;
382 387
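
The new packing is easiest to see as a round trip. The sketch below follows the bit assignment stated in the comment (generation bits 0-8 into SPTE bits 3-11, bits 9-18 into SPTE bits 52-61), re-creates GENMASK_ULL locally, and ignores shadow_mmio_mask and the rest of the SPTE; it is an illustration of the encoding, not a copy of the kernel helpers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* GENMASK_ULL(h, l): bits l..h set, as in include/linux/bits.h */
#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define GEN_MASK	GENMASK_ULL(18, 0)	/* 19-bit MMIO generation      */
#define LOW_START	3			/* gen bits 0-8  -> spte 3-11  */
#define LOW_MASK	GENMASK_ULL(11, 3)
#define HIGH_START	52			/* gen bits 9-18 -> spte 52-61 */
#define HIGH_MASK	GENMASK_ULL(61, 52)

static uint64_t gen_to_spte_bits(uint64_t gen)
{
	uint64_t spte = 0;

	spte |= (gen << LOW_START) & LOW_MASK;			/* bits 0-8  */
	spte |= ((gen >> 9) << HIGH_START) & HIGH_MASK;		/* bits 9-18 */
	return spte;
}

static uint64_t spte_bits_to_gen(uint64_t spte)
{
	uint64_t gen;

	gen  = (spte & LOW_MASK) >> LOW_START;
	gen |= ((spte & HIGH_MASK) >> HIGH_START) << 9;
	return gen;
}

int main(void)
{
	uint64_t gen;

	/* every representable generation survives the round trip ... */
	for (gen = 0; gen <= GEN_MASK; gen++)
		assert(spte_bits_to_gen(gen_to_spte_bits(gen)) == gen);

	/* ... and the packed bits stay out of the PFN area in between */
	printf("gen 0x7ffff packs to %#llx\n",
	       (unsigned long long)gen_to_spte_bits(GEN_MASK));
	return 0;
}

Keeping KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS out of these 19 bits is what preserves the full wrap distance described in the comment.
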
@@ -386,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
386 mask |= (gpa & shadow_nonpresent_or_rsvd_mask) 391 mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
387 << shadow_nonpresent_or_rsvd_mask_len; 392 << shadow_nonpresent_or_rsvd_mask_len;
388 393
394 page_header(__pa(sptep))->mmio_cached = true;
395
389 trace_mark_mmio_spte(sptep, gfn, access, gen); 396 trace_mark_mmio_spte(sptep, gfn, access, gen);
390 mmu_spte_set(sptep, mask); 397 mmu_spte_set(sptep, mask);
391} 398}
@@ -407,7 +414,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
407 414
408static unsigned get_mmio_spte_access(u64 spte) 415static unsigned get_mmio_spte_access(u64 spte)
409{ 416{
410 u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; 417 u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
411 return (spte & ~mask) & ~PAGE_MASK; 418 return (spte & ~mask) & ~PAGE_MASK;
412} 419}
413 420
@@ -424,9 +431,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
424 431
425static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) 432static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
426{ 433{
427 unsigned int kvm_gen, spte_gen; 434 u64 kvm_gen, spte_gen, gen;
428 435
429 kvm_gen = kvm_current_mmio_generation(vcpu); 436 gen = kvm_vcpu_memslots(vcpu)->generation;
437 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
438 return false;
439
440 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
430 spte_gen = get_mmio_spte_generation(spte); 441 spte_gen = get_mmio_spte_generation(spte);
431 442
432 trace_check_mmio_spte(spte, kvm_gen, spte_gen); 443 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -959,7 +970,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
959 if (cache->nobjs >= min) 970 if (cache->nobjs >= min)
960 return 0; 971 return 0;
961 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 972 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
962 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); 973 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
963 if (!obj) 974 if (!obj)
964 return cache->nobjs >= min ? 0 : -ENOMEM; 975 return cache->nobjs >= min ? 0 : -ENOMEM;
965 cache->objects[cache->nobjs++] = obj; 976 cache->objects[cache->nobjs++] = obj;
@@ -2049,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
2049 if (!direct) 2060 if (!direct)
2050 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 2061 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2051 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 2062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2052
2053 /*
2054 * The active_mmu_pages list is the FIFO list, do not move the
2055 * page until it is zapped. kvm_zap_obsolete_pages depends on
2056 * this feature. See the comments in kvm_zap_obsolete_pages().
2057 */
2058 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 2063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2059 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 2064 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2060 return sp; 2065 return sp;
@@ -2195,23 +2200,15 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2195 --kvm->stat.mmu_unsync; 2200 --kvm->stat.mmu_unsync;
2196} 2201}
2197 2202
2198static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2203static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2199 struct list_head *invalid_list); 2204 struct list_head *invalid_list);
2200static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2205static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2201 struct list_head *invalid_list); 2206 struct list_head *invalid_list);
2202 2207
2203/*
2204 * NOTE: we should pay more attention on the zapped-obsolete page
2205 * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
2206 * since it has been deleted from active_mmu_pages but still can be found
2207 * at hast list.
2208 *
2209 * for_each_valid_sp() has skipped that kind of pages.
2210 */
2211#define for_each_valid_sp(_kvm, _sp, _gfn) \ 2208#define for_each_valid_sp(_kvm, _sp, _gfn) \
2212 hlist_for_each_entry(_sp, \ 2209 hlist_for_each_entry(_sp, \
2213 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 2210 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2214 if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ 2211 if ((_sp)->role.invalid) { \
2215 } else 2212 } else
2216 2213
2217#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ 2214#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2231,18 +2228,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2231 return true; 2228 return true;
2232} 2229}
2233 2230
2231static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2232 struct list_head *invalid_list,
2233 bool remote_flush)
2234{
2235 if (!remote_flush && !list_empty(invalid_list))
2236 return false;
2237
2238 if (!list_empty(invalid_list))
2239 kvm_mmu_commit_zap_page(kvm, invalid_list);
2240 else
2241 kvm_flush_remote_tlbs(kvm);
2242 return true;
2243}
2244
2234static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, 2245static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2235 struct list_head *invalid_list, 2246 struct list_head *invalid_list,
2236 bool remote_flush, bool local_flush) 2247 bool remote_flush, bool local_flush)
2237{ 2248{
2238 if (!list_empty(invalid_list)) { 2249 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2239 kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
2240 return; 2250 return;
2241 }
2242 2251
2243 if (remote_flush) 2252 if (local_flush)
2244 kvm_flush_remote_tlbs(vcpu->kvm);
2245 else if (local_flush)
2246 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2253 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2247} 2254}
2248 2255
@@ -2253,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2253static void mmu_audit_disable(void) { } 2260static void mmu_audit_disable(void) { }
2254#endif 2261#endif
2255 2262
2256static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2257{
2258 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2259}
2260
2261static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2263static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2262 struct list_head *invalid_list) 2264 struct list_head *invalid_list)
2263{ 2265{
@@ -2482,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2482 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2484 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2483 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); 2485 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2484 } 2486 }
2485 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2486 clear_page(sp->spt); 2487 clear_page(sp->spt);
2487 trace_kvm_mmu_get_page(sp, true); 2488 trace_kvm_mmu_get_page(sp, true);
2488 2489
@@ -2668,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
2668 return zapped; 2669 return zapped;
2669} 2670}
2670 2671
2671static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2672static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2672 struct list_head *invalid_list) 2673 struct kvm_mmu_page *sp,
2674 struct list_head *invalid_list,
2675 int *nr_zapped)
2673{ 2676{
2674 int ret; 2677 bool list_unstable;
2675 2678
2676 trace_kvm_mmu_prepare_zap_page(sp); 2679 trace_kvm_mmu_prepare_zap_page(sp);
2677 ++kvm->stat.mmu_shadow_zapped; 2680 ++kvm->stat.mmu_shadow_zapped;
2678 ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 2681 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2679 kvm_mmu_page_unlink_children(kvm, sp); 2682 kvm_mmu_page_unlink_children(kvm, sp);
2680 kvm_mmu_unlink_parents(kvm, sp); 2683 kvm_mmu_unlink_parents(kvm, sp);
2681 2684
2685 /* Zapping children means active_mmu_pages has become unstable. */
2686 list_unstable = *nr_zapped;
2687
2682 if (!sp->role.invalid && !sp->role.direct) 2688 if (!sp->role.invalid && !sp->role.direct)
2683 unaccount_shadowed(kvm, sp); 2689 unaccount_shadowed(kvm, sp);
2684 2690
@@ -2686,22 +2692,27 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2686 kvm_unlink_unsync_page(kvm, sp); 2692 kvm_unlink_unsync_page(kvm, sp);
2687 if (!sp->root_count) { 2693 if (!sp->root_count) {
2688 /* Count self */ 2694 /* Count self */
2689 ret++; 2695 (*nr_zapped)++;
2690 list_move(&sp->link, invalid_list); 2696 list_move(&sp->link, invalid_list);
2691 kvm_mod_used_mmu_pages(kvm, -1); 2697 kvm_mod_used_mmu_pages(kvm, -1);
2692 } else { 2698 } else {
2693 list_move(&sp->link, &kvm->arch.active_mmu_pages); 2699 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2694 2700
2695 /* 2701 if (!sp->role.invalid)
2696 * The obsolete pages can not be used on any vcpus.
2697 * See the comments in kvm_mmu_invalidate_zap_all_pages().
2698 */
2699 if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
2700 kvm_reload_remote_mmus(kvm); 2702 kvm_reload_remote_mmus(kvm);
2701 } 2703 }
2702 2704
2703 sp->role.invalid = 1; 2705 sp->role.invalid = 1;
2704 return ret; 2706 return list_unstable;
2707}
2708
2709static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2710 struct list_head *invalid_list)
2711{
2712 int nr_zapped;
2713
2714 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2715 return nr_zapped;
2705} 2716}
2706 2717
2707static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2718static void kvm_mmu_commit_zap_page(struct kvm *kvm,
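__kvm_mmu_prepare_zap_page() now reports whether zapping unsync children perturbed active_mmu_pages (list_unstable), with the page count returned through *nr_zapped; callers that iterate active_mmu_pages use the flag to restart their walk (see __kvm_mmu_zap_all further down), while the kvm_mmu_prepare_zap_page() wrapper keeps the old "did we zap anything" answer. A standalone sketch of that restart-on-unstable-list pattern, using an ordinary singly linked list and a zap callback with hidden side effects as stand-ins:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int id;
	struct node *next;
};

static struct node *head;

/*
 * Stand-in for __kvm_mmu_prepare_zap_page(): removes @n and, for even ids,
 * also removes n's successor (a side effect the walker cannot see).
 * Returns true when anything beyond @n itself was unlinked, i.e. when the
 * walker's saved "next" pointer may now be stale.
 */
static bool zap_node(struct node *n)
{
	bool list_unstable = false;

	if (n->id % 2 == 0 && n->next) {
		struct node *extra = n->next;
		n->next = extra->next;
		free(extra);
		list_unstable = true;
	}

	/* unlink @n itself */
	if (head == n) {
		head = n->next;
	} else {
		struct node *p = head;
		while (p && p->next != n)
			p = p->next;
		if (p)
			p->next = n->next;
	}
	free(n);
	return list_unstable;
}

int main(void)
{
	for (int i = 5; i >= 1; i--) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}

restart:
	for (struct node *n = head; n; ) {
		struct node *next = n->next;	/* may go stale below */

		printf("zapping %d\n", n->id);
		if (zap_node(n))
			goto restart;		/* list changed under us */
		n = next;
	}
	return 0;
}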
@@ -3703,7 +3714,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3703 3714
3704 u64 *lm_root; 3715 u64 *lm_root;
3705 3716
3706 lm_root = (void*)get_zeroed_page(GFP_KERNEL); 3717 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3707 if (lm_root == NULL) 3718 if (lm_root == NULL)
3708 return 1; 3719 return 1;
3709 3720
@@ -4204,14 +4215,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4204 return false; 4215 return false;
4205 4216
4206 if (cached_root_available(vcpu, new_cr3, new_role)) { 4217 if (cached_root_available(vcpu, new_cr3, new_role)) {
4207 /*
4208 * It is possible that the cached previous root page is
4209 * obsolete because of a change in the MMU
4210 * generation number. However, that is accompanied by
4211 * KVM_REQ_MMU_RELOAD, which will free the root that we
4212 * have set here and allocate a new one.
4213 */
4214
4215 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); 4218 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4216 if (!skip_tlb_flush) { 4219 if (!skip_tlb_flush) {
4217 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4220 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5486,6 +5489,76 @@ void kvm_disable_tdp(void)
5486} 5489}
5487EXPORT_SYMBOL_GPL(kvm_disable_tdp); 5490EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5488 5491
5492
5493/* The return value indicates if tlb flush on all vcpus is needed. */
5494typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5495
5496/* The caller should hold mmu-lock before calling this function. */
5497static __always_inline bool
5498slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5499 slot_level_handler fn, int start_level, int end_level,
5500 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5501{
5502 struct slot_rmap_walk_iterator iterator;
5503 bool flush = false;
5504
5505 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5506 end_gfn, &iterator) {
5507 if (iterator.rmap)
5508 flush |= fn(kvm, iterator.rmap);
5509
5510 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5511 if (flush && lock_flush_tlb) {
5512 kvm_flush_remote_tlbs(kvm);
5513 flush = false;
5514 }
5515 cond_resched_lock(&kvm->mmu_lock);
5516 }
5517 }
5518
5519 if (flush && lock_flush_tlb) {
5520 kvm_flush_remote_tlbs(kvm);
5521 flush = false;
5522 }
5523
5524 return flush;
5525}
5526
5527static __always_inline bool
5528slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5529 slot_level_handler fn, int start_level, int end_level,
5530 bool lock_flush_tlb)
5531{
5532 return slot_handle_level_range(kvm, memslot, fn, start_level,
5533 end_level, memslot->base_gfn,
5534 memslot->base_gfn + memslot->npages - 1,
5535 lock_flush_tlb);
5536}
5537
5538static __always_inline bool
5539slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5540 slot_level_handler fn, bool lock_flush_tlb)
5541{
5542 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5543 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5544}
5545
5546static __always_inline bool
5547slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5548 slot_level_handler fn, bool lock_flush_tlb)
5549{
5550 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5551 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5552}
5553
5554static __always_inline bool
5555slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5556 slot_level_handler fn, bool lock_flush_tlb)
5557{
5558 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5559 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5560}
5561
5489static void free_mmu_pages(struct kvm_vcpu *vcpu) 5562static void free_mmu_pages(struct kvm_vcpu *vcpu)
5490{ 5563{
5491 free_page((unsigned long)vcpu->arch.mmu->pae_root); 5564 free_page((unsigned long)vcpu->arch.mmu->pae_root);
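The slot_handle_*() walkers are only moved up here, ahead of their new caller; their bodies are unchanged. slot_handle_level_range() accumulates a needs-flush bit from the per-rmap handler and, when lock_flush_tlb is set, flushes before every cond_resched_lock() lock break (and once at the end) so no stale translation survives while mmu_lock is dropped. A compact standalone model of that yield-with-deferred-flush pattern (the handler, lock and flush below are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static void flush_remote_tlbs(void) { puts("  flush"); }
static void yield_lock(void)        { puts("  drop and re-take lock"); }

typedef bool (*range_handler)(int idx);	/* returns true if a flush is needed */

/*
 * Walk [start, end) with @fn.  When @flush_on_yield is set, flush before
 * every lock break so no other CPU can use a zapped translation while the
 * lock is dropped; otherwise the pending flush is returned to the caller.
 */
static bool walk_range(int start, int end, range_handler fn,
		       int resched_every, bool flush_on_yield)
{
	bool flush = false;

	for (int i = start; i < end; i++) {
		flush |= fn(i);

		if ((i + 1) % resched_every == 0) {	/* stand-in for need_resched() */
			if (flush && flush_on_yield) {
				flush_remote_tlbs();
				flush = false;
			}
			yield_lock();
		}
	}

	if (flush && flush_on_yield) {
		flush_remote_tlbs();
		flush = false;
	}
	return flush;	/* caller must flush if still true */
}

static bool zap_if_odd(int idx)
{
	printf("handle %d\n", idx);
	return idx % 2;		/* pretend odd indexes required zapping */
}

int main(void)
{
	walk_range(0, 10, zap_if_odd, 4, true);
	return 0;
}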
@@ -5505,7 +5578,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
5505 * Therefore we need to allocate shadow page tables in the first 5578 * Therefore we need to allocate shadow page tables in the first
5506 * 4GB of memory, which happens to fit the DMA32 zone. 5579 * 4GB of memory, which happens to fit the DMA32 zone.
5507 */ 5580 */
5508 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 5581 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5509 if (!page) 5582 if (!page)
5510 return -ENOMEM; 5583 return -ENOMEM;
5511 5584
@@ -5543,105 +5616,62 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5543 struct kvm_memory_slot *slot, 5616 struct kvm_memory_slot *slot,
5544 struct kvm_page_track_notifier_node *node) 5617 struct kvm_page_track_notifier_node *node)
5545{ 5618{
5546 kvm_mmu_invalidate_zap_all_pages(kvm); 5619 struct kvm_mmu_page *sp;
5547} 5620 LIST_HEAD(invalid_list);
5548 5621 unsigned long i;
5549void kvm_mmu_init_vm(struct kvm *kvm) 5622 bool flush;
5550{ 5623 gfn_t gfn;
5551 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5552
5553 node->track_write = kvm_mmu_pte_write;
5554 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5555 kvm_page_track_register_notifier(kvm, node);
5556}
5557 5624
5558void kvm_mmu_uninit_vm(struct kvm *kvm) 5625 spin_lock(&kvm->mmu_lock);
5559{
5560 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5561 5626
5562 kvm_page_track_unregister_notifier(kvm, node); 5627 if (list_empty(&kvm->arch.active_mmu_pages))
5563} 5628 goto out_unlock;
5564 5629
5565/* The return value indicates if tlb flush on all vcpus is needed. */ 5630 flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
5566typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5567 5631
5568/* The caller should hold mmu-lock before calling this function. */ 5632 for (i = 0; i < slot->npages; i++) {
5569static __always_inline bool 5633 gfn = slot->base_gfn + i;
5570slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5571 slot_level_handler fn, int start_level, int end_level,
5572 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5573{
5574 struct slot_rmap_walk_iterator iterator;
5575 bool flush = false;
5576 5634
5577 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 5635 for_each_valid_sp(kvm, sp, gfn) {
5578 end_gfn, &iterator) { 5636 if (sp->gfn != gfn)
5579 if (iterator.rmap) 5637 continue;
5580 flush |= fn(kvm, iterator.rmap);
5581 5638
5639 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
5640 }
5582 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { 5641 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5583 if (flush && lock_flush_tlb) { 5642 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5584 kvm_flush_remote_tlbs(kvm); 5643 flush = false;
5585 flush = false;
5586 }
5587 cond_resched_lock(&kvm->mmu_lock); 5644 cond_resched_lock(&kvm->mmu_lock);
5588 } 5645 }
5589 } 5646 }
5647 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
5590 5648
5591 if (flush && lock_flush_tlb) { 5649out_unlock:
5592 kvm_flush_remote_tlbs(kvm); 5650 spin_unlock(&kvm->mmu_lock);
5593 flush = false;
5594 }
5595
5596 return flush;
5597} 5651}
5598 5652
5599static __always_inline bool 5653void kvm_mmu_init_vm(struct kvm *kvm)
5600slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5601 slot_level_handler fn, int start_level, int end_level,
5602 bool lock_flush_tlb)
5603{ 5654{
5604 return slot_handle_level_range(kvm, memslot, fn, start_level, 5655 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5605 end_level, memslot->base_gfn,
5606 memslot->base_gfn + memslot->npages - 1,
5607 lock_flush_tlb);
5608}
5609 5656
5610static __always_inline bool 5657 node->track_write = kvm_mmu_pte_write;
5611slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 5658 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5612 slot_level_handler fn, bool lock_flush_tlb) 5659 kvm_page_track_register_notifier(kvm, node);
5613{
5614 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5615 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5616} 5660}
5617 5661
5618static __always_inline bool 5662void kvm_mmu_uninit_vm(struct kvm *kvm)
5619slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5620 slot_level_handler fn, bool lock_flush_tlb)
5621{ 5663{
5622 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, 5664 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5623 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5624}
5625 5665
5626static __always_inline bool 5666 kvm_page_track_unregister_notifier(kvm, node);
5627slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5628 slot_level_handler fn, bool lock_flush_tlb)
5629{
5630 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5631 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5632} 5667}
5633 5668
5634void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5669void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5635{ 5670{
5636 struct kvm_memslots *slots; 5671 struct kvm_memslots *slots;
5637 struct kvm_memory_slot *memslot; 5672 struct kvm_memory_slot *memslot;
5638 bool flush_tlb = true;
5639 bool flush = false;
5640 int i; 5673 int i;
5641 5674
5642 if (kvm_available_flush_tlb_with_range())
5643 flush_tlb = false;
5644
5645 spin_lock(&kvm->mmu_lock); 5675 spin_lock(&kvm->mmu_lock);
5646 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 5676 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5647 slots = __kvm_memslots(kvm, i); 5677 slots = __kvm_memslots(kvm, i);
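kvm_mmu_invalidate_zap_pages_in_memslot() no longer invalidates every shadow page in the VM: it zaps the slot's rmaps via slot_handle_all_level(..., kvm_zap_rmapp, ...), then walks each gfn of the slot through for_each_valid_sp(), batching matches onto a local invalid list that kvm_mmu_remote_flush_or_zap() commits (or plainly flushes) at every lock break and once at the end. A toy model of the per-gfn hash-bucket lookup and zap; the bucket function and table size are stand-ins for kvm_page_table_hashfn() and the real mmu_page_hash:

#include <stdio.h>
#include <stdlib.h>

#define NR_BUCKETS 16	/* illustrative size, not the kernel's */

struct shadow_page {
	unsigned long gfn;
	struct shadow_page *next;	/* hash chain */
};

static struct shadow_page *buckets[NR_BUCKETS];

static unsigned bucket_of(unsigned long gfn)
{
	return gfn % NR_BUCKETS;	/* stand-in for kvm_page_table_hashfn() */
}

static void track_page(unsigned long gfn)
{
	struct shadow_page *sp = malloc(sizeof(*sp));
	sp->gfn = gfn;
	sp->next = buckets[bucket_of(gfn)];
	buckets[bucket_of(gfn)] = sp;
}

/* Zap only the shadow pages backing gfns in [base, base + npages). */
static void zap_slot(unsigned long base, unsigned long npages)
{
	for (unsigned long i = 0; i < npages; i++) {
		unsigned long gfn = base + i;
		struct shadow_page **pp = &buckets[bucket_of(gfn)];

		while (*pp) {
			struct shadow_page *sp = *pp;
			if (sp->gfn != gfn) {	/* other gfns share the bucket */
				pp = &sp->next;
				continue;
			}
			printf("zap sp for gfn %lu\n", gfn);
			*pp = sp->next;
			free(sp);
		}
	}
}

int main(void)
{
	track_page(3); track_page(19); track_page(7); track_page(100);
	zap_slot(0, 32);	/* zaps 3, 7 and 19 but leaves 100 alone */
	return 0;
}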
@@ -5653,17 +5683,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5653 if (start >= end) 5683 if (start >= end)
5654 continue; 5684 continue;
5655 5685
5656 flush |= slot_handle_level_range(kvm, memslot, 5686 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5657 kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, 5687 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5658 PT_MAX_HUGEPAGE_LEVEL, start, 5688 start, end - 1, true);
5659 end - 1, flush_tlb);
5660 } 5689 }
5661 } 5690 }
5662 5691
5663 if (flush)
5664 kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
5665 gfn_end - gfn_start + 1);
5666
5667 spin_unlock(&kvm->mmu_lock); 5692 spin_unlock(&kvm->mmu_lock);
5668} 5693}
5669 5694
@@ -5815,101 +5840,58 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5815} 5840}
5816EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); 5841EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5817 5842
5818#define BATCH_ZAP_PAGES 10 5843static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
5819static void kvm_zap_obsolete_pages(struct kvm *kvm)
5820{ 5844{
5821 struct kvm_mmu_page *sp, *node; 5845 struct kvm_mmu_page *sp, *node;
5822 int batch = 0; 5846 LIST_HEAD(invalid_list);
5847 int ign;
5823 5848
5849 spin_lock(&kvm->mmu_lock);
5824restart: 5850restart:
5825 list_for_each_entry_safe_reverse(sp, node, 5851 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5826 &kvm->arch.active_mmu_pages, link) { 5852 if (mmio_only && !sp->mmio_cached)
5827 int ret;
5828
5829 /*
5830 * No obsolete page exists before new created page since
5831 * active_mmu_pages is the FIFO list.
5832 */
5833 if (!is_obsolete_sp(kvm, sp))
5834 break;
5835
5836 /*
5837 * Since we are reversely walking the list and the invalid
5838 * list will be moved to the head, skip the invalid page
5839 * can help us to avoid the infinity list walking.
5840 */
5841 if (sp->role.invalid)
5842 continue; 5853 continue;
5843 5854 if (sp->role.invalid && sp->root_count)
5844 /* 5855 continue;
5845 * Need not flush tlb since we only zap the sp with invalid 5856 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
5846 * generation number. 5857 WARN_ON_ONCE(mmio_only);
5847 */
5848 if (batch >= BATCH_ZAP_PAGES &&
5849 cond_resched_lock(&kvm->mmu_lock)) {
5850 batch = 0;
5851 goto restart; 5858 goto restart;
5852 } 5859 }
5853 5860 if (cond_resched_lock(&kvm->mmu_lock))
5854 ret = kvm_mmu_prepare_zap_page(kvm, sp,
5855 &kvm->arch.zapped_obsolete_pages);
5856 batch += ret;
5857
5858 if (ret)
5859 goto restart; 5861 goto restart;
5860 } 5862 }
5861 5863
5862 /* 5864 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5863 * Should flush tlb before free page tables since lockless-walking
5864 * may use the pages.
5865 */
5866 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5867}
5868
5869/*
5870 * Fast invalidate all shadow pages and use lock-break technique
5871 * to zap obsolete pages.
5872 *
5873 * It's required when memslot is being deleted or VM is being
5874 * destroyed, in these cases, we should ensure that KVM MMU does
5875 * not use any resource of the being-deleted slot or all slots
5876 * after calling the function.
5877 */
5878void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
5879{
5880 spin_lock(&kvm->mmu_lock);
5881 trace_kvm_mmu_invalidate_zap_all_pages(kvm);
5882 kvm->arch.mmu_valid_gen++;
5883
5884 /*
5885 * Notify all vcpus to reload its shadow page table
5886 * and flush TLB. Then all vcpus will switch to new
5887 * shadow page table with the new mmu_valid_gen.
5888 *
5889 * Note: we should do this under the protection of
5890 * mmu-lock, otherwise, vcpu would purge shadow page
5891 * but miss tlb flush.
5892 */
5893 kvm_reload_remote_mmus(kvm);
5894
5895 kvm_zap_obsolete_pages(kvm);
5896 spin_unlock(&kvm->mmu_lock); 5865 spin_unlock(&kvm->mmu_lock);
5897} 5866}
5898 5867
5899static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 5868void kvm_mmu_zap_all(struct kvm *kvm)
5900{ 5869{
5901 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 5870 return __kvm_mmu_zap_all(kvm, false);
5902} 5871}
5903 5872
5904void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) 5873void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5905{ 5874{
5875 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5876
5877 gen &= MMIO_SPTE_GEN_MASK;
5878
5906 /* 5879 /*
5907 * The very rare case: if the generation-number is round, 5880 * Generation numbers are incremented in multiples of the number of
5881 * address spaces in order to provide unique generations across all
5882 * address spaces. Strip what is effectively the address space
5883 * modifier prior to checking for a wrap of the MMIO generation so
5884 * that a wrap in any address space is detected.
5885 */
5886 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
5887
5888 /*
5889 * The very rare case: if the MMIO generation number has wrapped,
5908 * zap all shadow pages. 5890 * zap all shadow pages.
5909 */ 5891 */
5910 if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { 5892 if (unlikely(gen == 0)) {
5911 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); 5893 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
5912 kvm_mmu_invalidate_zap_all_pages(kvm); 5894 __kvm_mmu_zap_all(kvm, true);
5913 } 5895 }
5914} 5896}
5915 5897
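kvm_mmu_invalidate_mmio_sptes() now takes the raw memslot generation: it warns if KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS is still set, keeps only the MMIO_SPTE_GEN_MASK bits, then strips the low bits that encode the address space (generations advance in multiples of KVM_ADDRESS_SPACE_NUM) so a wrap in any address space is caught; only a wrap to zero triggers the mmio-only variant of __kvm_mmu_zap_all(). A standalone sketch of that masking arithmetic; the constants below only mirror the kernel names, their bit positions are illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative layout, not the kernel's exact constants. */
#define GEN_UPDATE_IN_PROGRESS	(1ull << 63)		/* memslot update still in flight */
#define SPTE_GEN_MASK		((1ull << 19) - 1)	/* generation bits kept in MMIO sptes */
#define NR_ADDRESS_SPACES	2ull			/* SMM and non-SMM */

/* Returns non-zero when all MMIO sptes must be zapped. */
static int mmio_generation_wrapped(uint64_t slots_generation)
{
	uint64_t gen = slots_generation;

	assert(!(gen & GEN_UPDATE_IN_PROGRESS));	/* caller must pass a stable generation */

	gen &= SPTE_GEN_MASK;			/* only these bits fit in an MMIO spte */
	gen &= ~(NR_ADDRESS_SPACES - 1);	/* drop the address-space modifier bits */

	return gen == 0;
}

int main(void)
{
	printf("%d\n", mmio_generation_wrapped(0));			/* wrapped */
	printf("%d\n", mmio_generation_wrapped(1));			/* 0 after masking: wrapped */
	printf("%d\n", mmio_generation_wrapped(2));			/* not wrapped */
	printf("%d\n", mmio_generation_wrapped(SPTE_GEN_MASK + 2));	/* wrapped back around */
	return 0;
}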
@@ -5940,24 +5922,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
5940 * want to shrink a VM that only started to populate its MMU 5922 * want to shrink a VM that only started to populate its MMU
5941 * anyway. 5923 * anyway.
5942 */ 5924 */
5943 if (!kvm->arch.n_used_mmu_pages && 5925 if (!kvm->arch.n_used_mmu_pages)
5944 !kvm_has_zapped_obsolete_pages(kvm))
5945 continue; 5926 continue;
5946 5927
5947 idx = srcu_read_lock(&kvm->srcu); 5928 idx = srcu_read_lock(&kvm->srcu);
5948 spin_lock(&kvm->mmu_lock); 5929 spin_lock(&kvm->mmu_lock);
5949 5930
5950 if (kvm_has_zapped_obsolete_pages(kvm)) {
5951 kvm_mmu_commit_zap_page(kvm,
5952 &kvm->arch.zapped_obsolete_pages);
5953 goto unlock;
5954 }
5955
5956 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) 5931 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
5957 freed++; 5932 freed++;
5958 kvm_mmu_commit_zap_page(kvm, &invalid_list); 5933 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5959 5934
5960unlock:
5961 spin_unlock(&kvm->mmu_lock); 5935 spin_unlock(&kvm->mmu_lock);
5962 srcu_read_unlock(&kvm->srcu, idx); 5936 srcu_read_unlock(&kvm->srcu, idx);
5963 5937
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7b333147c4a..bbdc60f2fae8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
203 return -(u32)fault & errcode; 203 return -(u32)fault & errcode;
204} 204}
205 205
206void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
207void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); 206void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
208 207
209void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); 208void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c73bf4e4988c..9f6c855a0043 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,18 +8,16 @@
8#undef TRACE_SYSTEM 8#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvmmmu 9#define TRACE_SYSTEM kvmmmu
10 10
11#define KVM_MMU_PAGE_FIELDS \ 11#define KVM_MMU_PAGE_FIELDS \
12 __field(unsigned long, mmu_valid_gen) \ 12 __field(__u64, gfn) \
13 __field(__u64, gfn) \ 13 __field(__u32, role) \
14 __field(__u32, role) \ 14 __field(__u32, root_count) \
15 __field(__u32, root_count) \
16 __field(bool, unsync) 15 __field(bool, unsync)
17 16
18#define KVM_MMU_PAGE_ASSIGN(sp) \ 17#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->mmu_valid_gen = sp->mmu_valid_gen; \ 18 __entry->gfn = sp->gfn; \
20 __entry->gfn = sp->gfn; \ 19 __entry->role = sp->role.word; \
21 __entry->role = sp->role.word; \ 20 __entry->root_count = sp->root_count; \
22 __entry->root_count = sp->root_count; \
23 __entry->unsync = sp->unsync; 21 __entry->unsync = sp->unsync;
24 22
25#define KVM_MMU_PAGE_PRINTK() ({ \ 23#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -31,9 +29,8 @@
31 \ 29 \
32 role.word = __entry->role; \ 30 role.word = __entry->role; \
33 \ 31 \
34 trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ 32 trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
35 " %snxe %sad root %u %s%c", \ 33 " %snxe %sad root %u %s%c", \
36 __entry->mmu_valid_gen, \
37 __entry->gfn, role.level, \ 34 __entry->gfn, role.level, \
38 role.cr4_pae ? " pae" : "", \ 35 role.cr4_pae ? " pae" : "", \
39 role.quadrant, \ 36 role.quadrant, \
@@ -283,27 +280,6 @@ TRACE_EVENT(
283); 280);
284 281
285TRACE_EVENT( 282TRACE_EVENT(
286 kvm_mmu_invalidate_zap_all_pages,
287 TP_PROTO(struct kvm *kvm),
288 TP_ARGS(kvm),
289
290 TP_STRUCT__entry(
291 __field(unsigned long, mmu_valid_gen)
292 __field(unsigned int, mmu_used_pages)
293 ),
294
295 TP_fast_assign(
296 __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
297 __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
298 ),
299
300 TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
301 __entry->mmu_valid_gen, __entry->mmu_used_pages
302 )
303);
304
305
306TRACE_EVENT(
307 check_mmio_spte, 283 check_mmio_spte,
308 TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), 284 TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
309 TP_ARGS(spte, kvm_gen, spte_gen), 285 TP_ARGS(spte, kvm_gen, spte_gen),
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a3065..fd04d462fdae 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
42 for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { 42 for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
43 slot->arch.gfn_track[i] = 43 slot->arch.gfn_track[i] =
44 kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), 44 kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
45 GFP_KERNEL); 45 GFP_KERNEL_ACCOUNT);
46 if (!slot->arch.gfn_track[i]) 46 if (!slot->arch.gfn_track[i])
47 goto track_free; 47 goto track_free;
48 } 48 }
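From here on, much of the churn is allocations whose lifetime is tied to a VM or vCPU moving from GFP_KERNEL (or a plain vzalloc()) to GFP_KERNEL_ACCOUNT, i.e. GFP_KERNEL with __GFP_ACCOUNT set, so the memory is charged to the VM's memory cgroup; allocations shared across VMs, such as vmx_bitmap and the L1D flush pages later in this series, deliberately stay unaccounted. A rough illustration of the policy; the flag bit values below are stand-ins, not the kernel's:

#include <stdio.h>

/* Stand-in flag bits; the kernel's values differ. */
#define GFP_KERNEL		0x01u
#define __GFP_ACCOUNT		0x02u
#define GFP_KERNEL_ACCOUNT	(GFP_KERNEL | __GFP_ACCOUNT)

/*
 * Pick allocation flags for a KVM object: anything whose lifetime is bound
 * to a VM gets charged to that VM's memory cgroup, global state does not.
 */
static unsigned int kvm_gfp(int vm_scoped)
{
	return vm_scoped ? GFP_KERNEL_ACCOUNT : GFP_KERNEL;
}

int main(void)
{
	printf("vm-scoped flags: %#x\n", kvm_gfp(1));
	printf("global flags:    %#x\n", kvm_gfp(0));
	return 0;
}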
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f13a3a24d360..b5b128a0a051 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -145,7 +145,6 @@ struct kvm_svm {
145 145
146 /* Struct members for AVIC */ 146 /* Struct members for AVIC */
147 u32 avic_vm_id; 147 u32 avic_vm_id;
148 u32 ldr_mode;
149 struct page *avic_logical_id_table_page; 148 struct page *avic_logical_id_table_page;
150 struct page *avic_physical_id_table_page; 149 struct page *avic_physical_id_table_page;
151 struct hlist_node hnode; 150 struct hlist_node hnode;
@@ -236,6 +235,7 @@ struct vcpu_svm {
236 bool nrips_enabled : 1; 235 bool nrips_enabled : 1;
237 236
238 u32 ldr_reg; 237 u32 ldr_reg;
238 u32 dfr_reg;
239 struct page *avic_backing_page; 239 struct page *avic_backing_page;
240 u64 *avic_physical_id_cache; 240 u64 *avic_physical_id_cache;
241 bool avic_is_running; 241 bool avic_is_running;
@@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
1795 /* Avoid using vmalloc for smaller buffers. */ 1795 /* Avoid using vmalloc for smaller buffers. */
1796 size = npages * sizeof(struct page *); 1796 size = npages * sizeof(struct page *);
1797 if (size > PAGE_SIZE) 1797 if (size > PAGE_SIZE)
1798 pages = vmalloc(size); 1798 pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
1799 PAGE_KERNEL);
1799 else 1800 else
1800 pages = kmalloc(size, GFP_KERNEL); 1801 pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
1801 1802
1802 if (!pages) 1803 if (!pages)
1803 return NULL; 1804 return NULL;
@@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
1865 1866
1866static struct kvm *svm_vm_alloc(void) 1867static struct kvm *svm_vm_alloc(void)
1867{ 1868{
1868 struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); 1869 struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
1870 GFP_KERNEL_ACCOUNT | __GFP_ZERO,
1871 PAGE_KERNEL);
1869 return &kvm_svm->kvm; 1872 return &kvm_svm->kvm;
1870} 1873}
1871 1874
@@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm)
1940 return 0; 1943 return 0;
1941 1944
1942 /* Allocating physical APIC ID table (4KB) */ 1945 /* Allocating physical APIC ID table (4KB) */
1943 p_page = alloc_page(GFP_KERNEL); 1946 p_page = alloc_page(GFP_KERNEL_ACCOUNT);
1944 if (!p_page) 1947 if (!p_page)
1945 goto free_avic; 1948 goto free_avic;
1946 1949
@@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm)
1948 clear_page(page_address(p_page)); 1951 clear_page(page_address(p_page));
1949 1952
1950 /* Allocating logical APIC ID table (4KB) */ 1953 /* Allocating logical APIC ID table (4KB) */
1951 l_page = alloc_page(GFP_KERNEL); 1954 l_page = alloc_page(GFP_KERNEL_ACCOUNT);
1952 if (!l_page) 1955 if (!l_page)
1953 goto free_avic; 1956 goto free_avic;
1954 1957
@@ -2106,6 +2109,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
2106 2109
2107 INIT_LIST_HEAD(&svm->ir_list); 2110 INIT_LIST_HEAD(&svm->ir_list);
2108 spin_lock_init(&svm->ir_list_lock); 2111 spin_lock_init(&svm->ir_list_lock);
2112 svm->dfr_reg = APIC_DFR_FLAT;
2109 2113
2110 return ret; 2114 return ret;
2111} 2115}
@@ -2119,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
2119 struct page *nested_msrpm_pages; 2123 struct page *nested_msrpm_pages;
2120 int err; 2124 int err;
2121 2125
2122 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 2126 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
2123 if (!svm) { 2127 if (!svm) {
2124 err = -ENOMEM; 2128 err = -ENOMEM;
2125 goto out; 2129 goto out;
2126 } 2130 }
2127 2131
2128 svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); 2132 svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
2133 GFP_KERNEL_ACCOUNT);
2129 if (!svm->vcpu.arch.guest_fpu) { 2134 if (!svm->vcpu.arch.guest_fpu) {
2130 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 2135 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
2131 err = -ENOMEM; 2136 err = -ENOMEM;
@@ -2137,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
2137 goto free_svm; 2142 goto free_svm;
2138 2143
2139 err = -ENOMEM; 2144 err = -ENOMEM;
2140 page = alloc_page(GFP_KERNEL); 2145 page = alloc_page(GFP_KERNEL_ACCOUNT);
2141 if (!page) 2146 if (!page)
2142 goto uninit; 2147 goto uninit;
2143 2148
2144 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 2149 msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
2145 if (!msrpm_pages) 2150 if (!msrpm_pages)
2146 goto free_page1; 2151 goto free_page1;
2147 2152
2148 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 2153 nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
2149 if (!nested_msrpm_pages) 2154 if (!nested_msrpm_pages)
2150 goto free_page2; 2155 goto free_page2;
2151 2156
2152 hsave_page = alloc_page(GFP_KERNEL); 2157 hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
2153 if (!hsave_page) 2158 if (!hsave_page)
2154 goto free_page3; 2159 goto free_page3;
2155 2160
@@ -4565,8 +4570,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
4565 return &logical_apic_id_table[index]; 4570 return &logical_apic_id_table[index];
4566} 4571}
4567 4572
4568static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, 4573static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
4569 bool valid)
4570{ 4574{
4571 bool flat; 4575 bool flat;
4572 u32 *entry, new_entry; 4576 u32 *entry, new_entry;
@@ -4579,31 +4583,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
4579 new_entry = READ_ONCE(*entry); 4583 new_entry = READ_ONCE(*entry);
4580 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; 4584 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
4581 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); 4585 new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
4582 if (valid) 4586 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4583 new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4584 else
4585 new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4586 WRITE_ONCE(*entry, new_entry); 4587 WRITE_ONCE(*entry, new_entry);
4587 4588
4588 return 0; 4589 return 0;
4589} 4590}
4590 4591
4592static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
4593{
4594 struct vcpu_svm *svm = to_svm(vcpu);
4595 bool flat = svm->dfr_reg == APIC_DFR_FLAT;
4596 u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
4597
4598 if (entry)
4599 WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK);
4600}
4601
4591static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) 4602static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
4592{ 4603{
4593 int ret; 4604 int ret = 0;
4594 struct vcpu_svm *svm = to_svm(vcpu); 4605 struct vcpu_svm *svm = to_svm(vcpu);
4595 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); 4606 u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
4596 4607
4597 if (!ldr) 4608 if (ldr == svm->ldr_reg)
4598 return 1; 4609 return 0;
4599 4610
4600 ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); 4611 avic_invalidate_logical_id_entry(vcpu);
4601 if (ret && svm->ldr_reg) { 4612
4602 avic_ldr_write(vcpu, 0, svm->ldr_reg, false); 4613 if (ldr)
4603 svm->ldr_reg = 0; 4614 ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
4604 } else { 4615
4616 if (!ret)
4605 svm->ldr_reg = ldr; 4617 svm->ldr_reg = ldr;
4606 } 4618
4607 return ret; 4619 return ret;
4608} 4620}
4609 4621
@@ -4637,27 +4649,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
4637 return 0; 4649 return 0;
4638} 4650}
4639 4651
4640static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) 4652static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
4641{ 4653{
4642 struct vcpu_svm *svm = to_svm(vcpu); 4654 struct vcpu_svm *svm = to_svm(vcpu);
4643 struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
4644 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); 4655 u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
4645 u32 mod = (dfr >> 28) & 0xf;
4646 4656
4647 /* 4657 if (svm->dfr_reg == dfr)
4648 * We assume that all local APICs are using the same type. 4658 return;
4649 * If this changes, we need to flush the AVIC logical
4650 * APID id table.
4651 */
4652 if (kvm_svm->ldr_mode == mod)
4653 return 0;
4654
4655 clear_page(page_address(kvm_svm->avic_logical_id_table_page));
4656 kvm_svm->ldr_mode = mod;
4657 4659
4658 if (svm->ldr_reg) 4660 avic_invalidate_logical_id_entry(vcpu);
4659 avic_handle_ldr_update(vcpu); 4661 svm->dfr_reg = dfr;
4660 return 0;
4661} 4662}
4662 4663
4663static int avic_unaccel_trap_write(struct vcpu_svm *svm) 4664static int avic_unaccel_trap_write(struct vcpu_svm *svm)
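AVIC now caches the last-seen LDR and DFR per vCPU (svm->ldr_reg, svm->dfr_reg) instead of the per-VM ldr_mode, and on a change it only clears the valid bit of that vCPU's old logical APIC ID table entry before installing the new mapping, rather than wiping the whole table. A simplified standalone model of the entry layout and the invalidate-then-rewrite flow; the field positions are made up, standing in for the AVIC_LOGICAL_ID_ENTRY_* masks:

#include <stdint.h>
#include <stdio.h>

/* Stand-in layout: bits 0-7 hold the guest physical APIC ID, bit 31 is "valid". */
#define ENTRY_GUEST_PHYS_ID_MASK	0xffu
#define ENTRY_VALID_MASK		(1u << 31)

static uint32_t logical_id_table[32];	/* one entry per logical APIC ID */

static void ldr_write(unsigned index, uint32_t guest_phys_id)
{
	uint32_t entry = logical_id_table[index];

	entry &= ~ENTRY_GUEST_PHYS_ID_MASK;
	entry |= guest_phys_id & ENTRY_GUEST_PHYS_ID_MASK;
	entry |= ENTRY_VALID_MASK;
	logical_id_table[index] = entry;
}

static void invalidate_entry(unsigned index)
{
	logical_id_table[index] &= ~ENTRY_VALID_MASK;
}

/* On an LDR or DFR change: drop the stale mapping, then install the new one. */
static void handle_ldr_update(unsigned old_index, unsigned new_index, uint32_t id)
{
	invalidate_entry(old_index);
	ldr_write(new_index, id);
}

int main(void)
{
	ldr_write(3, 7);
	handle_ldr_update(3, 5, 7);	/* guest moved its logical ID */
	printf("entry 3 = %#x, entry 5 = %#x\n",
	       (unsigned)logical_id_table[3], (unsigned)logical_id_table[5]);
	return 0;
}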
@@ -5125,11 +5126,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
5125 struct vcpu_svm *svm = to_svm(vcpu); 5126 struct vcpu_svm *svm = to_svm(vcpu);
5126 struct vmcb *vmcb = svm->vmcb; 5127 struct vmcb *vmcb = svm->vmcb;
5127 5128
5128 if (!kvm_vcpu_apicv_active(&svm->vcpu)) 5129 if (kvm_vcpu_apicv_active(vcpu))
5129 return; 5130 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
5130 5131 else
5131 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; 5132 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
5132 mark_dirty(vmcb, VMCB_INTR); 5133 mark_dirty(vmcb, VMCB_AVIC);
5133} 5134}
5134 5135
5135static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 5136static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5195,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
5195 * Allocating new amd_iommu_pi_data, which will get 5196 * Allocating new amd_iommu_pi_data, which will get
5196 * add to the per-vcpu ir_list. 5197 * add to the per-vcpu ir_list.
5197 */ 5198 */
5198 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); 5199 ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
5199 if (!ir) { 5200 if (!ir) {
5200 ret = -ENOMEM; 5201 ret = -ENOMEM;
5201 goto out; 5202 goto out;
@@ -6163,8 +6164,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
6163{ 6164{
6164 if (avic_handle_apic_id_update(vcpu) != 0) 6165 if (avic_handle_apic_id_update(vcpu) != 0)
6165 return; 6166 return;
6166 if (avic_handle_dfr_update(vcpu) != 0) 6167 avic_handle_dfr_update(vcpu);
6167 return;
6168 avic_handle_ldr_update(vcpu); 6168 avic_handle_ldr_update(vcpu);
6169} 6169}
6170 6170
@@ -6311,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
6311 if (ret) 6311 if (ret)
6312 return ret; 6312 return ret;
6313 6313
6314 data = kzalloc(sizeof(*data), GFP_KERNEL); 6314 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6315 if (!data) 6315 if (!data)
6316 return -ENOMEM; 6316 return -ENOMEM;
6317 6317
@@ -6361,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
6361 if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params))) 6361 if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
6362 return -EFAULT; 6362 return -EFAULT;
6363 6363
6364 start = kzalloc(sizeof(*start), GFP_KERNEL); 6364 start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
6365 if (!start) 6365 if (!start)
6366 return -ENOMEM; 6366 return -ENOMEM;
6367 6367
@@ -6458,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
6458 if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params))) 6458 if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
6459 return -EFAULT; 6459 return -EFAULT;
6460 6460
6461 data = kzalloc(sizeof(*data), GFP_KERNEL); 6461 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6462 if (!data) 6462 if (!data)
6463 return -ENOMEM; 6463 return -ENOMEM;
6464 6464
@@ -6535,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
6535 if (copy_from_user(&params, measure, sizeof(params))) 6535 if (copy_from_user(&params, measure, sizeof(params)))
6536 return -EFAULT; 6536 return -EFAULT;
6537 6537
6538 data = kzalloc(sizeof(*data), GFP_KERNEL); 6538 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6539 if (!data) 6539 if (!data)
6540 return -ENOMEM; 6540 return -ENOMEM;
6541 6541
@@ -6597,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
6597 if (!sev_guest(kvm)) 6597 if (!sev_guest(kvm))
6598 return -ENOTTY; 6598 return -ENOTTY;
6599 6599
6600 data = kzalloc(sizeof(*data), GFP_KERNEL); 6600 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6601 if (!data) 6601 if (!data)
6602 return -ENOMEM; 6602 return -ENOMEM;
6603 6603
@@ -6618,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
6618 if (!sev_guest(kvm)) 6618 if (!sev_guest(kvm))
6619 return -ENOTTY; 6619 return -ENOTTY;
6620 6620
6621 data = kzalloc(sizeof(*data), GFP_KERNEL); 6621 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6622 if (!data) 6622 if (!data)
6623 return -ENOMEM; 6623 return -ENOMEM;
6624 6624
@@ -6646,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
6646 struct sev_data_dbg *data; 6646 struct sev_data_dbg *data;
6647 int ret; 6647 int ret;
6648 6648
6649 data = kzalloc(sizeof(*data), GFP_KERNEL); 6649 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6650 if (!data) 6650 if (!data)
6651 return -ENOMEM; 6651 return -ENOMEM;
6652 6652
@@ -6901,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
6901 } 6901 }
6902 6902
6903 ret = -ENOMEM; 6903 ret = -ENOMEM;
6904 data = kzalloc(sizeof(*data), GFP_KERNEL); 6904 data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6905 if (!data) 6905 if (!data)
6906 goto e_unpin_memory; 6906 goto e_unpin_memory;
6907 6907
@@ -7007,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm,
7007 if (range->addr > ULONG_MAX || range->size > ULONG_MAX) 7007 if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
7008 return -EINVAL; 7008 return -EINVAL;
7009 7009
7010 region = kzalloc(sizeof(*region), GFP_KERNEL); 7010 region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
7011 if (!region) 7011 if (!region)
7012 return -ENOMEM; 7012 return -ENOMEM;
7013 7013
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d737a51a53ca..f24a2c225070 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
211 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 211 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
212 return; 212 return;
213 213
214 hrtimer_cancel(&vmx->nested.preemption_timer);
215 vmx->nested.vmxon = false; 214 vmx->nested.vmxon = false;
216 vmx->nested.smm.vmxon = false; 215 vmx->nested.smm.vmxon = false;
217 free_vpid(vmx->nested.vpid02); 216 free_vpid(vmx->nested.vpid02);
@@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
274void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 273void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
275{ 274{
276 vcpu_load(vcpu); 275 vcpu_load(vcpu);
276 vmx_leave_nested(vcpu);
277 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); 277 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
278 free_nested(vcpu); 278 free_nested(vcpu);
279 vcpu_put(vcpu); 279 vcpu_put(vcpu);
@@ -1980,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1980 prepare_vmcs02_early_full(vmx, vmcs12); 1980 prepare_vmcs02_early_full(vmx, vmcs12);
1981 1981
1982 /* 1982 /*
1983 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
1984 * entry, but only if the current (host) sp changed from the value
1985 * we wrote last (vmx->host_rsp). This cache is no longer relevant
1986 * if we switch vmcs, and rather than hold a separate cache per vmcs,
1987 * here we just force the write to happen on entry. host_rsp will
1988 * also be written unconditionally by nested_vmx_check_vmentry_hw()
1989 * if we are doing early consistency checks via hardware.
1990 */
1991 vmx->host_rsp = 0;
1992
1993 /*
1994 * PIN CONTROLS 1983 * PIN CONTROLS
1995 */ 1984 */
1996 exec_control = vmcs12->pin_based_vm_exec_control; 1985 exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2289,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2289 } 2278 }
2290 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2279 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2291 2280
2292 vmx->nested.preemption_timer_expired = false;
2293 if (nested_cpu_has_preemption_timer(vmcs12))
2294 vmx_start_preemption_timer(vcpu);
2295
2296 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2281 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2297 * bitwise-or of what L1 wants to trap for L2, and what we want to 2282 * bitwise-or of what L1 wants to trap for L2, and what we want to
2298 * trap. Note that CR0.TS also needs updating - we do this later. 2283 * trap. Note that CR0.TS also needs updating - we do this later.
@@ -2722,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2722{ 2707{
2723 struct vcpu_vmx *vmx = to_vmx(vcpu); 2708 struct vcpu_vmx *vmx = to_vmx(vcpu);
2724 unsigned long cr3, cr4; 2709 unsigned long cr3, cr4;
2710 bool vm_fail;
2725 2711
2726 if (!nested_early_check) 2712 if (!nested_early_check)
2727 return 0; 2713 return 0;
@@ -2755,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2755 vmx->loaded_vmcs->host_state.cr4 = cr4; 2741 vmx->loaded_vmcs->host_state.cr4 = cr4;
2756 } 2742 }
2757 2743
2758 vmx->__launched = vmx->loaded_vmcs->launched;
2759
2760 asm( 2744 asm(
2761 /* Set HOST_RSP */
2762 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ 2745 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2763 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" 2746 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2764 "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" 2747 "je 1f \n\t"
2748 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2749 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2750 "1: \n\t"
2765 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ 2751 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2766 2752
2767 /* Check if vmlaunch or vmresume is needed */ 2753 /* Check if vmlaunch or vmresume is needed */
2768 "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" 2754 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2769 2755
2756 /*
2757 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2758 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2759 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
2760 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2761 */
2770 "call vmx_vmenter\n\t" 2762 "call vmx_vmenter\n\t"
2771 2763
2772 /* Set vmx->fail accordingly */ 2764 CC_SET(be)
2773 "setbe %c[fail](%% " _ASM_CX")\n\t" 2765 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
2774 : ASM_CALL_CONSTRAINT 2766 : [HOST_RSP]"r"((unsigned long)HOST_RSP),
2775 : "c"(vmx), "d"((unsigned long)HOST_RSP), 2767 [loaded_vmcs]"r"(vmx->loaded_vmcs),
2776 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 2768 [launched]"i"(offsetof(struct loaded_vmcs, launched)),
2777 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 2769 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2778 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
2779 [wordsize]"i"(sizeof(ulong)) 2770 [wordsize]"i"(sizeof(ulong))
2780 : "rax", "cc", "memory" 2771 : "cc", "memory"
2781 ); 2772 );
2782 2773
2783 preempt_enable(); 2774 preempt_enable();
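The early consistency-check path drops vmx->host_rsp and vmx->fail: HOST_RSP is now cached per loaded_vmcs in host_state.rsp so the VMWRITE is skipped when the stack pointer has not moved, and the VM-Fail outcome is taken straight from RFLAGS via CC_SET/CC_OUT into the local vm_fail. A small sketch of the write-only-on-change caching, with a stub standing in for the real VMWRITE of HOST_RSP:

#include <stdio.h>

struct loaded_vmcs_state {
	unsigned long host_rsp;		/* last value written to the VMCS field */
};

static void vmcs_write_host_rsp(unsigned long rsp)
{
	printf("VMWRITE HOST_RSP = %#lx\n", rsp);	/* stand-in for the real VMWRITE */
}

/* Only pay for a VMWRITE when the host stack pointer actually changed. */
static void update_host_rsp(struct loaded_vmcs_state *vmcs, unsigned long host_rsp)
{
	if (host_rsp != vmcs->host_rsp) {
		vmcs->host_rsp = host_rsp;
		vmcs_write_host_rsp(host_rsp);
	}
}

int main(void)
{
	struct loaded_vmcs_state vmcs = { 0 };

	update_host_rsp(&vmcs, 0x7000);	/* first entry: writes */
	update_host_rsp(&vmcs, 0x7000);	/* same RSP: skipped */
	update_host_rsp(&vmcs, 0x6ff0);	/* stack moved: writes again */
	return 0;
}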
@@ -2787,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2787 if (vmx->msr_autoload.guest.nr) 2778 if (vmx->msr_autoload.guest.nr)
2788 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2779 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2789 2780
2790 if (vmx->fail) { 2781 if (vm_fail) {
2791 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 2782 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
2792 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2783 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2793 vmx->fail = 0;
2794 return 1; 2784 return 1;
2795 } 2785 }
2796 2786
@@ -2813,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2813 2803
2814 return 0; 2804 return 0;
2815} 2805}
2816STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
2817
2818 2806
2819static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 2807static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2820 struct vmcs12 *vmcs12); 2808 struct vmcs12 *vmcs12);
@@ -3031,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
3031 kvm_make_request(KVM_REQ_EVENT, vcpu); 3019 kvm_make_request(KVM_REQ_EVENT, vcpu);
3032 3020
3033 /* 3021 /*
3022 * Do not start the preemption timer hrtimer until after we know
3023 * we are successful, so that only nested_vmx_vmexit needs to cancel
3024 * the timer.
3025 */
3026 vmx->nested.preemption_timer_expired = false;
3027 if (nested_cpu_has_preemption_timer(vmcs12))
3028 vmx_start_preemption_timer(vcpu);
3029
3030 /*
3034 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3031 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3035 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3032 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3036 * returned as far as L1 is concerned. It will only return (and set 3033 * returned as far as L1 is concerned. It will only return (and set
@@ -3450,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3450 else 3447 else
3451 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3448 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3452 3449
3453 if (nested_cpu_has_preemption_timer(vmcs12)) { 3450 if (nested_cpu_has_preemption_timer(vmcs12) &&
3454 if (vmcs12->vm_exit_controls & 3451 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3455 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3456 vmcs12->vmx_preemption_timer_value = 3452 vmcs12->vmx_preemption_timer_value =
3457 vmx_get_preemption_timer_value(vcpu); 3453 vmx_get_preemption_timer_value(vcpu);
3458 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
3459 }
3460 3454
3461 /* 3455 /*
3462 * In some cases (usually, nested EPT), L2 is allowed to change its 3456 * In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3864,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3864 3858
3865 leave_guest_mode(vcpu); 3859 leave_guest_mode(vcpu);
3866 3860
3861 if (nested_cpu_has_preemption_timer(vmcs12))
3862 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
3863
3867 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3864 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3868 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3865 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3869 3866
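The preemption-timer hrtimer is now armed only once nested VM-entry is past the point of failure, so nested_vmx_vmexit() becomes the single place that has to cancel it (free_nested() and sync_vmcs12() no longer do). A trivial standalone sketch of that arm-late / cancel-in-one-place pairing, with stubs for the timer:

#include <stdbool.h>
#include <stdio.h>

static bool timer_armed;

static void start_timer(void)  { timer_armed = true;  puts("timer armed"); }
static void cancel_timer(void) { if (timer_armed) { timer_armed = false; puts("timer cancelled"); } }

/* Arm the timer only after every check that can fail has passed. */
static int vmenter(bool checks_pass)
{
	if (!checks_pass)
		return -1;	/* nothing armed, nothing to clean up */
	start_timer();
	return 0;
}

static void vmexit(void)
{
	cancel_timer();		/* the one and only cancellation point */
}

int main(void)
{
	if (vmenter(false))
		puts("entry failed, no timer to worry about");
	if (!vmenter(true))
		vmexit();
	return 0;
}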
@@ -3915,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3915 vmx_flush_tlb(vcpu, true); 3912 vmx_flush_tlb(vcpu, true);
3916 } 3913 }
3917 3914
3918 /* This is needed for same reason as it was needed in prepare_vmcs02 */
3919 vmx->host_rsp = 0;
3920
3921 /* Unpin physical memory we referred to in vmcs02 */ 3915 /* Unpin physical memory we referred to in vmcs02 */
3922 if (vmx->nested.apic_access_page) { 3916 if (vmx->nested.apic_access_page) {
3923 kvm_release_page_dirty(vmx->nested.apic_access_page); 3917 kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4035,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4035 /* Addr = segment_base + offset */ 4029 /* Addr = segment_base + offset */
4036 /* offset = base + [index * scale] + displacement */ 4030 /* offset = base + [index * scale] + displacement */
4037 off = exit_qualification; /* holds the displacement */ 4031 off = exit_qualification; /* holds the displacement */
4032 if (addr_size == 1)
4033 off = (gva_t)sign_extend64(off, 31);
4034 else if (addr_size == 0)
4035 off = (gva_t)sign_extend64(off, 15);
4038 if (base_is_valid) 4036 if (base_is_valid)
4039 off += kvm_register_read(vcpu, base_reg); 4037 off += kvm_register_read(vcpu, base_reg);
4040 if (index_is_valid) 4038 if (index_is_valid)
4041 off += kvm_register_read(vcpu, index_reg)<<scaling; 4039 off += kvm_register_read(vcpu, index_reg)<<scaling;
4042 vmx_get_segment(vcpu, &s, seg_reg); 4040 vmx_get_segment(vcpu, &s, seg_reg);
4043 *ret = s.base + off;
4044 4041
4042 /*
4043 * The effective address, i.e. @off, of a memory operand is truncated
4044 * based on the address size of the instruction. Note that this is
4045 * the *effective address*, i.e. the address prior to accounting for
4046 * the segment's base.
4047 */
4045 if (addr_size == 1) /* 32 bit */ 4048 if (addr_size == 1) /* 32 bit */
4046 *ret &= 0xffffffff; 4049 off &= 0xffffffff;
4050 else if (addr_size == 0) /* 16 bit */
4051 off &= 0xffff;
4047 4052
4048 /* Checks for #GP/#SS exceptions. */ 4053 /* Checks for #GP/#SS exceptions. */
4049 exn = false; 4054 exn = false;
4050 if (is_long_mode(vcpu)) { 4055 if (is_long_mode(vcpu)) {
4056 /*
4057 * The virtual/linear address is never truncated in 64-bit
4058 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4059 * address when using FS/GS with a non-zero base.
4060 */
4061 *ret = s.base + off;
4062
4051 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4063 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4052 * non-canonical form. This is the only check on the memory 4064 * non-canonical form. This is the only check on the memory
4053 * destination for long mode! 4065 * destination for long mode!
4054 */ 4066 */
4055 exn = is_noncanonical_address(*ret, vcpu); 4067 exn = is_noncanonical_address(*ret, vcpu);
4056 } else if (is_protmode(vcpu)) { 4068 } else {
4069 /*
4070 * When not in long mode, the virtual/linear address is
4071 * unconditionally truncated to 32 bits regardless of the
4072 * address size.
4073 */
4074 *ret = (s.base + off) & 0xffffffff;
4075
4057 /* Protected mode: apply checks for segment validity in the 4076 /* Protected mode: apply checks for segment validity in the
4058 * following order: 4077 * following order:
4059 * - segment type check (#GP(0) may be thrown) 4078 * - segment type check (#GP(0) may be thrown)
@@ -4077,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4077 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4096 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4078 */ 4097 */
4079 exn = (s.unusable != 0); 4098 exn = (s.unusable != 0);
4080 /* Protected mode: #GP(0)/#SS(0) if the memory 4099
4081 * operand is outside the segment limit. 4100 /*
4101 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4102 * outside the segment limit. All CPUs that support VMX ignore
4103 * limit checks for flat segments, i.e. segments with base==0,
4104 * limit==0xffffffff and of type expand-up data or code.
4082 */ 4105 */
4083 exn = exn || (off + sizeof(u64) > s.limit); 4106 if (!(s.base == 0 && s.limit == 0xffffffff &&
4107 ((s.type & 8) || !(s.type & 4))))
4108 exn = exn || (off + sizeof(u64) > s.limit);
4084 } 4109 }
4085 if (exn) { 4110 if (exn) {
4086 kvm_queue_exception_e(vcpu, 4111 kvm_queue_exception_e(vcpu,
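get_vmx_mem_address() now sign-extends the displacement and truncates the effective address to the instruction's address size before adding the segment base, truncates the resulting linear address to 32 bits outside long mode, and skips the segment-limit check for flat expand-up segments (base 0, limit 0xffffffff), which CPUs supporting VMX ignore. A standalone model of just the address arithmetic under those rules (segment and limit checks omitted); sign_extend64() is reimplemented here for the sketch:

#include <stdint.h>
#include <stdio.h>

static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;
	return ((int64_t)(value << shift)) >> shift;
}

/* addr_size: 0 = 16-bit, 1 = 32-bit, 2 = 64-bit (VMX exit-info encoding). */
static uint64_t vmx_mem_address(uint64_t disp, uint64_t base_reg, uint64_t index_reg,
				int scaling, int addr_size, uint64_t seg_base,
				int long_mode)
{
	uint64_t off = disp;

	if (addr_size == 1)
		off = (uint64_t)sign_extend64(off, 31);
	else if (addr_size == 0)
		off = (uint64_t)sign_extend64(off, 15);

	off += base_reg + (index_reg << scaling);

	/* The effective address is truncated to the address size... */
	if (addr_size == 1)
		off &= 0xffffffffull;
	else if (addr_size == 0)
		off &= 0xffffull;

	/* ...but the linear address is only truncated outside long mode. */
	if (long_mode)
		return seg_base + off;
	return (seg_base + off) & 0xffffffffull;
}

int main(void)
{
	/* 32-bit address size: a negative displacement wraps within 4 GiB. */
	printf("%#llx\n", (unsigned long long)
	       vmx_mem_address(0xfffffff0ull, 0x20, 0, 0, 1, 0x1000, 1));	/* 0x1010 */
	/* Outside long mode the linear address is clamped to 32 bits too. */
	printf("%#llx\n", (unsigned long long)
	       vmx_mem_address(0x2000, 0, 0, 0, 1, 0xfffff000ull, 0));		/* 0x1000 */
	return 0;
}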
@@ -4145,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4145 if (r < 0) 4170 if (r < 0)
4146 goto out_vmcs02; 4171 goto out_vmcs02;
4147 4172
4148 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); 4173 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4149 if (!vmx->nested.cached_vmcs12) 4174 if (!vmx->nested.cached_vmcs12)
4150 goto out_cached_vmcs12; 4175 goto out_cached_vmcs12;
4151 4176
4152 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); 4177 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4153 if (!vmx->nested.cached_shadow_vmcs12) 4178 if (!vmx->nested.cached_shadow_vmcs12)
4154 goto out_cached_shadow_vmcs12; 4179 goto out_cached_shadow_vmcs12;
4155 4180
@@ -5696,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5696 enable_shadow_vmcs = 0; 5721 enable_shadow_vmcs = 0;
5697 if (enable_shadow_vmcs) { 5722 if (enable_shadow_vmcs) {
5698 for (i = 0; i < VMX_BITMAP_NR; i++) { 5723 for (i = 0; i < VMX_BITMAP_NR; i++) {
5724 /*
5725 * The vmx_bitmap is not tied to a VM and so should
5726 * not be charged to a memcg.
5727 */
5699 vmx_bitmap[i] = (unsigned long *) 5728 vmx_bitmap[i] = (unsigned long *)
5700 __get_free_page(GFP_KERNEL); 5729 __get_free_page(GFP_KERNEL);
5701 if (!vmx_bitmap[i]) { 5730 if (!vmx_bitmap[i]) {
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6def3ba88e3b..cb6079f8a227 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -34,6 +34,7 @@ struct vmcs_host_state {
34 unsigned long cr4; /* May not match real cr4 */ 34 unsigned long cr4; /* May not match real cr4 */
35 unsigned long gs_base; 35 unsigned long gs_base;
36 unsigned long fs_base; 36 unsigned long fs_base;
37 unsigned long rsp;
37 38
38 u16 fs_sel, gs_sel, ldt_sel; 39 u16 fs_sel, gs_sel, ldt_sel;
39#ifdef CONFIG_X86_64 40#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bcef2c7e9bc4..7b272738c576 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,6 +1,30 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/asm.h> 3#include <asm/asm.h>
4#include <asm/bitsperlong.h>
5#include <asm/kvm_vcpu_regs.h>
6
7#define WORD_SIZE (BITS_PER_LONG / 8)
8
9#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
10#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
11#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
12#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
13/* Intentionally omit RSP as it's context switched by hardware */
14#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
15#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
16#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
17
18#ifdef CONFIG_X86_64
19#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
20#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
21#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
22#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
23#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
24#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
25#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
26#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
27#endif
4 28
5 .text 29 .text
6 30
@@ -55,3 +79,146 @@ ENDPROC(vmx_vmenter)
55ENTRY(vmx_vmexit) 79ENTRY(vmx_vmexit)
56 ret 80 ret
57ENDPROC(vmx_vmexit) 81ENDPROC(vmx_vmexit)
82
83/**
84 * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
85 * @vmx: struct vcpu_vmx *
86 * @regs: unsigned long * (to guest registers)
87 * @launched: %true if the VMCS has been launched
88 *
89 * Returns:
90 * 0 on VM-Exit, 1 on VM-Fail
91 */
92ENTRY(__vmx_vcpu_run)
93 push %_ASM_BP
94 mov %_ASM_SP, %_ASM_BP
95#ifdef CONFIG_X86_64
96 push %r15
97 push %r14
98 push %r13
99 push %r12
100#else
101 push %edi
102 push %esi
103#endif
104 push %_ASM_BX
105
106 /*
107 * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
108 * @regs is needed after VM-Exit to save the guest's register values.
109 */
110 push %_ASM_ARG2
111
112 /* Copy @launched to BL, _ASM_ARG3 is volatile. */
113 mov %_ASM_ARG3B, %bl
114
115 /* Adjust RSP to account for the CALL to vmx_vmenter(). */
116 lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
117 call vmx_update_host_rsp
118
119 /* Load @regs to RAX. */
120 mov (%_ASM_SP), %_ASM_AX
121
122 /* Check if vmlaunch or vmresume is needed */
123 cmpb $0, %bl
124
125 /* Load guest registers. Don't clobber flags. */
126 mov VCPU_RBX(%_ASM_AX), %_ASM_BX
127 mov VCPU_RCX(%_ASM_AX), %_ASM_CX
128 mov VCPU_RDX(%_ASM_AX), %_ASM_DX
129 mov VCPU_RSI(%_ASM_AX), %_ASM_SI
130 mov VCPU_RDI(%_ASM_AX), %_ASM_DI
131 mov VCPU_RBP(%_ASM_AX), %_ASM_BP
132#ifdef CONFIG_X86_64
133 mov VCPU_R8 (%_ASM_AX), %r8
134 mov VCPU_R9 (%_ASM_AX), %r9
135 mov VCPU_R10(%_ASM_AX), %r10
136 mov VCPU_R11(%_ASM_AX), %r11
137 mov VCPU_R12(%_ASM_AX), %r12
138 mov VCPU_R13(%_ASM_AX), %r13
139 mov VCPU_R14(%_ASM_AX), %r14
140 mov VCPU_R15(%_ASM_AX), %r15
141#endif
142 /* Load guest RAX. This kills the vmx_vcpu pointer! */
143 mov VCPU_RAX(%_ASM_AX), %_ASM_AX
144
145 /* Enter guest mode */
146 call vmx_vmenter
147
148 /* Jump on VM-Fail. */
149 jbe 2f
150
151 /* Temporarily save guest's RAX. */
152 push %_ASM_AX
153
154 /* Reload @regs to RAX. */
155 mov WORD_SIZE(%_ASM_SP), %_ASM_AX
156
157 /* Save all guest registers, including RAX from the stack */
158 __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
159 mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
160 mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
161 mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
162 mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
163 mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
164 mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
165#ifdef CONFIG_X86_64
166 mov %r8, VCPU_R8 (%_ASM_AX)
167 mov %r9, VCPU_R9 (%_ASM_AX)
168 mov %r10, VCPU_R10(%_ASM_AX)
169 mov %r11, VCPU_R11(%_ASM_AX)
170 mov %r12, VCPU_R12(%_ASM_AX)
171 mov %r13, VCPU_R13(%_ASM_AX)
172 mov %r14, VCPU_R14(%_ASM_AX)
173 mov %r15, VCPU_R15(%_ASM_AX)
174#endif
175
176 /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
177 xor %eax, %eax
178
179 /*
180 * Clear all general purpose registers except RSP and RAX to prevent
181 * speculative use of the guest's values, even those that are reloaded
182 * via the stack. In theory, an L1 cache miss when restoring registers
183 * could lead to speculative execution with the guest's values.
184 * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
185 * free. RSP and RAX are exempt as RSP is restored by hardware during
186 * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
187 */
1881: xor %ebx, %ebx
189 xor %ecx, %ecx
190 xor %edx, %edx
191 xor %esi, %esi
192 xor %edi, %edi
193 xor %ebp, %ebp
194#ifdef CONFIG_X86_64
195 xor %r8d, %r8d
196 xor %r9d, %r9d
197 xor %r10d, %r10d
198 xor %r11d, %r11d
199 xor %r12d, %r12d
200 xor %r13d, %r13d
201 xor %r14d, %r14d
202 xor %r15d, %r15d
203#endif
204
205 /* "POP" @regs. */
206 add $WORD_SIZE, %_ASM_SP
207 pop %_ASM_BX
208
209#ifdef CONFIG_X86_64
210 pop %r12
211 pop %r13
212 pop %r14
213 pop %r15
214#else
215 pop %esi
216 pop %edi
217#endif
218 pop %_ASM_BP
219 ret
220
221 /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2222: mov $1, %eax
223 jmp 1b
224ENDPROC(__vmx_vcpu_run)
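__vmx_vcpu_run() replaces the old inline-asm VM-entry blob: guest GPRs live in a flat array inside the vcpu, so each VCPU_* macro is simply __VCPU_REGS_<reg> * WORD_SIZE and the assembly addresses every slot relative to the array base held in RAX, then zeroes the registers it restored after VM-Exit so speculation cannot pick up stale guest values. A standalone C sketch of the offset scheme those macros encode; the enum ordering below is illustrative of the idea, while the kernel's __VCPU_REGS_* values follow the x86 register-encoding order:

#include <stdio.h>

#define WORD_SIZE	sizeof(unsigned long)

/* Illustrative register indices for the guest-register array. */
enum vcpu_regs {
	VCPU_REGS_RAX, VCPU_REGS_RCX, VCPU_REGS_RDX, VCPU_REGS_RBX,
	VCPU_REGS_RSP, VCPU_REGS_RBP, VCPU_REGS_RSI, VCPU_REGS_RDI,
	NR_VCPU_REGS,
};

int main(void)
{
	unsigned long regs[NR_VCPU_REGS] = { 0 };
	unsigned char *base = (unsigned char *)regs;

	regs[VCPU_REGS_RSI] = 0x1234;

	/*
	 * "mov VCPU_RSI(%_ASM_AX), %_ASM_SI" in the asm is exactly a load
	 * from base + index * WORD_SIZE, with the array base in RAX.
	 */
	unsigned long rsi = *(unsigned long *)(base + VCPU_REGS_RSI * WORD_SIZE);
	printf("RSI slot at offset %zu holds %#lx\n",
	       VCPU_REGS_RSI * WORD_SIZE, rsi);
	return 0;
}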
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 30a6bcd735ec..c73375e01ab8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
246 246
247 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 247 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
248 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 248 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
249 /*
250 * This allocation for vmx_l1d_flush_pages is not tied to a VM
251 * lifetime and so should not be charged to a memcg.
252 */
249 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 253 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
250 if (!page) 254 if (!page)
251 return -ENOMEM; 255 return -ENOMEM;
@@ -2387,13 +2391,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2387 return 0; 2391 return 0;
2388} 2392}
2389 2393
2390struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) 2394struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2391{ 2395{
2392 int node = cpu_to_node(cpu); 2396 int node = cpu_to_node(cpu);
2393 struct page *pages; 2397 struct page *pages;
2394 struct vmcs *vmcs; 2398 struct vmcs *vmcs;
2395 2399
2396 pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); 2400 pages = __alloc_pages_node(node, flags, vmcs_config.order);
2397 if (!pages) 2401 if (!pages)
2398 return NULL; 2402 return NULL;
2399 vmcs = page_address(pages); 2403 vmcs = page_address(pages);
@@ -2440,7 +2444,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2440 loaded_vmcs_init(loaded_vmcs); 2444 loaded_vmcs_init(loaded_vmcs);
2441 2445
2442 if (cpu_has_vmx_msr_bitmap()) { 2446 if (cpu_has_vmx_msr_bitmap()) {
2443 loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 2447 loaded_vmcs->msr_bitmap = (unsigned long *)
2448 __get_free_page(GFP_KERNEL_ACCOUNT);
2444 if (!loaded_vmcs->msr_bitmap) 2449 if (!loaded_vmcs->msr_bitmap)
2445 goto out_vmcs; 2450 goto out_vmcs;
2446 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2451 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
@@ -2481,7 +2486,7 @@ static __init int alloc_kvm_area(void)
2481 for_each_possible_cpu(cpu) { 2486 for_each_possible_cpu(cpu) {
2482 struct vmcs *vmcs; 2487 struct vmcs *vmcs;
2483 2488
2484 vmcs = alloc_vmcs_cpu(false, cpu); 2489 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2485 if (!vmcs) { 2490 if (!vmcs) {
2486 free_kvm_area(); 2491 free_kvm_area();
2487 return -ENOMEM; 2492 return -ENOMEM;
@@ -6360,150 +6365,15 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
6360 vmx->loaded_vmcs->hv_timer_armed = false; 6365 vmx->loaded_vmcs->hv_timer_armed = false;
6361} 6366}
6362 6367
6363static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 6368void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
6364{ 6369{
6365 unsigned long evmcs_rsp; 6370 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
6366 6371 vmx->loaded_vmcs->host_state.rsp = host_rsp;
6367 vmx->__launched = vmx->loaded_vmcs->launched; 6372 vmcs_writel(HOST_RSP, host_rsp);
6368 6373 }
6369 evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
6370 (unsigned long)&current_evmcs->host_rsp : 0;
6371
6372 if (static_branch_unlikely(&vmx_l1d_should_flush))
6373 vmx_l1d_flush(vcpu);
6374
6375 asm(
6376 /* Store host registers */
6377 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6378 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6379 "push %%" _ASM_CX " \n\t"
6380 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
6381 "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
6382 "je 1f \n\t"
6383 "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
6384 /* Avoid VMWRITE when Enlightened VMCS is in use */
6385 "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
6386 "jz 2f \n\t"
6387 "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
6388 "jmp 1f \n\t"
6389 "2: \n\t"
6390 __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
6391 "1: \n\t"
6392 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
6393
6394 /* Reload cr2 if changed */
6395 "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
6396 "mov %%cr2, %%" _ASM_DX " \n\t"
6397 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6398 "je 3f \n\t"
6399 "mov %%" _ASM_AX", %%cr2 \n\t"
6400 "3: \n\t"
6401 /* Check if vmlaunch or vmresume is needed */
6402 "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
6403 /* Load guest registers. Don't clobber flags. */
6404 "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
6405 "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
6406 "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
6407 "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
6408 "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
6409 "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
6410#ifdef CONFIG_X86_64
6411 "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
6412 "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
6413 "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
6414 "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
6415 "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
6416 "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
6417 "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
6418 "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
6419#endif
6420 /* Load guest RCX. This kills the vmx_vcpu pointer! */
6421 "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
6422
6423 /* Enter guest mode */
6424 "call vmx_vmenter\n\t"
6425
6426 /* Save guest's RCX to the stack placeholder (see above) */
6427 "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
6428
6429 /* Load host's RCX, i.e. the vmx_vcpu pointer */
6430 "pop %%" _ASM_CX " \n\t"
6431
6432 /* Set vmx->fail based on EFLAGS.{CF,ZF} */
6433 "setbe %c[fail](%%" _ASM_CX ")\n\t"
6434
6435 /* Save all guest registers, including RCX from the stack */
6436 "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
6437 "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
6438 __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
6439 "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
6440 "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
6441 "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
6442 "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
6443#ifdef CONFIG_X86_64
6444 "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
6445 "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
6446 "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
6447 "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
6448 "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
6449 "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
6450 "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
6451 "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
6452 /*
6453 * Clear host registers marked as clobbered to prevent
6454 * speculative use.
6455 */
6456 "xor %%r8d, %%r8d \n\t"
6457 "xor %%r9d, %%r9d \n\t"
6458 "xor %%r10d, %%r10d \n\t"
6459 "xor %%r11d, %%r11d \n\t"
6460 "xor %%r12d, %%r12d \n\t"
6461 "xor %%r13d, %%r13d \n\t"
6462 "xor %%r14d, %%r14d \n\t"
6463 "xor %%r15d, %%r15d \n\t"
6464#endif
6465 "mov %%cr2, %%" _ASM_AX " \n\t"
6466 "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
6467
6468 "xor %%eax, %%eax \n\t"
6469 "xor %%ebx, %%ebx \n\t"
6470 "xor %%esi, %%esi \n\t"
6471 "xor %%edi, %%edi \n\t"
6472 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
6473 : ASM_CALL_CONSTRAINT
6474 : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
6475 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6476 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
6477 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
6478 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
6479 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
6480 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
6481 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
6482 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
6483 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
6484 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
6485#ifdef CONFIG_X86_64
6486 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
6487 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
6488 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
6489 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
6490 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
6491 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
6492 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
6493 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
6494#endif
6495 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6496 [wordsize]"i"(sizeof(ulong))
6497 : "cc", "memory"
6498#ifdef CONFIG_X86_64
6499 , "rax", "rbx", "rdi"
6500 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6501#else
6502 , "eax", "ebx", "edi"
6503#endif
6504 );
6505} 6374}
6506STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); 6375
6376bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
6507 6377
6508static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 6378static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6509{ 6379{
@@ -6572,7 +6442,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6572 */ 6442 */
6573 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); 6443 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
6574 6444
6575 __vmx_vcpu_run(vcpu, vmx); 6445 if (static_branch_unlikely(&vmx_l1d_should_flush))
6446 vmx_l1d_flush(vcpu);
6447
6448 if (vcpu->arch.cr2 != read_cr2())
6449 write_cr2(vcpu->arch.cr2);
6450
6451 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
6452 vmx->loaded_vmcs->launched);
6453
6454 vcpu->arch.cr2 = read_cr2();
6576 6455
6577 /* 6456 /*
6578 * We do not use IBRS in the kernel. If this vCPU has used the 6457 * We do not use IBRS in the kernel. If this vCPU has used the
@@ -6657,7 +6536,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6657 6536
6658static struct kvm *vmx_vm_alloc(void) 6537static struct kvm *vmx_vm_alloc(void)
6659{ 6538{
6660 struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); 6539 struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
6540 GFP_KERNEL_ACCOUNT | __GFP_ZERO,
6541 PAGE_KERNEL);
6661 return &kvm_vmx->kvm; 6542 return &kvm_vmx->kvm;
6662} 6543}
6663 6544
@@ -6673,7 +6554,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6673 if (enable_pml) 6554 if (enable_pml)
6674 vmx_destroy_pml_buffer(vmx); 6555 vmx_destroy_pml_buffer(vmx);
6675 free_vpid(vmx->vpid); 6556 free_vpid(vmx->vpid);
6676 leave_guest_mode(vcpu);
6677 nested_vmx_free_vcpu(vcpu); 6557 nested_vmx_free_vcpu(vcpu);
6678 free_loaded_vmcs(vmx->loaded_vmcs); 6558 free_loaded_vmcs(vmx->loaded_vmcs);
6679 kfree(vmx->guest_msrs); 6559 kfree(vmx->guest_msrs);
@@ -6685,14 +6565,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6685static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6565static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6686{ 6566{
6687 int err; 6567 int err;
6688 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 6568 struct vcpu_vmx *vmx;
6689 unsigned long *msr_bitmap; 6569 unsigned long *msr_bitmap;
6690 int cpu; 6570 int cpu;
6691 6571
6572 vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
6692 if (!vmx) 6573 if (!vmx)
6693 return ERR_PTR(-ENOMEM); 6574 return ERR_PTR(-ENOMEM);
6694 6575
6695 vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); 6576 vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
6577 GFP_KERNEL_ACCOUNT);
6696 if (!vmx->vcpu.arch.guest_fpu) { 6578 if (!vmx->vcpu.arch.guest_fpu) {
6697 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 6579 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
6698 err = -ENOMEM; 6580 err = -ENOMEM;
@@ -6714,12 +6596,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6714 * for the guest, etc. 6596 * for the guest, etc.
6715 */ 6597 */
6716 if (enable_pml) { 6598 if (enable_pml) {
6717 vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); 6599 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
6718 if (!vmx->pml_pg) 6600 if (!vmx->pml_pg)
6719 goto uninit_vcpu; 6601 goto uninit_vcpu;
6720 } 6602 }
6721 6603
6722 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 6604 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
6723 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 6605 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
6724 > PAGE_SIZE); 6606 > PAGE_SIZE);
6725 6607
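
The GFP flag changes in this file all follow the rule spelled out in the comment added to vmx_setup_l1d_flush(): allocations whose lifetime is tied to a VM or vCPU are charged to the VM's memcg via GFP_KERNEL_ACCOUNT, while module-global allocations stay on plain GFP_KERNEL. A minimal sketch of that rule (the struct name is hypothetical):

	/* Per-vCPU state: lifetime bound to the VM, so charge its memcg. */
	struct my_vcpu_state *s = kzalloc(sizeof(*s), GFP_KERNEL_ACCOUNT);

	/* Module-global state (e.g. vmx_l1d_flush_pages): not charged. */
	struct page *flush_pages = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
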
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 0ac0a64c7790..1554cb45b393 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -175,7 +175,6 @@ struct nested_vmx {
175 175
176struct vcpu_vmx { 176struct vcpu_vmx {
177 struct kvm_vcpu vcpu; 177 struct kvm_vcpu vcpu;
178 unsigned long host_rsp;
179 u8 fail; 178 u8 fail;
180 u8 msr_bitmap_mode; 179 u8 msr_bitmap_mode;
181 u32 exit_intr_info; 180 u32 exit_intr_info;
@@ -209,7 +208,7 @@ struct vcpu_vmx {
209 struct loaded_vmcs vmcs01; 208 struct loaded_vmcs vmcs01;
210 struct loaded_vmcs *loaded_vmcs; 209 struct loaded_vmcs *loaded_vmcs;
211 struct loaded_vmcs *loaded_cpu_state; 210 struct loaded_vmcs *loaded_cpu_state;
212 bool __launched; /* temporary, used in vmx_vcpu_run */ 211
213 struct msr_autoload { 212 struct msr_autoload {
214 struct vmx_msrs guest; 213 struct vmx_msrs guest;
215 struct vmx_msrs host; 214 struct vmx_msrs host;
@@ -339,8 +338,8 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
339 338
340static inline void pi_set_sn(struct pi_desc *pi_desc) 339static inline void pi_set_sn(struct pi_desc *pi_desc)
341{ 340{
342 return set_bit(POSTED_INTR_SN, 341 set_bit(POSTED_INTR_SN,
343 (unsigned long *)&pi_desc->control); 342 (unsigned long *)&pi_desc->control);
344} 343}
345 344
346static inline void pi_set_on(struct pi_desc *pi_desc) 345static inline void pi_set_on(struct pi_desc *pi_desc)
@@ -445,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void)
445{ 444{
446 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 445 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
447 if (pt_mode == PT_MODE_SYSTEM) 446 if (pt_mode == PT_MODE_SYSTEM)
448 vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); 447 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
448 VM_ENTRY_LOAD_IA32_RTIT_CTL);
449 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 449 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
450 return vmentry_ctrl & 450 return vmentry_ctrl &
451 ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); 451 ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
@@ -455,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void)
455{ 455{
456 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 456 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
457 if (pt_mode == PT_MODE_SYSTEM) 457 if (pt_mode == PT_MODE_SYSTEM)
458 vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); 458 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
459 VM_EXIT_CLEAR_IA32_RTIT_CTL);
459 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 460 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
460 return vmcs_config.vmexit_ctrl & 461 return vmexit_ctrl &
461 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 462 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
462} 463}
463 464
@@ -478,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
478 return &(to_vmx(vcpu)->pi_desc); 479 return &(to_vmx(vcpu)->pi_desc);
479} 480}
480 481
481struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); 482struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
482void free_vmcs(struct vmcs *vmcs); 483void free_vmcs(struct vmcs *vmcs);
483int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); 484int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
484void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); 485void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -487,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
487 488
488static inline struct vmcs *alloc_vmcs(bool shadow) 489static inline struct vmcs *alloc_vmcs(bool shadow)
489{ 490{
490 return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); 491 return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
492 GFP_KERNEL_ACCOUNT);
491} 493}
492 494
493u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); 495u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 941f932373d0..65e4559eef2f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3879 r = -EINVAL; 3879 r = -EINVAL;
3880 if (!lapic_in_kernel(vcpu)) 3880 if (!lapic_in_kernel(vcpu))
3881 goto out; 3881 goto out;
3882 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 3882 u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
3883 GFP_KERNEL_ACCOUNT);
3883 3884
3884 r = -ENOMEM; 3885 r = -ENOMEM;
3885 if (!u.lapic) 3886 if (!u.lapic)
@@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
4066 break; 4067 break;
4067 } 4068 }
4068 case KVM_GET_XSAVE: { 4069 case KVM_GET_XSAVE: {
4069 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 4070 u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
4070 r = -ENOMEM; 4071 r = -ENOMEM;
4071 if (!u.xsave) 4072 if (!u.xsave)
4072 break; 4073 break;
@@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
4090 break; 4091 break;
4091 } 4092 }
4092 case KVM_GET_XCRS: { 4093 case KVM_GET_XCRS: {
4093 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 4094 u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
4094 r = -ENOMEM; 4095 r = -ENOMEM;
4095 if (!u.xcrs) 4096 if (!u.xcrs)
4096 break; 4097 break;
@@ -7055,6 +7056,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
7055 7056
7056void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) 7057void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
7057{ 7058{
7059 if (!lapic_in_kernel(vcpu)) {
7060 WARN_ON_ONCE(vcpu->arch.apicv_active);
7061 return;
7062 }
7063 if (!vcpu->arch.apicv_active)
7064 return;
7065
7058 vcpu->arch.apicv_active = false; 7066 vcpu->arch.apicv_active = false;
7059 kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); 7067 kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
7060} 7068}
@@ -9005,7 +9013,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
9005 struct page *page; 9013 struct page *page;
9006 int r; 9014 int r;
9007 9015
9008 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
9009 vcpu->arch.emulate_ctxt.ops = &emulate_ops; 9016 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
9010 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) 9017 if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
9011 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 9018 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -9026,6 +9033,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
9026 goto fail_free_pio_data; 9033 goto fail_free_pio_data;
9027 9034
9028 if (irqchip_in_kernel(vcpu->kvm)) { 9035 if (irqchip_in_kernel(vcpu->kvm)) {
9036 vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
9029 r = kvm_create_lapic(vcpu); 9037 r = kvm_create_lapic(vcpu);
9030 if (r < 0) 9038 if (r < 0)
9031 goto fail_mmu_destroy; 9039 goto fail_mmu_destroy;
@@ -9033,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
9033 static_key_slow_inc(&kvm_no_apic_vcpu); 9041 static_key_slow_inc(&kvm_no_apic_vcpu);
9034 9042
9035 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 9043 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
9036 GFP_KERNEL); 9044 GFP_KERNEL_ACCOUNT);
9037 if (!vcpu->arch.mce_banks) { 9045 if (!vcpu->arch.mce_banks) {
9038 r = -ENOMEM; 9046 r = -ENOMEM;
9039 goto fail_free_lapic; 9047 goto fail_free_lapic;
9040 } 9048 }
9041 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 9049 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
9042 9050
9043 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { 9051 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
9052 GFP_KERNEL_ACCOUNT)) {
9044 r = -ENOMEM; 9053 r = -ENOMEM;
9045 goto fail_free_mce_banks; 9054 goto fail_free_mce_banks;
9046 } 9055 }
@@ -9104,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
9104 9113
9105 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); 9114 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
9106 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 9115 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
9107 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
9108 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 9116 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
9109 atomic_set(&kvm->arch.noncoherent_dma_count, 0); 9117 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
9110 9118
@@ -9299,13 +9307,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
9299 9307
9300 slot->arch.rmap[i] = 9308 slot->arch.rmap[i] =
9301 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), 9309 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
9302 GFP_KERNEL); 9310 GFP_KERNEL_ACCOUNT);
9303 if (!slot->arch.rmap[i]) 9311 if (!slot->arch.rmap[i])
9304 goto out_free; 9312 goto out_free;
9305 if (i == 0) 9313 if (i == 0)
9306 continue; 9314 continue;
9307 9315
9308 linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); 9316 linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
9309 if (!linfo) 9317 if (!linfo)
9310 goto out_free; 9318 goto out_free;
9311 9319
@@ -9348,13 +9356,13 @@ out_free:
9348 return -ENOMEM; 9356 return -ENOMEM;
9349} 9357}
9350 9358
9351void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) 9359void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
9352{ 9360{
9353 /* 9361 /*
9354 * memslots->generation has been incremented. 9362 * memslots->generation has been incremented.
9355 * mmio generation may have reached its maximum value. 9363 * mmio generation may have reached its maximum value.
9356 */ 9364 */
9357 kvm_mmu_invalidate_mmio_sptes(kvm, slots); 9365 kvm_mmu_invalidate_mmio_sptes(kvm, gen);
9358} 9366}
9359 9367
9360int kvm_arch_prepare_memory_region(struct kvm *kvm, 9368int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9462,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
9462 9470
9463void kvm_arch_flush_shadow_all(struct kvm *kvm) 9471void kvm_arch_flush_shadow_all(struct kvm *kvm)
9464{ 9472{
9465 kvm_mmu_invalidate_zap_all_pages(kvm); 9473 kvm_mmu_zap_all(kvm);
9466} 9474}
9467 9475
9468void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 9476void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 224cd0a47568..28406aa1136d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la,
181static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, 181static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
182 gva_t gva, gfn_t gfn, unsigned access) 182 gva_t gva, gfn_t gfn, unsigned access)
183{ 183{
184 u64 gen = kvm_memslots(vcpu->kvm)->generation;
185
186 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
187 return;
188
184 /* 189 /*
185 * If this is a shadow nested page table, the "GVA" is 190 * If this is a shadow nested page table, the "GVA" is
186 * actually a nGPA. 191 * actually a nGPA.
@@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
188 vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; 193 vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
189 vcpu->arch.access = access; 194 vcpu->arch.access = access;
190 vcpu->arch.mmio_gfn = gfn; 195 vcpu->arch.mmio_gfn = gfn;
191 vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; 196 vcpu->arch.mmio_gen = gen;
192} 197}
193 198
194static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) 199static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index a8b20b65bd4b..aa4ec53281ce 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1261,6 +1261,13 @@ static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void)
1261 return ARCH_TIMER_PHYS_SECURE_PPI; 1261 return ARCH_TIMER_PHYS_SECURE_PPI;
1262} 1262}
1263 1263
1264static void __init arch_timer_populate_kvm_info(void)
1265{
1266 arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
1267 if (is_kernel_in_hyp_mode())
1268 arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
1269}
1270
1264static int __init arch_timer_of_init(struct device_node *np) 1271static int __init arch_timer_of_init(struct device_node *np)
1265{ 1272{
1266 int i, ret; 1273 int i, ret;
@@ -1275,7 +1282,7 @@ static int __init arch_timer_of_init(struct device_node *np)
1275 for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) 1282 for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++)
1276 arch_timer_ppi[i] = irq_of_parse_and_map(np, i); 1283 arch_timer_ppi[i] = irq_of_parse_and_map(np, i);
1277 1284
1278 arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; 1285 arch_timer_populate_kvm_info();
1279 1286
1280 rate = arch_timer_get_cntfrq(); 1287 rate = arch_timer_get_cntfrq();
1281 arch_timer_of_configure_rate(rate, np); 1288 arch_timer_of_configure_rate(rate, np);
@@ -1605,7 +1612,7 @@ static int __init arch_timer_acpi_init(struct acpi_table_header *table)
1605 arch_timer_ppi[ARCH_TIMER_HYP_PPI] = 1612 arch_timer_ppi[ARCH_TIMER_HYP_PPI] =
1606 acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI); 1613 acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI);
1607 1614
1608 arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI]; 1615 arch_timer_populate_kvm_info();
1609 1616
1610 /* 1617 /*
1611 * When probing via ACPI, we have no mechanism to override the sysreg 1618 * When probing via ACPI, we have no mechanism to override the sysreg
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index a0baee25134c..4159c63a5fd2 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1382,3 +1382,40 @@ int chsc_pnso_brinfo(struct subchannel_id schid,
1382 return chsc_error_from_response(brinfo_area->response.code); 1382 return chsc_error_from_response(brinfo_area->response.code);
1383} 1383}
1384EXPORT_SYMBOL_GPL(chsc_pnso_brinfo); 1384EXPORT_SYMBOL_GPL(chsc_pnso_brinfo);
1385
1386int chsc_sgib(u32 origin)
1387{
1388 struct {
1389 struct chsc_header request;
1390 u16 op;
1391 u8 reserved01[2];
1392 u8 reserved02:4;
1393 u8 fmt:4;
1394 u8 reserved03[7];
1395 /* operation data area begin */
1396 u8 reserved04[4];
1397 u32 gib_origin;
1398 u8 reserved05[10];
1399 u8 aix;
1400 u8 reserved06[4029];
1401 struct chsc_header response;
1402 u8 reserved07[4];
1403 } *sgib_area;
1404 int ret;
1405
1406 spin_lock_irq(&chsc_page_lock);
1407 memset(chsc_page, 0, PAGE_SIZE);
1408 sgib_area = chsc_page;
1409 sgib_area->request.length = 0x0fe0;
1410 sgib_area->request.code = 0x0021;
1411 sgib_area->op = 0x1;
1412 sgib_area->gib_origin = origin;
1413
1414 ret = chsc(sgib_area);
1415 if (ret == 0)
1416 ret = chsc_error_from_response(sgib_area->response.code);
1417 spin_unlock_irq(&chsc_page_lock);
1418
1419 return ret;
1420}
1421EXPORT_SYMBOL_GPL(chsc_sgib);
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index 78aba8d94eec..e57d68e325a3 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -164,6 +164,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp);
164int chsc_ssqd(struct subchannel_id schid, struct chsc_ssqd_area *ssqd); 164int chsc_ssqd(struct subchannel_id schid, struct chsc_ssqd_area *ssqd);
165int chsc_sadc(struct subchannel_id schid, struct chsc_scssc_area *scssc, 165int chsc_sadc(struct subchannel_id schid, struct chsc_scssc_area *scssc,
166 u64 summary_indicator_addr, u64 subchannel_indicator_addr); 166 u64 summary_indicator_addr, u64 subchannel_indicator_addr);
167int chsc_sgib(u32 origin);
167int chsc_error_from_response(int response); 168int chsc_error_from_response(int response);
168 169
169int chsc_siosl(struct subchannel_id schid); 170int chsc_siosl(struct subchannel_id schid);
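
chsc_sgib() issues the Set Guest Information Block operation (request code 0x0021) for the GIB introduced elsewhere in this series. A hypothetical caller, assuming 'gib' is a suitably aligned block whose address fits the u32 gib_origin field, might look like:

	u32 origin = (u32)(unsigned long)gib;

	if (chsc_sgib(origin))
		pr_err("could not register the guest information block\n");
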
diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h
index 349e5957c949..702967d996bb 100644
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -74,6 +74,7 @@ enum arch_timer_spi_nr {
74struct arch_timer_kvm_info { 74struct arch_timer_kvm_info {
75 struct timecounter timecounter; 75 struct timecounter timecounter;
76 int virtual_irq; 76 int virtual_irq;
77 int physical_irq;
77}; 78};
78 79
79struct arch_timer_mem_frame { 80struct arch_timer_mem_frame {
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 33771352dcd6..05a18dd265b5 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -22,7 +22,22 @@
22#include <linux/clocksource.h> 22#include <linux/clocksource.h>
23#include <linux/hrtimer.h> 23#include <linux/hrtimer.h>
24 24
25enum kvm_arch_timers {
26 TIMER_PTIMER,
27 TIMER_VTIMER,
28 NR_KVM_TIMERS
29};
30
31enum kvm_arch_timer_regs {
32 TIMER_REG_CNT,
33 TIMER_REG_CVAL,
34 TIMER_REG_TVAL,
35 TIMER_REG_CTL,
36};
37
25struct arch_timer_context { 38struct arch_timer_context {
39 struct kvm_vcpu *vcpu;
40
26 /* Registers: control register, timer value */ 41 /* Registers: control register, timer value */
27 u32 cnt_ctl; 42 u32 cnt_ctl;
28 u64 cnt_cval; 43 u64 cnt_cval;
@@ -30,30 +45,36 @@ struct arch_timer_context {
30 /* Timer IRQ */ 45 /* Timer IRQ */
31 struct kvm_irq_level irq; 46 struct kvm_irq_level irq;
32 47
48 /* Virtual offset */
49 u64 cntvoff;
50
51 /* Emulated Timer (may be unused) */
52 struct hrtimer hrtimer;
53
33 /* 54 /*
34 * We have multiple paths which can save/restore the timer state 55 * We have multiple paths which can save/restore the timer state onto
35 * onto the hardware, so we need some way of keeping track of 56 * the hardware, so we need some way of keeping track of where the
36 * where the latest state is. 57 * latest state is.
37 *
38 * loaded == true: State is loaded on the hardware registers.
39 * loaded == false: State is stored in memory.
40 */ 58 */
41 bool loaded; 59 bool loaded;
42 60
43 /* Virtual offset */ 61 /* Duplicated state from arch_timer.c for convenience */
44 u64 cntvoff; 62 u32 host_timer_irq;
63 u32 host_timer_irq_flags;
64};
65
66struct timer_map {
67 struct arch_timer_context *direct_vtimer;
68 struct arch_timer_context *direct_ptimer;
69 struct arch_timer_context *emul_ptimer;
45}; 70};
46 71
47struct arch_timer_cpu { 72struct arch_timer_cpu {
48 struct arch_timer_context vtimer; 73 struct arch_timer_context timers[NR_KVM_TIMERS];
49 struct arch_timer_context ptimer;
50 74
51 /* Background timer used when the guest is not running */ 75 /* Background timer used when the guest is not running */
52 struct hrtimer bg_timer; 76 struct hrtimer bg_timer;
53 77
54 /* Physical timer emulation */
55 struct hrtimer phys_timer;
56
57 /* Is the timer enabled */ 78 /* Is the timer enabled */
58 bool enabled; 79 bool enabled;
59}; 80};
@@ -76,9 +97,6 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
76 97
77bool kvm_timer_is_pending(struct kvm_vcpu *vcpu); 98bool kvm_timer_is_pending(struct kvm_vcpu *vcpu);
78 99
79void kvm_timer_schedule(struct kvm_vcpu *vcpu);
80void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
81
82u64 kvm_phys_timer_read(void); 100u64 kvm_phys_timer_read(void);
83 101
84void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); 102void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu);
@@ -88,7 +106,19 @@ void kvm_timer_init_vhe(void);
88 106
89bool kvm_arch_timer_get_input_level(int vintid); 107bool kvm_arch_timer_get_input_level(int vintid);
90 108
91#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.vtimer) 109#define vcpu_timer(v) (&(v)->arch.timer_cpu)
92#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.ptimer) 110#define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)])
111#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
112#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
113
114#define arch_timer_ctx_index(ctx) ((ctx) - vcpu_timer((ctx)->vcpu)->timers)
115
116u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
117 enum kvm_arch_timers tmr,
118 enum kvm_arch_timer_regs treg);
119void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
120 enum kvm_arch_timers tmr,
121 enum kvm_arch_timer_regs treg,
122 u64 val);
93 123
94#endif 124#endif
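
The reworked header replaces the fixed vtimer/ptimer fields with an indexed timers[] array plus generic register accessors. An illustrative (hypothetical) helper reading the physical timer through the new API:

	static void dump_ptimer_sketch(struct kvm_vcpu *vcpu)
	{
		struct arch_timer_context *ptimer = vcpu_get_timer(vcpu, TIMER_PTIMER);
		u64 cval = kvm_arm_timer_read_sysreg(vcpu, TIMER_PTIMER, TIMER_REG_CVAL);

		pr_info("ptimer (index %ld) cval=%llx irq level=%u\n",
			(long)arch_timer_ctx_index(ptimer),
			(unsigned long long)cval, ptimer->irq.level);
	}
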
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c38cc5eb7e73..9d55c63db09b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -48,6 +48,27 @@
48 */ 48 */
49#define KVM_MEMSLOT_INVALID (1UL << 16) 49#define KVM_MEMSLOT_INVALID (1UL << 16)
50 50
51/*
52 * Bit 63 of the memslot generation number is an "update in-progress flag",
53 * e.g. is temporarily set for the duration of install_new_memslots().
54 * This flag effectively creates a unique generation number that is used to
55 * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
56 * i.e. may (or may not) have come from the previous memslots generation.
57 *
58 * This is necessary because the actual memslots update is not atomic with
59 * respect to the generation number update. Updating the generation number
60 * first would allow a vCPU to cache a spte from the old memslots using the
61 * new generation number, and updating the generation number after switching
62 * to the new memslots would allow cache hits using the old generation number
63 * to reference the defunct memslots.
64 *
65 * This mechanism is used to prevent getting hits in KVM's caches while a
66 * memslot update is in-progress, and to prevent cache hits *after* updating
67 * the actual generation number against accesses that were inserted into the
68 * cache *before* the memslots were updated.
69 */
70#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS BIT_ULL(63)
71
51/* Two fragments for cross MMIO pages. */ 72/* Two fragments for cross MMIO pages. */
52#define KVM_MAX_MMIO_FRAGMENTS 2 73#define KVM_MAX_MMIO_FRAGMENTS 2
53 74
@@ -634,7 +655,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
634 struct kvm_memory_slot *dont); 655 struct kvm_memory_slot *dont);
635int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, 656int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
636 unsigned long npages); 657 unsigned long npages);
637void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots); 658void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
638int kvm_arch_prepare_memory_region(struct kvm *kvm, 659int kvm_arch_prepare_memory_region(struct kvm *kvm,
639 struct kvm_memory_slot *memslot, 660 struct kvm_memory_slot *memslot,
640 const struct kvm_userspace_memory_region *mem, 661 const struct kvm_userspace_memory_region *mem,
@@ -1182,6 +1203,7 @@ extern bool kvm_rebooting;
1182 1203
1183extern unsigned int halt_poll_ns; 1204extern unsigned int halt_poll_ns;
1184extern unsigned int halt_poll_ns_grow; 1205extern unsigned int halt_poll_ns_grow;
1206extern unsigned int halt_poll_ns_grow_start;
1185extern unsigned int halt_poll_ns_shrink; 1207extern unsigned int halt_poll_ns_shrink;
1186 1208
1187struct kvm_device { 1209struct kvm_device {
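
The new bit-63 flag is consumed by callers such as the vcpu_cache_mmio_info() hunk earlier in this patch, which refuses to populate the MMIO cache while an update is pending. A minimal sketch of the pattern, using a hypothetical helper name:

	static inline bool memslot_update_in_progress(struct kvm *kvm)
	{
		u64 gen = kvm_memslots(kvm)->generation;

		/* Don't cache anything derived from memslots mid-update. */
		return gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
	}
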
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 6210ba41c29e..2689d1ea6d7a 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -3,6 +3,7 @@
3/x86_64/platform_info_test 3/x86_64/platform_info_test
4/x86_64/set_sregs_test 4/x86_64/set_sregs_test
5/x86_64/sync_regs_test 5/x86_64/sync_regs_test
6/x86_64/vmx_close_while_nested_test
6/x86_64/vmx_tsc_adjust_test 7/x86_64/vmx_tsc_adjust_test
7/x86_64/state_test 8/x86_64/state_test
8/dirty_log_test 9/dirty_log_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index f9a0e9938480..3c1f4bdf9000 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -16,6 +16,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
16TEST_GEN_PROGS_x86_64 += x86_64/state_test 16TEST_GEN_PROGS_x86_64 += x86_64/state_test
17TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test 17TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
18TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid 18TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
19TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
19TEST_GEN_PROGS_x86_64 += dirty_log_test 20TEST_GEN_PROGS_x86_64 += dirty_log_test
20TEST_GEN_PROGS_x86_64 += clear_dirty_log_test 21TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
21 22
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
new file mode 100644
index 000000000000..6edec6fd790b
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
@@ -0,0 +1,95 @@
1/*
2 * vmx_close_while_nested
3 *
4 * Copyright (C) 2019, Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2.
7 *
8 * Verify that nothing bad happens if a KVM user exits with open
9 * file descriptors while executing a nested guest.
10 */
11
12#include "test_util.h"
13#include "kvm_util.h"
14#include "processor.h"
15#include "vmx.h"
16
17#include <string.h>
18#include <sys/ioctl.h>
19
20#include "kselftest.h"
21
22#define VCPU_ID 5
23
24enum {
25 PORT_L0_EXIT = 0x2000,
26};
27
28/* The virtual machine object. */
29static struct kvm_vm *vm;
30
31static void l2_guest_code(void)
32{
33 /* Exit to L0 */
34 asm volatile("inb %%dx, %%al"
35 : : [port] "d" (PORT_L0_EXIT) : "rax");
36}
37
38static void l1_guest_code(struct vmx_pages *vmx_pages)
39{
40#define L2_GUEST_STACK_SIZE 64
41 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
42 uint32_t control;
43 uintptr_t save_cr3;
44
45 GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
46 GUEST_ASSERT(load_vmcs(vmx_pages));
47
48 /* Prepare the VMCS for L2 execution. */
49 prepare_vmcs(vmx_pages, l2_guest_code,
50 &l2_guest_stack[L2_GUEST_STACK_SIZE]);
51
52 GUEST_ASSERT(!vmlaunch());
53 GUEST_ASSERT(0);
54}
55
56int main(int argc, char *argv[])
57{
58 struct vmx_pages *vmx_pages;
59 vm_vaddr_t vmx_pages_gva;
60 struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
61
62 if (!(entry->ecx & CPUID_VMX)) {
63 fprintf(stderr, "nested VMX not enabled, skipping test\n");
64 exit(KSFT_SKIP);
65 }
66
67 vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
68 vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
69
70 /* Allocate VMX pages and shared descriptors (vmx_pages). */
71 vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva);
72 vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
73
74 for (;;) {
75 volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
76 struct ucall uc;
77
78 vcpu_run(vm, VCPU_ID);
79 TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
80 "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
81 run->exit_reason,
82 exit_reason_str(run->exit_reason));
83
84 if (run->io.port == PORT_L0_EXIT)
85 break;
86
87 switch (get_ucall(vm, VCPU_ID, &uc)) {
88 case UCALL_ABORT:
89 TEST_ASSERT(false, "%s", (const char *)uc.args[0]);
90 /* NOT REACHED */
91 default:
92 TEST_ASSERT(false, "Unknown ucall 0x%x.", uc.cmd);
93 }
94 }
95}
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index b07ac4614e1c..3417f2dbc366 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -25,6 +25,7 @@
25 25
26#include <clocksource/arm_arch_timer.h> 26#include <clocksource/arm_arch_timer.h>
27#include <asm/arch_timer.h> 27#include <asm/arch_timer.h>
28#include <asm/kvm_emulate.h>
28#include <asm/kvm_hyp.h> 29#include <asm/kvm_hyp.h>
29 30
30#include <kvm/arm_vgic.h> 31#include <kvm/arm_vgic.h>
@@ -34,7 +35,9 @@
34 35
35static struct timecounter *timecounter; 36static struct timecounter *timecounter;
36static unsigned int host_vtimer_irq; 37static unsigned int host_vtimer_irq;
38static unsigned int host_ptimer_irq;
37static u32 host_vtimer_irq_flags; 39static u32 host_vtimer_irq_flags;
40static u32 host_ptimer_irq_flags;
38 41
39static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); 42static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
40 43
@@ -52,12 +55,34 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
52static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, 55static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
53 struct arch_timer_context *timer_ctx); 56 struct arch_timer_context *timer_ctx);
54static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); 57static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
58static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
59 struct arch_timer_context *timer,
60 enum kvm_arch_timer_regs treg,
61 u64 val);
62static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
63 struct arch_timer_context *timer,
64 enum kvm_arch_timer_regs treg);
55 65
56u64 kvm_phys_timer_read(void) 66u64 kvm_phys_timer_read(void)
57{ 67{
58 return timecounter->cc->read(timecounter->cc); 68 return timecounter->cc->read(timecounter->cc);
59} 69}
60 70
71static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
72{
73 if (has_vhe()) {
74 map->direct_vtimer = vcpu_vtimer(vcpu);
75 map->direct_ptimer = vcpu_ptimer(vcpu);
76 map->emul_ptimer = NULL;
77 } else {
78 map->direct_vtimer = vcpu_vtimer(vcpu);
79 map->direct_ptimer = NULL;
80 map->emul_ptimer = vcpu_ptimer(vcpu);
81 }
82
83 trace_kvm_get_timer_map(vcpu->vcpu_id, map);
84}
85
61static inline bool userspace_irqchip(struct kvm *kvm) 86static inline bool userspace_irqchip(struct kvm *kvm)
62{ 87{
63 return static_branch_unlikely(&userspace_irqchip_in_use) && 88 return static_branch_unlikely(&userspace_irqchip_in_use) &&
@@ -78,20 +103,27 @@ static void soft_timer_cancel(struct hrtimer *hrt)
78static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 103static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
79{ 104{
80 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; 105 struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
81 struct arch_timer_context *vtimer; 106 struct arch_timer_context *ctx;
107 struct timer_map map;
82 108
83 /* 109 /*
84 * We may see a timer interrupt after vcpu_put() has been called which 110 * We may see a timer interrupt after vcpu_put() has been called which
85 * sets the CPU's vcpu pointer to NULL, because even though the timer 111 * sets the CPU's vcpu pointer to NULL, because even though the timer
86 * has been disabled in vtimer_save_state(), the hardware interrupt 112 * has been disabled in timer_save_state(), the hardware interrupt
87 * signal may not have been retired from the interrupt controller yet. 113 * signal may not have been retired from the interrupt controller yet.
88 */ 114 */
89 if (!vcpu) 115 if (!vcpu)
90 return IRQ_HANDLED; 116 return IRQ_HANDLED;
91 117
92 vtimer = vcpu_vtimer(vcpu); 118 get_timer_map(vcpu, &map);
93 if (kvm_timer_should_fire(vtimer)) 119
94 kvm_timer_update_irq(vcpu, true, vtimer); 120 if (irq == host_vtimer_irq)
121 ctx = map.direct_vtimer;
122 else
123 ctx = map.direct_ptimer;
124
125 if (kvm_timer_should_fire(ctx))
126 kvm_timer_update_irq(vcpu, true, ctx);
95 127
96 if (userspace_irqchip(vcpu->kvm) && 128 if (userspace_irqchip(vcpu->kvm) &&
97 !static_branch_unlikely(&has_gic_active_state)) 129 !static_branch_unlikely(&has_gic_active_state))
@@ -122,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
122 154
123static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) 155static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
124{ 156{
125 return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && 157 WARN_ON(timer_ctx && timer_ctx->loaded);
158 return timer_ctx &&
159 !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
126 (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); 160 (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
127} 161}
128 162
@@ -132,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
132 */ 166 */
133static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) 167static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
134{ 168{
135 u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; 169 u64 min_delta = ULLONG_MAX;
136 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 170 int i;
137 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
138 171
139 if (kvm_timer_irq_can_fire(vtimer)) 172 for (i = 0; i < NR_KVM_TIMERS; i++) {
140 min_virt = kvm_timer_compute_delta(vtimer); 173 struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
141 174
142 if (kvm_timer_irq_can_fire(ptimer)) 175 WARN(ctx->loaded, "timer %d loaded\n", i);
143 min_phys = kvm_timer_compute_delta(ptimer); 176 if (kvm_timer_irq_can_fire(ctx))
177 min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
178 }
144 179
145 /* If none of timers can fire, then return 0 */ 180 /* If none of timers can fire, then return 0 */
146 if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) 181 if (min_delta == ULLONG_MAX)
147 return 0; 182 return 0;
148 183
149 return min(min_virt, min_phys); 184 return min_delta;
150} 185}
151 186
152static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) 187static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
@@ -173,41 +208,58 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
173 return HRTIMER_NORESTART; 208 return HRTIMER_NORESTART;
174} 209}
175 210
176static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) 211static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
177{ 212{
178 struct arch_timer_context *ptimer; 213 struct arch_timer_context *ctx;
179 struct arch_timer_cpu *timer;
180 struct kvm_vcpu *vcpu; 214 struct kvm_vcpu *vcpu;
181 u64 ns; 215 u64 ns;
182 216
183 timer = container_of(hrt, struct arch_timer_cpu, phys_timer); 217 ctx = container_of(hrt, struct arch_timer_context, hrtimer);
184 vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); 218 vcpu = ctx->vcpu;
185 ptimer = vcpu_ptimer(vcpu); 219
220 trace_kvm_timer_hrtimer_expire(ctx);
186 221
187 /* 222 /*
188 * Check that the timer has really expired from the guest's 223 * Check that the timer has really expired from the guest's
189 * PoV (NTP on the host may have forced it to expire 224 * PoV (NTP on the host may have forced it to expire
190 * early). If not ready, schedule for a later time. 225 * early). If not ready, schedule for a later time.
191 */ 226 */
192 ns = kvm_timer_compute_delta(ptimer); 227 ns = kvm_timer_compute_delta(ctx);
193 if (unlikely(ns)) { 228 if (unlikely(ns)) {
194 hrtimer_forward_now(hrt, ns_to_ktime(ns)); 229 hrtimer_forward_now(hrt, ns_to_ktime(ns));
195 return HRTIMER_RESTART; 230 return HRTIMER_RESTART;
196 } 231 }
197 232
198 kvm_timer_update_irq(vcpu, true, ptimer); 233 kvm_timer_update_irq(vcpu, true, ctx);
199 return HRTIMER_NORESTART; 234 return HRTIMER_NORESTART;
200} 235}
201 236
202static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) 237static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
203{ 238{
239 enum kvm_arch_timers index;
204 u64 cval, now; 240 u64 cval, now;
205 241
242 if (!timer_ctx)
243 return false;
244
245 index = arch_timer_ctx_index(timer_ctx);
246
206 if (timer_ctx->loaded) { 247 if (timer_ctx->loaded) {
207 u32 cnt_ctl; 248 u32 cnt_ctl = 0;
249
250 switch (index) {
251 case TIMER_VTIMER:
252 cnt_ctl = read_sysreg_el0(cntv_ctl);
253 break;
254 case TIMER_PTIMER:
255 cnt_ctl = read_sysreg_el0(cntp_ctl);
256 break;
257 case NR_KVM_TIMERS:
258 /* GCC is braindead */
259 cnt_ctl = 0;
260 break;
261 }
208 262
209 /* Only the virtual timer can be loaded so far */
210 cnt_ctl = read_sysreg_el0(cntv_ctl);
211 return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && 263 return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
212 (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && 264 (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
213 !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); 265 !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
@@ -224,13 +276,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
224 276
225bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) 277bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
226{ 278{
227 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 279 struct timer_map map;
228 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
229 280
230 if (kvm_timer_should_fire(vtimer)) 281 get_timer_map(vcpu, &map);
231 return true;
232 282
233 return kvm_timer_should_fire(ptimer); 283 return kvm_timer_should_fire(map.direct_vtimer) ||
284 kvm_timer_should_fire(map.direct_ptimer) ||
285 kvm_timer_should_fire(map.emul_ptimer);
234} 286}
235 287
236/* 288/*
@@ -269,77 +321,70 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
269 } 321 }
270} 322}
271 323
272/* Schedule the background timer for the emulated timer. */ 324static void timer_emulate(struct arch_timer_context *ctx)
273static void phys_timer_emulate(struct kvm_vcpu *vcpu)
274{ 325{
275 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 326 bool should_fire = kvm_timer_should_fire(ctx);
276 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 327
328 trace_kvm_timer_emulate(ctx, should_fire);
329
330 if (should_fire) {
331 kvm_timer_update_irq(ctx->vcpu, true, ctx);
332 return;
333 }
277 334
278 /* 335 /*
279 * If the timer can fire now, we don't need to have a soft timer 336 * If the timer can fire now, we don't need to have a soft timer
280 * scheduled for the future. If the timer cannot fire at all, 337 * scheduled for the future. If the timer cannot fire at all,
281 * then we also don't need a soft timer. 338 * then we also don't need a soft timer.
282 */ 339 */
283 if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { 340 if (!kvm_timer_irq_can_fire(ctx)) {
284 soft_timer_cancel(&timer->phys_timer); 341 soft_timer_cancel(&ctx->hrtimer);
285 return; 342 return;
286 } 343 }
287 344
288 soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); 345 soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
289} 346}
290 347
291/* 348static void timer_save_state(struct arch_timer_context *ctx)
292 * Check if there was a change in the timer state, so that we should either
293 * raise or lower the line level to the GIC or schedule a background timer to
294 * emulate the physical timer.
295 */
296static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
297{ 349{
298 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 350 struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
299 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 351 enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
300 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 352 unsigned long flags;
301 bool level;
302 353
303 if (unlikely(!timer->enabled)) 354 if (!timer->enabled)
304 return; 355 return;
305 356
306 /* 357 local_irq_save(flags);
307 * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
308 * of its lifecycle is offloaded to the hardware, and we therefore may
309 * not have lowered the irq.level value before having to signal a new
310 * interrupt, but have to signal an interrupt every time the level is
311 * asserted.
312 */
313 level = kvm_timer_should_fire(vtimer);
314 kvm_timer_update_irq(vcpu, level, vtimer);
315 358
316 phys_timer_emulate(vcpu); 359 if (!ctx->loaded)
360 goto out;
317 361
318 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) 362 switch (index) {
319 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); 363 case TIMER_VTIMER:
320} 364 ctx->cnt_ctl = read_sysreg_el0(cntv_ctl);
365 ctx->cnt_cval = read_sysreg_el0(cntv_cval);
321 366
322static void vtimer_save_state(struct kvm_vcpu *vcpu) 367 /* Disable the timer */
323{ 368 write_sysreg_el0(0, cntv_ctl);
324 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 369 isb();
325 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
326 unsigned long flags;
327 370
328 local_irq_save(flags); 371 break;
372 case TIMER_PTIMER:
373 ctx->cnt_ctl = read_sysreg_el0(cntp_ctl);
374 ctx->cnt_cval = read_sysreg_el0(cntp_cval);
329 375
330 if (!vtimer->loaded) 376 /* Disable the timer */
331 goto out; 377 write_sysreg_el0(0, cntp_ctl);
378 isb();
332 379
333 if (timer->enabled) { 380 break;
334 vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); 381 case NR_KVM_TIMERS:
335 vtimer->cnt_cval = read_sysreg_el0(cntv_cval); 382 BUG();
336 } 383 }
337 384
338 /* Disable the virtual timer */ 385 trace_kvm_timer_save_state(ctx);
339 write_sysreg_el0(0, cntv_ctl);
340 isb();
341 386
342 vtimer->loaded = false; 387 ctx->loaded = false;
343out: 388out:
344 local_irq_restore(flags); 389 local_irq_restore(flags);
345} 390}
@@ -349,67 +394,72 @@ out:
349 * thread is removed from its waitqueue and made runnable when there's a timer 394 * thread is removed from its waitqueue and made runnable when there's a timer
350 * interrupt to handle. 395 * interrupt to handle.
351 */ 396 */
352void kvm_timer_schedule(struct kvm_vcpu *vcpu) 397static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
353{ 398{
354 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 399 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
355 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 400 struct timer_map map;
356 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
357
358 vtimer_save_state(vcpu);
359 401
360 /* 402 get_timer_map(vcpu, &map);
361 * No need to schedule a background timer if any guest timer has
362 * already expired, because kvm_vcpu_block will return before putting
363 * the thread to sleep.
364 */
365 if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer))
366 return;
367 403
368 /* 404 /*
369 * If both timers are not capable of raising interrupts (disabled or 405 * If no timers are capable of raising interrupts (disabled or
370 * masked), then there's no more work for us to do. 406 * masked), then there's no more work for us to do.
371 */ 407 */
372 if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) 408 if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
409 !kvm_timer_irq_can_fire(map.direct_ptimer) &&
410 !kvm_timer_irq_can_fire(map.emul_ptimer))
373 return; 411 return;
374 412
375 /* 413 /*
 376 * The guest timers have not yet expired, schedule a background timer. 414 * At least one guest timer will expire. Schedule a background timer.
377 * Set the earliest expiration time among the guest timers. 415 * Set the earliest expiration time among the guest timers.
378 */ 416 */
379 soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); 417 soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
380} 418}
381 419
382static void vtimer_restore_state(struct kvm_vcpu *vcpu) 420static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
383{ 421{
384 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 422 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
385 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 423
424 soft_timer_cancel(&timer->bg_timer);
425}
426
427static void timer_restore_state(struct arch_timer_context *ctx)
428{
429 struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
430 enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
386 unsigned long flags; 431 unsigned long flags;
387 432
433 if (!timer->enabled)
434 return;
435
388 local_irq_save(flags); 436 local_irq_save(flags);
389 437
390 if (vtimer->loaded) 438 if (ctx->loaded)
391 goto out; 439 goto out;
392 440
393 if (timer->enabled) { 441 switch (index) {
394 write_sysreg_el0(vtimer->cnt_cval, cntv_cval); 442 case TIMER_VTIMER:
443 write_sysreg_el0(ctx->cnt_cval, cntv_cval);
395 isb(); 444 isb();
396 write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); 445 write_sysreg_el0(ctx->cnt_ctl, cntv_ctl);
446 break;
447 case TIMER_PTIMER:
448 write_sysreg_el0(ctx->cnt_cval, cntp_cval);
449 isb();
450 write_sysreg_el0(ctx->cnt_ctl, cntp_ctl);
451 break;
452 case NR_KVM_TIMERS:
453 BUG();
397 } 454 }
398 455
399 vtimer->loaded = true; 456 trace_kvm_timer_restore_state(ctx);
457
458 ctx->loaded = true;
400out: 459out:
401 local_irq_restore(flags); 460 local_irq_restore(flags);
402} 461}
403 462
404void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
405{
406 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
407
408 vtimer_restore_state(vcpu);
409
410 soft_timer_cancel(&timer->bg_timer);
411}
412
413static void set_cntvoff(u64 cntvoff) 463static void set_cntvoff(u64 cntvoff)
414{ 464{
415 u32 low = lower_32_bits(cntvoff); 465 u32 low = lower_32_bits(cntvoff);
@@ -425,23 +475,32 @@ static void set_cntvoff(u64 cntvoff)
425 kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); 475 kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
426} 476}
427 477
428static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) 478static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
429{ 479{
430 int r; 480 int r;
431 r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); 481 r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
432 WARN_ON(r); 482 WARN_ON(r);
433} 483}
434 484
435static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) 485static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
436{ 486{
437 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 487 struct kvm_vcpu *vcpu = ctx->vcpu;
438 bool phys_active; 488 bool phys_active = false;
489
490 /*
491 * Update the timer output so that it is likely to match the
492 * state we're about to restore. If the timer expires between
493 * this point and the register restoration, we'll take the
494 * interrupt anyway.
495 */
496 kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
439 497
440 if (irqchip_in_kernel(vcpu->kvm)) 498 if (irqchip_in_kernel(vcpu->kvm))
441 phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); 499 phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
442 else 500
443 phys_active = vtimer->irq.level; 501 phys_active |= ctx->irq.level;
444 set_vtimer_irq_phys_active(vcpu, phys_active); 502
503 set_timer_irq_phys_active(ctx, phys_active);
445} 504}
446 505
447static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) 506static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
@@ -466,28 +525,32 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
466 525
467void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) 526void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
468{ 527{
469 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 528 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
470 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 529 struct timer_map map;
471 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
472 530
473 if (unlikely(!timer->enabled)) 531 if (unlikely(!timer->enabled))
474 return; 532 return;
475 533
476 if (static_branch_likely(&has_gic_active_state)) 534 get_timer_map(vcpu, &map);
477 kvm_timer_vcpu_load_gic(vcpu); 535
478 else 536 if (static_branch_likely(&has_gic_active_state)) {
537 kvm_timer_vcpu_load_gic(map.direct_vtimer);
538 if (map.direct_ptimer)
539 kvm_timer_vcpu_load_gic(map.direct_ptimer);
540 } else {
479 kvm_timer_vcpu_load_nogic(vcpu); 541 kvm_timer_vcpu_load_nogic(vcpu);
542 }
480 543
481 set_cntvoff(vtimer->cntvoff); 544 set_cntvoff(map.direct_vtimer->cntvoff);
482 545
483 vtimer_restore_state(vcpu); 546 kvm_timer_unblocking(vcpu);
484 547
485 /* Set the background timer for the physical timer emulation. */ 548 timer_restore_state(map.direct_vtimer);
486 phys_timer_emulate(vcpu); 549 if (map.direct_ptimer)
550 timer_restore_state(map.direct_ptimer);
487 551
488 /* If the timer fired while we weren't running, inject it now */ 552 if (map.emul_ptimer)
489 if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) 553 timer_emulate(map.emul_ptimer);
490 kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
491} 554}
492 555
493bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) 556bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -509,15 +572,20 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
509 572
510void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) 573void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
511{ 574{
512 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 575 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
576 struct timer_map map;
513 577
514 if (unlikely(!timer->enabled)) 578 if (unlikely(!timer->enabled))
515 return; 579 return;
516 580
517 vtimer_save_state(vcpu); 581 get_timer_map(vcpu, &map);
582
583 timer_save_state(map.direct_vtimer);
584 if (map.direct_ptimer)
585 timer_save_state(map.direct_ptimer);
518 586
519 /* 587 /*
520 * Cancel the physical timer emulation, because the only case where we 588 * Cancel soft timer emulation, because the only case where we
521 * need it after a vcpu_put is in the context of a sleeping VCPU, and 589 * need it after a vcpu_put is in the context of a sleeping VCPU, and
522 * in that case we already factor in the deadline for the physical 590 * in that case we already factor in the deadline for the physical
523 * timer when scheduling the bg_timer. 591 * timer when scheduling the bg_timer.
@@ -525,7 +593,11 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
525 * In any case, we re-schedule the hrtimer for the physical timer when 593 * In any case, we re-schedule the hrtimer for the physical timer when
526 * coming back to the VCPU thread in kvm_timer_vcpu_load(). 594 * coming back to the VCPU thread in kvm_timer_vcpu_load().
527 */ 595 */
528 soft_timer_cancel(&timer->phys_timer); 596 if (map.emul_ptimer)
597 soft_timer_cancel(&map.emul_ptimer->hrtimer);
598
599 if (swait_active(kvm_arch_vcpu_wq(vcpu)))
600 kvm_timer_blocking(vcpu);
529 601
530 /* 602 /*
531 * The kernel may decide to run userspace after calling vcpu_put, so 603 * The kernel may decide to run userspace after calling vcpu_put, so
@@ -534,8 +606,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
534 * counter of non-VHE case. For VHE, the virtual counter uses a fixed 606 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
535 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. 607 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
536 */ 608 */
537 if (!has_vhe()) 609 set_cntvoff(0);
538 set_cntvoff(0);
539} 610}
540 611
541/* 612/*
@@ -550,7 +621,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
550 if (!kvm_timer_should_fire(vtimer)) { 621 if (!kvm_timer_should_fire(vtimer)) {
551 kvm_timer_update_irq(vcpu, false, vtimer); 622 kvm_timer_update_irq(vcpu, false, vtimer);
552 if (static_branch_likely(&has_gic_active_state)) 623 if (static_branch_likely(&has_gic_active_state))
553 set_vtimer_irq_phys_active(vcpu, false); 624 set_timer_irq_phys_active(vtimer, false);
554 else 625 else
555 enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); 626 enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
556 } 627 }
@@ -558,7 +629,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
558 629
559void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) 630void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
560{ 631{
561 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 632 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
562 633
563 if (unlikely(!timer->enabled)) 634 if (unlikely(!timer->enabled))
564 return; 635 return;
@@ -569,9 +640,10 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
569 640
570int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) 641int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
571{ 642{
572 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 643 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
573 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 644 struct timer_map map;
574 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 645
646 get_timer_map(vcpu, &map);
575 647
576 /* 648 /*
577 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 649 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -579,12 +651,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
579 * resets the timer to be disabled and unmasked and is compliant with 651 * resets the timer to be disabled and unmasked and is compliant with
580 * the ARMv7 architecture. 652 * the ARMv7 architecture.
581 */ 653 */
582 vtimer->cnt_ctl = 0; 654 vcpu_vtimer(vcpu)->cnt_ctl = 0;
583 ptimer->cnt_ctl = 0; 655 vcpu_ptimer(vcpu)->cnt_ctl = 0;
584 kvm_timer_update_state(vcpu);
585 656
586 if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) 657 if (timer->enabled) {
587 kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); 658 kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
659 kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
660
661 if (irqchip_in_kernel(vcpu->kvm)) {
662 kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
663 if (map.direct_ptimer)
664 kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
665 }
666 }
667
668 if (map.emul_ptimer)
669 soft_timer_cancel(&map.emul_ptimer->hrtimer);
588 670
589 return 0; 671 return 0;
590} 672}
@@ -610,56 +692,76 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
610 692
611void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) 693void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
612{ 694{
613 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 695 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
614 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 696 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
615 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 697 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
616 698
617 /* Synchronize cntvoff across all vtimers of a VM. */ 699 /* Synchronize cntvoff across all vtimers of a VM. */
618 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); 700 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
619 vcpu_ptimer(vcpu)->cntvoff = 0; 701 ptimer->cntvoff = 0;
620 702
621 hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 703 hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
622 timer->bg_timer.function = kvm_bg_timer_expire; 704 timer->bg_timer.function = kvm_bg_timer_expire;
623 705
624 hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 706 hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
625 timer->phys_timer.function = kvm_phys_timer_expire; 707 hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
708 vtimer->hrtimer.function = kvm_hrtimer_expire;
709 ptimer->hrtimer.function = kvm_hrtimer_expire;
626 710
627 vtimer->irq.irq = default_vtimer_irq.irq; 711 vtimer->irq.irq = default_vtimer_irq.irq;
628 ptimer->irq.irq = default_ptimer_irq.irq; 712 ptimer->irq.irq = default_ptimer_irq.irq;
713
714 vtimer->host_timer_irq = host_vtimer_irq;
715 ptimer->host_timer_irq = host_ptimer_irq;
716
717 vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
718 ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
719
720 vtimer->vcpu = vcpu;
721 ptimer->vcpu = vcpu;
629} 722}
630 723
631static void kvm_timer_init_interrupt(void *info) 724static void kvm_timer_init_interrupt(void *info)
632{ 725{
633 enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); 726 enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
727 enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
634} 728}
635 729
636int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) 730int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
637{ 731{
638 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 732 struct arch_timer_context *timer;
639 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); 733 bool level;
640 734
641 switch (regid) { 735 switch (regid) {
642 case KVM_REG_ARM_TIMER_CTL: 736 case KVM_REG_ARM_TIMER_CTL:
643 vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; 737 timer = vcpu_vtimer(vcpu);
738 kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
644 break; 739 break;
645 case KVM_REG_ARM_TIMER_CNT: 740 case KVM_REG_ARM_TIMER_CNT:
741 timer = vcpu_vtimer(vcpu);
646 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); 742 update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
647 break; 743 break;
648 case KVM_REG_ARM_TIMER_CVAL: 744 case KVM_REG_ARM_TIMER_CVAL:
649 vtimer->cnt_cval = value; 745 timer = vcpu_vtimer(vcpu);
746 kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
650 break; 747 break;
651 case KVM_REG_ARM_PTIMER_CTL: 748 case KVM_REG_ARM_PTIMER_CTL:
652 ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; 749 timer = vcpu_ptimer(vcpu);
750 kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
653 break; 751 break;
654 case KVM_REG_ARM_PTIMER_CVAL: 752 case KVM_REG_ARM_PTIMER_CVAL:
655 ptimer->cnt_cval = value; 753 timer = vcpu_ptimer(vcpu);
754 kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
656 break; 755 break;
657 756
658 default: 757 default:
659 return -1; 758 return -1;
660 } 759 }
661 760
662 kvm_timer_update_state(vcpu); 761 level = kvm_timer_should_fire(timer);
762 kvm_timer_update_irq(vcpu, level, timer);
763 timer_emulate(timer);
764
663 return 0; 765 return 0;
664} 766}
665 767
@@ -679,26 +781,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer)
679 781
680u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) 782u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
681{ 783{
682 struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
683 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
684
685 switch (regid) { 784 switch (regid) {
686 case KVM_REG_ARM_TIMER_CTL: 785 case KVM_REG_ARM_TIMER_CTL:
687 return read_timer_ctl(vtimer); 786 return kvm_arm_timer_read(vcpu,
787 vcpu_vtimer(vcpu), TIMER_REG_CTL);
688 case KVM_REG_ARM_TIMER_CNT: 788 case KVM_REG_ARM_TIMER_CNT:
689 return kvm_phys_timer_read() - vtimer->cntvoff; 789 return kvm_arm_timer_read(vcpu,
790 vcpu_vtimer(vcpu), TIMER_REG_CNT);
690 case KVM_REG_ARM_TIMER_CVAL: 791 case KVM_REG_ARM_TIMER_CVAL:
691 return vtimer->cnt_cval; 792 return kvm_arm_timer_read(vcpu,
793 vcpu_vtimer(vcpu), TIMER_REG_CVAL);
692 case KVM_REG_ARM_PTIMER_CTL: 794 case KVM_REG_ARM_PTIMER_CTL:
693 return read_timer_ctl(ptimer); 795 return kvm_arm_timer_read(vcpu,
694 case KVM_REG_ARM_PTIMER_CVAL: 796 vcpu_ptimer(vcpu), TIMER_REG_CTL);
695 return ptimer->cnt_cval;
696 case KVM_REG_ARM_PTIMER_CNT: 797 case KVM_REG_ARM_PTIMER_CNT:
697 return kvm_phys_timer_read(); 798 return kvm_arm_timer_read(vcpu,
799 vcpu_vtimer(vcpu), TIMER_REG_CNT);
800 case KVM_REG_ARM_PTIMER_CVAL:
801 return kvm_arm_timer_read(vcpu,
802 vcpu_ptimer(vcpu), TIMER_REG_CVAL);
698 } 803 }
699 return (u64)-1; 804 return (u64)-1;
700} 805}
701 806
807static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
808 struct arch_timer_context *timer,
809 enum kvm_arch_timer_regs treg)
810{
811 u64 val;
812
813 switch (treg) {
814 case TIMER_REG_TVAL:
815 val = kvm_phys_timer_read() - timer->cntvoff - timer->cnt_cval;
816 break;
817
818 case TIMER_REG_CTL:
819 val = read_timer_ctl(timer);
820 break;
821
822 case TIMER_REG_CVAL:
823 val = timer->cnt_cval;
824 break;
825
826 case TIMER_REG_CNT:
827 val = kvm_phys_timer_read() - timer->cntvoff;
828 break;
829
830 default:
831 BUG();
832 }
833
834 return val;
835}
836
837u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
838 enum kvm_arch_timers tmr,
839 enum kvm_arch_timer_regs treg)
840{
841 u64 val;
842
843 preempt_disable();
844 kvm_timer_vcpu_put(vcpu);
845
846 val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
847
848 kvm_timer_vcpu_load(vcpu);
849 preempt_enable();
850
851 return val;
852}
853
854static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
855 struct arch_timer_context *timer,
856 enum kvm_arch_timer_regs treg,
857 u64 val)
858{
859 switch (treg) {
860 case TIMER_REG_TVAL:
861 timer->cnt_cval = val - kvm_phys_timer_read() - timer->cntvoff;
862 break;
863
864 case TIMER_REG_CTL:
865 timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
866 break;
867
868 case TIMER_REG_CVAL:
869 timer->cnt_cval = val;
870 break;
871
872 default:
873 BUG();
874 }
875}
876
877void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
878 enum kvm_arch_timers tmr,
879 enum kvm_arch_timer_regs treg,
880 u64 val)
881{
882 preempt_disable();
883 kvm_timer_vcpu_put(vcpu);
884
885 kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
886
887 kvm_timer_vcpu_load(vcpu);
888 preempt_enable();
889}
890
702static int kvm_timer_starting_cpu(unsigned int cpu) 891static int kvm_timer_starting_cpu(unsigned int cpu)
703{ 892{
704 kvm_timer_init_interrupt(NULL); 893 kvm_timer_init_interrupt(NULL);
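
The kvm_arm_timer_read()/kvm_arm_timer_write() accessors above emulate TVAL in terms of CVAL and the offset-adjusted counter. For reference, the Arm architecture defines TVAL as the signed 32-bit difference between the compare value and the current counter; the stand-alone sketch below shows only that architectural conversion and is not the kernel code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Architectural relationship between the EL1 timer registers:
 *   CNT  = physical counter - CNTVOFF   (virtual timer view)
 *   TVAL = (int32)(CVAL - CNT)          when read
 *   CVAL = CNT + sign_extend(TVAL)      when written
 */
static int32_t tval_read(uint64_t cval, uint64_t cnt)
{
	return (int32_t)(uint32_t)(cval - cnt);   /* signed 32-bit down-counter view */
}

static uint64_t cval_from_tval(int32_t tval, uint64_t cnt)
{
	return cnt + (int64_t)tval;               /* sign-extend before the add */
}

int main(void)
{
	uint64_t cnt = 1000000, cval = 1000500;

	printf("TVAL = %" PRId32 "\n", tval_read(cval, cnt));        /* 500          */
	printf("CVAL = %" PRIu64 "\n", cval_from_tval(-100, cnt));   /* cnt - 100    */
	return 0;
}
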
@@ -724,6 +913,8 @@ int kvm_timer_hyp_init(bool has_gic)
724 return -ENODEV; 913 return -ENODEV;
725 } 914 }
726 915
916 /* First, do the virtual EL1 timer irq */
917
727 if (info->virtual_irq <= 0) { 918 if (info->virtual_irq <= 0) {
728 kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", 919 kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
729 info->virtual_irq); 920 info->virtual_irq);
@@ -734,15 +925,15 @@ int kvm_timer_hyp_init(bool has_gic)
734 host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); 925 host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
735 if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && 926 if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
736 host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { 927 host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
737 kvm_err("Invalid trigger for IRQ%d, assuming level low\n", 928 kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
738 host_vtimer_irq); 929 host_vtimer_irq);
739 host_vtimer_irq_flags = IRQF_TRIGGER_LOW; 930 host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
740 } 931 }
741 932
742 err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, 933 err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
743 "kvm guest timer", kvm_get_running_vcpus()); 934 "kvm guest vtimer", kvm_get_running_vcpus());
744 if (err) { 935 if (err) {
745 kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", 936 kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
746 host_vtimer_irq, err); 937 host_vtimer_irq, err);
747 return err; 938 return err;
748 } 939 }
@@ -760,6 +951,43 @@ int kvm_timer_hyp_init(bool has_gic)
760 951
761 kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); 952 kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
762 953
954 /* Now let's do the physical EL1 timer irq */
955
956 if (info->physical_irq > 0) {
957 host_ptimer_irq = info->physical_irq;
958 host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
959 if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
960 host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
961 kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
962 host_ptimer_irq);
963 host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
964 }
965
966 err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
967 "kvm guest ptimer", kvm_get_running_vcpus());
968 if (err) {
969 kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
970 host_ptimer_irq, err);
971 return err;
972 }
973
974 if (has_gic) {
975 err = irq_set_vcpu_affinity(host_ptimer_irq,
976 kvm_get_running_vcpus());
977 if (err) {
978 kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
979 goto out_free_irq;
980 }
981 }
982
983 kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
984 } else if (has_vhe()) {
985 kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
986 info->physical_irq);
987 err = -ENODEV;
988 goto out_free_irq;
989 }
990
763 cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, 991 cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
764 "kvm/arm/timer:starting", kvm_timer_starting_cpu, 992 "kvm/arm/timer:starting", kvm_timer_starting_cpu,
765 kvm_timer_dying_cpu); 993 kvm_timer_dying_cpu);
@@ -771,7 +999,7 @@ out_free_irq:
771 999
772void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) 1000void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
773{ 1001{
774 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 1002 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
775 1003
776 soft_timer_cancel(&timer->bg_timer); 1004 soft_timer_cancel(&timer->bg_timer);
777} 1005}
@@ -807,16 +1035,18 @@ bool kvm_arch_timer_get_input_level(int vintid)
807 1035
808 if (vintid == vcpu_vtimer(vcpu)->irq.irq) 1036 if (vintid == vcpu_vtimer(vcpu)->irq.irq)
809 timer = vcpu_vtimer(vcpu); 1037 timer = vcpu_vtimer(vcpu);
1038 else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
1039 timer = vcpu_ptimer(vcpu);
810 else 1040 else
811 BUG(); /* We only map the vtimer so far */ 1041 BUG();
812 1042
813 return kvm_timer_should_fire(timer); 1043 return kvm_timer_should_fire(timer);
814} 1044}
815 1045
816int kvm_timer_enable(struct kvm_vcpu *vcpu) 1046int kvm_timer_enable(struct kvm_vcpu *vcpu)
817{ 1047{
818 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 1048 struct arch_timer_cpu *timer = vcpu_timer(vcpu);
819 struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); 1049 struct timer_map map;
820 int ret; 1050 int ret;
821 1051
822 if (timer->enabled) 1052 if (timer->enabled)
@@ -834,19 +1064,33 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
834 return -EINVAL; 1064 return -EINVAL;
835 } 1065 }
836 1066
837 ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, 1067 get_timer_map(vcpu, &map);
1068
1069 ret = kvm_vgic_map_phys_irq(vcpu,
1070 map.direct_vtimer->host_timer_irq,
1071 map.direct_vtimer->irq.irq,
838 kvm_arch_timer_get_input_level); 1072 kvm_arch_timer_get_input_level);
839 if (ret) 1073 if (ret)
840 return ret; 1074 return ret;
841 1075
1076 if (map.direct_ptimer) {
1077 ret = kvm_vgic_map_phys_irq(vcpu,
1078 map.direct_ptimer->host_timer_irq,
1079 map.direct_ptimer->irq.irq,
1080 kvm_arch_timer_get_input_level);
1081 }
1082
1083 if (ret)
1084 return ret;
1085
842no_vgic: 1086no_vgic:
843 timer->enabled = 1; 1087 timer->enabled = 1;
844 return 0; 1088 return 0;
845} 1089}
846 1090
847/* 1091/*
848 * On VHE systems, we only need to configure trap on physical timer and counter 1092 * On VHE systems, we only need to configure the EL2 timer trap register once,
849 * accesses in EL0 and EL1 once, not for every world switch. 1093 * not for every world switch.
850 * The host kernel runs at EL2 with HCR_EL2.TGE == 1, 1094 * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
851 * and this makes those bits have no effect for the host kernel execution. 1095 * and this makes those bits have no effect for the host kernel execution.
852 */ 1096 */
@@ -857,11 +1101,11 @@ void kvm_timer_init_vhe(void)
857 u64 val; 1101 u64 val;
858 1102
859 /* 1103 /*
860 * Disallow physical timer access for the guest. 1104 * VHE systems allow the guest direct access to the EL1 physical
861 * Physical counter access is allowed. 1105 * timer/counter.
862 */ 1106 */
863 val = read_sysreg(cnthctl_el2); 1107 val = read_sysreg(cnthctl_el2);
864 val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift); 1108 val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
865 val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); 1109 val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
866 write_sysreg(val, cnthctl_el2); 1110 write_sysreg(val, cnthctl_el2);
867} 1111}
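
The arch_timer rework above revolves around a per-vCPU timer map: the virtual EL1 timer is always handled directly in hardware, the physical EL1 timer is handled directly only when the host can hand it to the guest (the VHE case, which is why kvm_timer_hyp_init() now insists on a physical timer IRQ there), and otherwise the physical timer is emulated with an hrtimer. The real get_timer_map() is not shown in these hunks, so the user-space sketch below, with invented struct and field names, is purely an illustration of that selection:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct timer_ctx { const char *name; };

/* Invented mirror of the kernel's struct timer_map. */
struct timer_map {
	struct timer_ctx *direct_vtimer;  /* always backed by hardware      */
	struct timer_ctx *direct_ptimer;  /* hardware-backed only on VHE    */
	struct timer_ctx *emul_ptimer;    /* hrtimer-emulated otherwise     */
};

struct vcpu {
	bool has_vhe;                     /* stand-in for has_vhe()         */
	struct timer_ctx vtimer, ptimer;
};

static void get_map(struct vcpu *v, struct timer_map *map)
{
	map->direct_vtimer = &v->vtimer;
	if (v->has_vhe) {
		map->direct_ptimer = &v->ptimer;
		map->emul_ptimer = NULL;
	} else {
		map->direct_ptimer = NULL;
		map->emul_ptimer = &v->ptimer;
	}
}

int main(void)
{
	struct vcpu v = { .has_vhe = false,
			  .vtimer = { "vtimer" }, .ptimer = { "ptimer" } };
	struct timer_map map;

	get_map(&v, &map);
	printf("direct ptimer: %s, emulated ptimer: %s\n",
	       map.direct_ptimer ? map.direct_ptimer->name : "none",
	       map.emul_ptimer ? map.emul_ptimer->name : "none");
	return 0;
}

This is also why kvm_timer_init_vhe() above now sets both CNTHCTL_EL1PCEN and CNTHCTL_EL1PCTEN, giving the guest direct access to the EL1 physical timer and counter on VHE hosts.
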
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 9c486fad3f9f..99c37384ba7b 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -65,7 +65,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
65/* The VMID used in the VTTBR */ 65/* The VMID used in the VTTBR */
66static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); 66static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
67static u32 kvm_next_vmid; 67static u32 kvm_next_vmid;
68static unsigned int kvm_vmid_bits __read_mostly;
69static DEFINE_SPINLOCK(kvm_vmid_lock); 68static DEFINE_SPINLOCK(kvm_vmid_lock);
70 69
71static bool vgic_present; 70static bool vgic_present;
@@ -142,7 +141,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
142 kvm_vgic_early_init(kvm); 141 kvm_vgic_early_init(kvm);
143 142
144 /* Mark the initial VMID generation invalid */ 143 /* Mark the initial VMID generation invalid */
145 kvm->arch.vmid_gen = 0; 144 kvm->arch.vmid.vmid_gen = 0;
146 145
147 /* The maximum number of VCPUs is limited by the host's GIC model */ 146 /* The maximum number of VCPUs is limited by the host's GIC model */
148 kvm->arch.max_vcpus = vgic_present ? 147 kvm->arch.max_vcpus = vgic_present ?
@@ -336,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
336 335
337void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) 336void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
338{ 337{
339 kvm_timer_schedule(vcpu);
340 kvm_vgic_v4_enable_doorbell(vcpu); 338 kvm_vgic_v4_enable_doorbell(vcpu);
341} 339}
342 340
343void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) 341void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
344{ 342{
345 kvm_timer_unschedule(vcpu);
346 kvm_vgic_v4_disable_doorbell(vcpu); 343 kvm_vgic_v4_disable_doorbell(vcpu);
347} 344}
348 345
@@ -472,37 +469,31 @@ void force_vm_exit(const cpumask_t *mask)
472 469
473/** 470/**
474 * need_new_vmid_gen - check that the VMID is still valid 471 * need_new_vmid_gen - check that the VMID is still valid
475 * @kvm: The VM's VMID to check 472 * @vmid: The VMID to check
476 * 473 *
477 * return true if there is a new generation of VMIDs being used 474 * return true if there is a new generation of VMIDs being used
478 * 475 *
479 * The hardware supports only 256 values with the value zero reserved for the 476 * The hardware supports a limited set of values with the value zero reserved
480 * host, so we check if an assigned value belongs to a previous generation, 477 * for the host, so we check if an assigned value belongs to a previous
481 * which requires us to assign a new value. If we're the first to use a 478 * generation, which requires us to assign a new value. If we're the
482 * VMID for the new generation, we must flush necessary caches and TLBs on all 479 * first to use a VMID for the new generation, we must flush necessary caches
483 * CPUs. 480 * and TLBs on all CPUs.
484 */ 481 */
485static bool need_new_vmid_gen(struct kvm *kvm) 482static bool need_new_vmid_gen(struct kvm_vmid *vmid)
486{ 483{
487 u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); 484 u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
488 smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ 485 smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
489 return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen); 486 return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
490} 487}
491 488
492/** 489/**
493 * update_vttbr - Update the VTTBR with a valid VMID before the guest runs 490 * update_vmid - Update the vmid with a valid VMID for the current generation
494 * @kvm The guest that we are about to run 491 * @kvm: The guest that struct vmid belongs to
495 * 492 * @vmid: The stage-2 VMID information struct
496 * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
497 * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
498 * caches and TLBs.
499 */ 493 */
500static void update_vttbr(struct kvm *kvm) 494static void update_vmid(struct kvm_vmid *vmid)
501{ 495{
502 phys_addr_t pgd_phys; 496 if (!need_new_vmid_gen(vmid))
503 u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0;
504
505 if (!need_new_vmid_gen(kvm))
506 return; 497 return;
507 498
508 spin_lock(&kvm_vmid_lock); 499 spin_lock(&kvm_vmid_lock);
@@ -512,7 +503,7 @@ static void update_vttbr(struct kvm *kvm)
512 * already allocated a valid vmid for this vm, then this vcpu should 503 * already allocated a valid vmid for this vm, then this vcpu should
513 * use the same vmid. 504 * use the same vmid.
514 */ 505 */
515 if (!need_new_vmid_gen(kvm)) { 506 if (!need_new_vmid_gen(vmid)) {
516 spin_unlock(&kvm_vmid_lock); 507 spin_unlock(&kvm_vmid_lock);
517 return; 508 return;
518 } 509 }
@@ -536,18 +527,12 @@ static void update_vttbr(struct kvm *kvm)
536 kvm_call_hyp(__kvm_flush_vm_context); 527 kvm_call_hyp(__kvm_flush_vm_context);
537 } 528 }
538 529
539 kvm->arch.vmid = kvm_next_vmid; 530 vmid->vmid = kvm_next_vmid;
540 kvm_next_vmid++; 531 kvm_next_vmid++;
541 kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; 532 kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
542
543 /* update vttbr to be used with the new vmid */
544 pgd_phys = virt_to_phys(kvm->arch.pgd);
545 BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm));
546 vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
547 kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
548 533
549 smp_wmb(); 534 smp_wmb();
550 WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen)); 535 WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
551 536
552 spin_unlock(&kvm_vmid_lock); 537 spin_unlock(&kvm_vmid_lock);
553} 538}
@@ -700,7 +685,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
700 */ 685 */
701 cond_resched(); 686 cond_resched();
702 687
703 update_vttbr(vcpu->kvm); 688 update_vmid(&vcpu->kvm->arch.vmid);
704 689
705 check_vcpu_requests(vcpu); 690 check_vcpu_requests(vcpu);
706 691
@@ -749,7 +734,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
749 */ 734 */
750 smp_store_mb(vcpu->mode, IN_GUEST_MODE); 735 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
751 736
752 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || 737 if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
753 kvm_request_pending(vcpu)) { 738 kvm_request_pending(vcpu)) {
754 vcpu->mode = OUTSIDE_GUEST_MODE; 739 vcpu->mode = OUTSIDE_GUEST_MODE;
755 isb(); /* Ensure work in x_flush_hwstate is committed */ 740 isb(); /* Ensure work in x_flush_hwstate is committed */
@@ -775,7 +760,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
775 ret = kvm_vcpu_run_vhe(vcpu); 760 ret = kvm_vcpu_run_vhe(vcpu);
776 kvm_arm_vhe_guest_exit(); 761 kvm_arm_vhe_guest_exit();
777 } else { 762 } else {
778 ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu); 763 ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
779 } 764 }
780 765
781 vcpu->mode = OUTSIDE_GUEST_MODE; 766 vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -1427,10 +1412,6 @@ static inline void hyp_cpu_pm_exit(void)
1427 1412
1428static int init_common_resources(void) 1413static int init_common_resources(void)
1429{ 1414{
1430 /* set size of VMID supported by CPU */
1431 kvm_vmid_bits = kvm_get_vmid_bits();
1432 kvm_info("%d-bit VMID\n", kvm_vmid_bits);
1433
1434 kvm_set_ipa_limit(); 1415 kvm_set_ipa_limit();
1435 1416
1436 return 0; 1417 return 0;
@@ -1571,6 +1552,7 @@ static int init_hyp_mode(void)
1571 kvm_cpu_context_t *cpu_ctxt; 1552 kvm_cpu_context_t *cpu_ctxt;
1572 1553
1573 cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); 1554 cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu);
1555 kvm_init_host_cpu_context(cpu_ctxt, cpu);
1574 err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); 1556 err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
1575 1557
1576 if (err) { 1558 if (err) {
@@ -1581,7 +1563,7 @@ static int init_hyp_mode(void)
1581 1563
1582 err = hyp_map_aux_data(); 1564 err = hyp_map_aux_data();
1583 if (err) 1565 if (err)
1584 kvm_err("Cannot map host auxilary data: %d\n", err); 1566 kvm_err("Cannot map host auxiliary data: %d\n", err);
1585 1567
1586 return 0; 1568 return 0;
1587 1569
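
The VTTBR/VMID refactoring above moves the generation counter and VMID into struct kvm_vmid, but the allocation scheme itself is unchanged: a global generation is bumped whenever the VMID space wraps, and a VM re-allocates only when its recorded generation falls behind. A compact user-space model of that scheme, with the locking and TLB flush only hinted at in comments:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VMID_BITS 8u                       /* stand-in for kvm_get_vmid_bits() */

struct vmid_model {
	uint64_t vmid_gen;                 /* generation this VMID was handed out in */
	uint32_t vmid;
};

static uint64_t kvm_vmid_gen = 1;          /* global generation, starts at 1 */
static uint32_t kvm_next_vmid;             /* 0 means the VMID space wrapped */

static bool need_new_vmid_gen(const struct vmid_model *v)
{
	return v->vmid_gen != kvm_vmid_gen;
}

static void update_vmid(struct vmid_model *v)
{
	if (!need_new_vmid_gen(v))
		return;

	/* kernel: take kvm_vmid_lock, re-check, then proceed */

	if (kvm_next_vmid == 0) {
		/* first user of a new generation: bump it and flush all TLBs */
		kvm_vmid_gen++;
		kvm_next_vmid = 1;         /* VMID 0 stays reserved for the host */
	}

	v->vmid = kvm_next_vmid++;
	kvm_next_vmid &= (1u << VMID_BITS) - 1;
	v->vmid_gen = kvm_vmid_gen;
}

int main(void)
{
	struct vmid_model vm = { 0, 0 };

	update_vmid(&vm);                  /* first run: allocates a fresh VMID */
	update_vmid(&vm);                  /* generation unchanged: keeps it    */
	printf("gen %llu, vmid %u\n",
	       (unsigned long long)vm.vmid_gen, vm.vmid);
	return 0;
}
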
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 9652c453480f..264d92da3240 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -226,7 +226,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
226 int i; 226 int i;
227 u32 elrsr; 227 u32 elrsr;
228 228
229 elrsr = read_gicreg(ICH_ELSR_EL2); 229 elrsr = read_gicreg(ICH_ELRSR_EL2);
230 230
231 write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); 231 write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
232 232
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index e9d28a7ca673..ffd7acdceac7 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -908,6 +908,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
908 */ 908 */
909int kvm_alloc_stage2_pgd(struct kvm *kvm) 909int kvm_alloc_stage2_pgd(struct kvm *kvm)
910{ 910{
911 phys_addr_t pgd_phys;
911 pgd_t *pgd; 912 pgd_t *pgd;
912 913
913 if (kvm->arch.pgd != NULL) { 914 if (kvm->arch.pgd != NULL) {
@@ -920,7 +921,12 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
920 if (!pgd) 921 if (!pgd)
921 return -ENOMEM; 922 return -ENOMEM;
922 923
924 pgd_phys = virt_to_phys(pgd);
925 if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
926 return -EINVAL;
927
923 kvm->arch.pgd = pgd; 928 kvm->arch.pgd = pgd;
929 kvm->arch.pgd_phys = pgd_phys;
924 return 0; 930 return 0;
925} 931}
926 932
@@ -1008,6 +1014,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
1008 unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); 1014 unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
1009 pgd = READ_ONCE(kvm->arch.pgd); 1015 pgd = READ_ONCE(kvm->arch.pgd);
1010 kvm->arch.pgd = NULL; 1016 kvm->arch.pgd = NULL;
1017 kvm->arch.pgd_phys = 0;
1011 } 1018 }
1012 spin_unlock(&kvm->mmu_lock); 1019 spin_unlock(&kvm->mmu_lock);
1013 1020
@@ -1396,14 +1403,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
1396 return false; 1403 return false;
1397} 1404}
1398 1405
1399static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
1400{
1401 if (kvm_vcpu_trap_is_iabt(vcpu))
1402 return false;
1403
1404 return kvm_vcpu_dabt_iswrite(vcpu);
1405}
1406
1407/** 1406/**
1408 * stage2_wp_ptes - write protect PMD range 1407 * stage2_wp_ptes - write protect PMD range
1409 * @pmd: pointer to pmd entry 1408 * @pmd: pointer to pmd entry
@@ -1598,14 +1597,13 @@ static void kvm_send_hwpoison_signal(unsigned long address,
1598static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, 1597static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
1599 unsigned long hva) 1598 unsigned long hva)
1600{ 1599{
1601 gpa_t gpa_start, gpa_end; 1600 gpa_t gpa_start;
1602 hva_t uaddr_start, uaddr_end; 1601 hva_t uaddr_start, uaddr_end;
1603 size_t size; 1602 size_t size;
1604 1603
1605 size = memslot->npages * PAGE_SIZE; 1604 size = memslot->npages * PAGE_SIZE;
1606 1605
1607 gpa_start = memslot->base_gfn << PAGE_SHIFT; 1606 gpa_start = memslot->base_gfn << PAGE_SHIFT;
1608 gpa_end = gpa_start + size;
1609 1607
1610 uaddr_start = memslot->userspace_addr; 1608 uaddr_start = memslot->userspace_addr;
1611 uaddr_end = uaddr_start + size; 1609 uaddr_end = uaddr_start + size;
@@ -2353,7 +2351,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
2353 return 0; 2351 return 0;
2354} 2352}
2355 2353
2356void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) 2354void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
2357{ 2355{
2358} 2356}
2359 2357
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
index 3828beab93f2..204d210d01c2 100644
--- a/virt/kvm/arm/trace.h
+++ b/virt/kvm/arm/trace.h
@@ -2,6 +2,7 @@
2#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) 2#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
3#define _TRACE_KVM_H 3#define _TRACE_KVM_H
4 4
5#include <kvm/arm_arch_timer.h>
5#include <linux/tracepoint.h> 6#include <linux/tracepoint.h>
6 7
7#undef TRACE_SYSTEM 8#undef TRACE_SYSTEM
@@ -262,10 +263,114 @@ TRACE_EVENT(kvm_timer_update_irq,
262 __entry->vcpu_id, __entry->irq, __entry->level) 263 __entry->vcpu_id, __entry->irq, __entry->level)
263); 264);
264 265
266TRACE_EVENT(kvm_get_timer_map,
267 TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
268 TP_ARGS(vcpu_id, map),
269
270 TP_STRUCT__entry(
271 __field( unsigned long, vcpu_id )
272 __field( int, direct_vtimer )
273 __field( int, direct_ptimer )
274 __field( int, emul_ptimer )
275 ),
276
277 TP_fast_assign(
278 __entry->vcpu_id = vcpu_id;
279 __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer);
280 __entry->direct_ptimer =
281 (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
282 __entry->emul_ptimer =
283 (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
284 ),
285
286 TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
287 __entry->vcpu_id,
288 __entry->direct_vtimer,
289 __entry->direct_ptimer,
290 __entry->emul_ptimer)
291);
292
293TRACE_EVENT(kvm_timer_save_state,
294 TP_PROTO(struct arch_timer_context *ctx),
295 TP_ARGS(ctx),
296
297 TP_STRUCT__entry(
298 __field( unsigned long, ctl )
299 __field( unsigned long long, cval )
300 __field( int, timer_idx )
301 ),
302
303 TP_fast_assign(
304 __entry->ctl = ctx->cnt_ctl;
305 __entry->cval = ctx->cnt_cval;
306 __entry->timer_idx = arch_timer_ctx_index(ctx);
307 ),
308
309 TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
310 __entry->ctl,
311 __entry->cval,
312 __entry->timer_idx)
313);
314
315TRACE_EVENT(kvm_timer_restore_state,
316 TP_PROTO(struct arch_timer_context *ctx),
317 TP_ARGS(ctx),
318
319 TP_STRUCT__entry(
320 __field( unsigned long, ctl )
321 __field( unsigned long long, cval )
322 __field( int, timer_idx )
323 ),
324
325 TP_fast_assign(
326 __entry->ctl = ctx->cnt_ctl;
327 __entry->cval = ctx->cnt_cval;
328 __entry->timer_idx = arch_timer_ctx_index(ctx);
329 ),
330
331 TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
332 __entry->ctl,
333 __entry->cval,
334 __entry->timer_idx)
335);
336
337TRACE_EVENT(kvm_timer_hrtimer_expire,
338 TP_PROTO(struct arch_timer_context *ctx),
339 TP_ARGS(ctx),
340
341 TP_STRUCT__entry(
342 __field( int, timer_idx )
343 ),
344
345 TP_fast_assign(
346 __entry->timer_idx = arch_timer_ctx_index(ctx);
347 ),
348
349 TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
350);
351
352TRACE_EVENT(kvm_timer_emulate,
353 TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
354 TP_ARGS(ctx, should_fire),
355
356 TP_STRUCT__entry(
357 __field( int, timer_idx )
358 __field( bool, should_fire )
359 ),
360
361 TP_fast_assign(
362 __entry->timer_idx = arch_timer_ctx_index(ctx);
363 __entry->should_fire = should_fire;
364 ),
365
366 TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
367 __entry->timer_idx, __entry->should_fire)
368);
369
265#endif /* _TRACE_KVM_H */ 370#endif /* _TRACE_KVM_H */
266 371
267#undef TRACE_INCLUDE_PATH 372#undef TRACE_INCLUDE_PATH
268#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm 373#define TRACE_INCLUDE_PATH ../../virt/kvm/arm
269#undef TRACE_INCLUDE_FILE 374#undef TRACE_INCLUDE_FILE
270#define TRACE_INCLUDE_FILE trace 375#define TRACE_INCLUDE_FILE trace
271 376
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 4ee0aeb9a905..408a78eb6a97 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -589,7 +589,7 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
589 */ 589 */
590int vgic_v3_probe(const struct gic_kvm_info *info) 590int vgic_v3_probe(const struct gic_kvm_info *info)
591{ 591{
592 u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); 592 u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
593 int ret; 593 int ret;
594 594
595 /* 595 /*
@@ -679,7 +679,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
679 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; 679 struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
680 680
681 if (likely(cpu_if->vgic_sre)) 681 if (likely(cpu_if->vgic_sre))
682 cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); 682 cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
683 683
684 kvm_call_hyp(__vgic_v3_save_aprs, vcpu); 684 kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
685 685
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 6855cce3e528..5294abb3f178 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
144 if (zone->pio != 1 && zone->pio != 0) 144 if (zone->pio != 1 && zone->pio != 0)
145 return -EINVAL; 145 return -EINVAL;
146 146
147 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); 147 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev),
148 GFP_KERNEL_ACCOUNT);
148 if (!dev) 149 if (!dev)
149 return -ENOMEM; 150 return -ENOMEM;
150 151
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b20b751286fc..4325250afd72 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
297 if (!kvm_arch_intc_initialized(kvm)) 297 if (!kvm_arch_intc_initialized(kvm))
298 return -EAGAIN; 298 return -EAGAIN;
299 299
300 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); 300 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
301 if (!irqfd) 301 if (!irqfd)
302 return -ENOMEM; 302 return -ENOMEM;
303 303
@@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
345 } 345 }
346 346
347 if (!irqfd->resampler) { 347 if (!irqfd->resampler) {
348 resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); 348 resampler = kzalloc(sizeof(*resampler),
349 GFP_KERNEL_ACCOUNT);
349 if (!resampler) { 350 if (!resampler) {
350 ret = -ENOMEM; 351 ret = -ENOMEM;
351 mutex_unlock(&kvm->irqfds.resampler_lock); 352 mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
797 if (IS_ERR(eventfd)) 798 if (IS_ERR(eventfd))
798 return PTR_ERR(eventfd); 799 return PTR_ERR(eventfd);
799 800
800 p = kzalloc(sizeof(*p), GFP_KERNEL); 801 p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
801 if (!p) { 802 if (!p) {
802 ret = -ENOMEM; 803 ret = -ENOMEM;
803 goto fail; 804 goto fail;
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index b1286c4e0712..3547b0d8c91e 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -196,7 +196,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
196 nr_rt_entries += 1; 196 nr_rt_entries += 1;
197 197
198 new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)), 198 new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
199 GFP_KERNEL); 199 GFP_KERNEL_ACCOUNT);
200 200
201 if (!new) 201 if (!new)
202 return -ENOMEM; 202 return -ENOMEM;
@@ -208,7 +208,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
208 208
209 for (i = 0; i < nr; ++i) { 209 for (i = 0; i < nr; ++i) {
210 r = -ENOMEM; 210 r = -ENOMEM;
211 e = kzalloc(sizeof(*e), GFP_KERNEL); 211 e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
212 if (!e) 212 if (!e)
213 goto out; 213 goto out;
214 214
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d237d3350a99..f25aa98a94df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2;
81module_param(halt_poll_ns_grow, uint, 0644); 81module_param(halt_poll_ns_grow, uint, 0644);
82EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
83 83
84/* The start value to grow halt_poll_ns from */
85unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
86module_param(halt_poll_ns_grow_start, uint, 0644);
87EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
88
84/* Default resets per-vcpu halt_poll_ns . */ 89/* Default resets per-vcpu halt_poll_ns . */
85unsigned int halt_poll_ns_shrink; 90unsigned int halt_poll_ns_shrink;
86module_param(halt_poll_ns_shrink, uint, 0644); 91module_param(halt_poll_ns_shrink, uint, 0644);
@@ -525,7 +530,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
525 int i; 530 int i;
526 struct kvm_memslots *slots; 531 struct kvm_memslots *slots;
527 532
528 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 533 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
529 if (!slots) 534 if (!slots)
530 return NULL; 535 return NULL;
531 536
@@ -601,12 +606,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
601 606
602 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 607 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
603 sizeof(*kvm->debugfs_stat_data), 608 sizeof(*kvm->debugfs_stat_data),
604 GFP_KERNEL); 609 GFP_KERNEL_ACCOUNT);
605 if (!kvm->debugfs_stat_data) 610 if (!kvm->debugfs_stat_data)
606 return -ENOMEM; 611 return -ENOMEM;
607 612
608 for (p = debugfs_entries; p->name; p++) { 613 for (p = debugfs_entries; p->name; p++) {
609 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); 614 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
610 if (!stat_data) 615 if (!stat_data)
611 return -ENOMEM; 616 return -ENOMEM;
612 617
@@ -656,12 +661,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
656 struct kvm_memslots *slots = kvm_alloc_memslots(); 661 struct kvm_memslots *slots = kvm_alloc_memslots();
657 if (!slots) 662 if (!slots)
658 goto out_err_no_srcu; 663 goto out_err_no_srcu;
659 /* 664 /* Generations must be different for each address space. */
660 * Generations must be different for each address space. 665 slots->generation = i;
661 * Init kvm generation close to the maximum to easily test the
662 * code of handling generation number wrap-around.
663 */
664 slots->generation = i * 2 - 150;
665 rcu_assign_pointer(kvm->memslots[i], slots); 666 rcu_assign_pointer(kvm->memslots[i], slots);
666 } 667 }
667 668
@@ -671,7 +672,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
671 goto out_err_no_irq_srcu; 672 goto out_err_no_irq_srcu;
672 for (i = 0; i < KVM_NR_BUSES; i++) { 673 for (i = 0; i < KVM_NR_BUSES; i++) {
673 rcu_assign_pointer(kvm->buses[i], 674 rcu_assign_pointer(kvm->buses[i],
674 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); 675 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
675 if (!kvm->buses[i]) 676 if (!kvm->buses[i])
676 goto out_err; 677 goto out_err;
677 } 678 }
@@ -789,7 +790,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
789{ 790{
790 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 791 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
791 792
792 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); 793 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
793 if (!memslot->dirty_bitmap) 794 if (!memslot->dirty_bitmap)
794 return -ENOMEM; 795 return -ENOMEM;
795 796
@@ -874,31 +875,34 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
874 int as_id, struct kvm_memslots *slots) 875 int as_id, struct kvm_memslots *slots)
875{ 876{
876 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 877 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
878 u64 gen = old_memslots->generation;
877 879
878 /* 880 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
879 * Set the low bit in the generation, which disables SPTE caching 881 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
880 * until the end of synchronize_srcu_expedited.
881 */
882 WARN_ON(old_memslots->generation & 1);
883 slots->generation = old_memslots->generation + 1;
884 882
885 rcu_assign_pointer(kvm->memslots[as_id], slots); 883 rcu_assign_pointer(kvm->memslots[as_id], slots);
886 synchronize_srcu_expedited(&kvm->srcu); 884 synchronize_srcu_expedited(&kvm->srcu);
887 885
888 /* 886 /*
889 * Increment the new memslot generation a second time. This prevents 887 * Increment the new memslot generation a second time, dropping the
890 * vm exits that race with memslot updates from caching a memslot 888 * update in-progress flag and incrementing the generation based on
891 * generation that will (potentially) be valid forever. 889 * the number of address spaces. This provides a unique and easily
892 * 890 * identifiable generation number while the memslots are in flux.
891 */
892 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
893
894 /*
893 * Generations must be unique even across address spaces. We do not need 895 * Generations must be unique even across address spaces. We do not need
894 * a global counter for that, instead the generation space is evenly split 896 * a global counter for that, instead the generation space is evenly split
895 * across address spaces. For example, with two address spaces, address 897 * across address spaces. For example, with two address spaces, address
896 * space 0 will use generations 0, 4, 8, ... while * address space 1 will 898 * space 0 will use generations 0, 2, 4, ... while address space 1 will
897 * use generations 2, 6, 10, 14, ... 899 * use generations 1, 3, 5, ...
898 */ 900 */
899 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; 901 gen += KVM_ADDRESS_SPACE_NUM;
902
903 kvm_arch_memslots_updated(kvm, gen);
900 904
901 kvm_arch_memslots_updated(kvm, slots); 905 slots->generation = gen;
902 906
903 return old_memslots; 907 return old_memslots;
904} 908}
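
The install_new_memslots() hunk above replaces the old "odd generation means update in flight" trick with an explicit in-progress flag bit and reserves one residue class per address space, so a generation number alone identifies both the address space and whether an update is pending. The flag's bit position is not visible in this hunk, so the sketch below assumes bit 63 purely for illustration and models two address spaces:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ADDRESS_SPACES          2u                /* KVM_ADDRESS_SPACE_NUM      */
#define GEN_UPDATE_IN_PROGRESS  (1ULL << 63)      /* flag position is an assumption */

/* Mark an update as in flight so readers can tell the memslots are in flux. */
static uint64_t gen_start_update(uint64_t gen)
{
	return gen | GEN_UPDATE_IN_PROGRESS;
}

/*
 * Finish the update: drop the flag and step by the number of address
 * spaces, so address space 0 uses generations 0, 2, 4, ... and address
 * space 1 uses 1, 3, 5, ... when there are two address spaces.
 */
static uint64_t gen_finish_update(uint64_t gen)
{
	return (gen & ~GEN_UPDATE_IN_PROGRESS) + ADDRESS_SPACES;
}

int main(void)
{
	uint64_t gen = 0;                         /* address space 0 */

	for (int i = 0; i < 3; i++) {
		uint64_t busy = gen_start_update(gen);
		gen = gen_finish_update(busy);
		printf("update %d -> generation %" PRIu64 "\n", i, gen);
	}
	return 0;
}
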
@@ -1018,7 +1022,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
1018 goto out_free; 1022 goto out_free;
1019 } 1023 }
1020 1024
1021 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 1025 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
1022 if (!slots) 1026 if (!slots)
1023 goto out_free; 1027 goto out_free;
1024 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1028 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
@@ -1201,11 +1205,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1201 mask = xchg(&dirty_bitmap[i], 0); 1205 mask = xchg(&dirty_bitmap[i], 0);
1202 dirty_bitmap_buffer[i] = mask; 1206 dirty_bitmap_buffer[i] = mask;
1203 1207
1204 if (mask) { 1208 offset = i * BITS_PER_LONG;
1205 offset = i * BITS_PER_LONG; 1209 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1206 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1210 offset, mask);
1207 offset, mask);
1208 }
1209 } 1211 }
1210 spin_unlock(&kvm->mmu_lock); 1212 spin_unlock(&kvm->mmu_lock);
1211 } 1213 }
@@ -2185,20 +2187,23 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2185 2187
2186static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2188static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2187{ 2189{
2188 unsigned int old, val, grow; 2190 unsigned int old, val, grow, grow_start;
2189 2191
2190 old = val = vcpu->halt_poll_ns; 2192 old = val = vcpu->halt_poll_ns;
2193 grow_start = READ_ONCE(halt_poll_ns_grow_start);
2191 grow = READ_ONCE(halt_poll_ns_grow); 2194 grow = READ_ONCE(halt_poll_ns_grow);
2192 /* 10us base */ 2195 if (!grow)
2193 if (val == 0 && grow) 2196 goto out;
2194 val = 10000; 2197
2195 else 2198 val *= grow;
2196 val *= grow; 2199 if (val < grow_start)
2200 val = grow_start;
2197 2201
2198 if (val > halt_poll_ns) 2202 if (val > halt_poll_ns)
2199 val = halt_poll_ns; 2203 val = halt_poll_ns;
2200 2204
2201 vcpu->halt_poll_ns = val; 2205 vcpu->halt_poll_ns = val;
2206out:
2202 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2207 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2203} 2208}
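
The grow_halt_poll_ns() change above replaces the hard-coded 10us starting point with the new halt_poll_ns_grow_start module parameter: the polling window is multiplied by halt_poll_ns_grow, snapped up to the start value if it was below it, and capped at halt_poll_ns. A stand-alone sketch of the same arithmetic, using the module parameters' default values:

#include <stdio.h>

static unsigned int halt_poll_ns = 200000;              /* global cap (200us)       */
static unsigned int halt_poll_ns_grow = 2;              /* growth multiplier        */
static unsigned int halt_poll_ns_grow_start = 10000;    /* floor when growing (10us) */

static unsigned int grow_halt_poll_ns(unsigned int val)
{
	unsigned int grow = halt_poll_ns_grow;

	if (!grow)
		return val;                              /* growing disabled         */

	val *= grow;
	if (val < halt_poll_ns_grow_start)
		val = halt_poll_ns_grow_start;           /* jump straight to the floor */
	if (val > halt_poll_ns)
		val = halt_poll_ns;                      /* never exceed the cap     */
	return val;
}

int main(void)
{
	unsigned int ns = 0;

	for (int i = 0; i < 6; i++) {
		ns = grow_halt_poll_ns(ns);
		printf("poll window: %u ns\n", ns);      /* 10000, 20000, ..., 200000 */
	}
	return 0;
}
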
2204 2209
@@ -2683,7 +2688,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
2683 struct kvm_regs *kvm_regs; 2688 struct kvm_regs *kvm_regs;
2684 2689
2685 r = -ENOMEM; 2690 r = -ENOMEM;
2686 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 2691 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
2687 if (!kvm_regs) 2692 if (!kvm_regs)
2688 goto out; 2693 goto out;
2689 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2694 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
@@ -2711,7 +2716,8 @@ out_free1:
2711 break; 2716 break;
2712 } 2717 }
2713 case KVM_GET_SREGS: { 2718 case KVM_GET_SREGS: {
2714 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 2719 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
2720 GFP_KERNEL_ACCOUNT);
2715 r = -ENOMEM; 2721 r = -ENOMEM;
2716 if (!kvm_sregs) 2722 if (!kvm_sregs)
2717 goto out; 2723 goto out;
@@ -2803,7 +2809,7 @@ out_free1:
2803 break; 2809 break;
2804 } 2810 }
2805 case KVM_GET_FPU: { 2811 case KVM_GET_FPU: {
2806 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2812 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
2807 r = -ENOMEM; 2813 r = -ENOMEM;
2808 if (!fpu) 2814 if (!fpu)
2809 goto out; 2815 goto out;
@@ -2980,7 +2986,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
2980 if (test) 2986 if (test)
2981 return 0; 2987 return 0;
2982 2988
2983 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2989 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
2984 if (!dev) 2990 if (!dev)
2985 return -ENOMEM; 2991 return -ENOMEM;
2986 2992
@@ -3625,6 +3631,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
3625 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3631 r = __kvm_io_bus_write(vcpu, bus, &range, val);
3626 return r < 0 ? r : 0; 3632 return r < 0 ? r : 0;
3627} 3633}
3634EXPORT_SYMBOL_GPL(kvm_io_bus_write);
3628 3635
3629/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3636/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
3630int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3637int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
@@ -3675,7 +3682,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
3675 3682
3676 return -EOPNOTSUPP; 3683 return -EOPNOTSUPP;
3677} 3684}
3678EXPORT_SYMBOL_GPL(kvm_io_bus_write);
3679 3685
3680/* kvm_io_bus_read - called under kvm->slots_lock */ 3686/* kvm_io_bus_read - called under kvm->slots_lock */
3681int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3687int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
@@ -3697,7 +3703,6 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
3697 return r < 0 ? r : 0; 3703 return r < 0 ? r : 0;
3698} 3704}
3699 3705
3700
3701/* Caller must hold slots_lock. */ 3706/* Caller must hold slots_lock. */
3702int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3707int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
3703 int len, struct kvm_io_device *dev) 3708 int len, struct kvm_io_device *dev)
@@ -3714,8 +3719,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
3714 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3719 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
3715 return -ENOSPC; 3720 return -ENOSPC;
3716 3721
3717 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3722 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
3718 sizeof(struct kvm_io_range)), GFP_KERNEL); 3723 GFP_KERNEL_ACCOUNT);
3719 if (!new_bus) 3724 if (!new_bus)
3720 return -ENOMEM; 3725 return -ENOMEM;
3721 3726
@@ -3760,8 +3765,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3760 if (i == bus->dev_count) 3765 if (i == bus->dev_count)
3761 return; 3766 return;
3762 3767
3763 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3768 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
3764 sizeof(struct kvm_io_range)), GFP_KERNEL); 3769 GFP_KERNEL_ACCOUNT);
3765 if (!new_bus) { 3770 if (!new_bus) {
3766 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3771 pr_err("kvm: failed to shrink bus, removing it completely\n");
3767 goto broken; 3772 goto broken;
@@ -4029,7 +4034,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4029 active = kvm_active_vms; 4034 active = kvm_active_vms;
4030 spin_unlock(&kvm_lock); 4035 spin_unlock(&kvm_lock);
4031 4036
4032 env = kzalloc(sizeof(*env), GFP_KERNEL); 4037 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
4033 if (!env) 4038 if (!env)
4034 return; 4039 return;
4035 4040
@@ -4045,7 +4050,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4045 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 4050 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4046 4051
4047 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { 4052 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4048 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); 4053 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
4049 4054
4050 if (p) { 4055 if (p) {
4051 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 4056 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index d99850c462a1..524cbd20379f 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
219 } 219 }
220 } 220 }
221 221
222 kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); 222 kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
223 if (!kvg) { 223 if (!kvg) {
224 mutex_unlock(&kv->lock); 224 mutex_unlock(&kv->lock);
225 kvm_vfio_group_put_external_user(vfio_group); 225 kvm_vfio_group_put_external_user(vfio_group);
@@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
405 if (tmp->ops == &kvm_vfio_ops) 405 if (tmp->ops == &kvm_vfio_ops)
406 return -EBUSY; 406 return -EBUSY;
407 407
408 kv = kzalloc(sizeof(*kv), GFP_KERNEL); 408 kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
409 if (!kv) 409 if (!kv)
410 return -ENOMEM; 410 return -ENOMEM;
411 411