authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-03 16:21:40 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-03 16:21:40 -0400
commitfe489bf4505ae26d3c6d6a1f1d3064c2a9c5cd85 (patch)
tree46596fd7edf7c4da1dafdb2c62011841e71cf32d
parent3e34131a65127e73fbae68c82748f32c8af7e4a4 (diff)
parenta3ff5fbc94a829680d4aa005cd17add1c1a1fb5b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
 "On the x86 side, there are some optimizations and documentation
  updates.  The big ARM/KVM change for 3.11, support for AArch64, will
  come through Catalin Marinas's tree.  s390 and PPC have misc cleanups
  and bugfixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits)
  KVM: PPC: Ignore PIR writes
  KVM: PPC: Book3S PR: Invalidate SLB entries properly
  KVM: PPC: Book3S PR: Allow guest to use 1TB segments
  KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match
  KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry
  KVM: PPC: Book3S PR: Fix proto-VSID calculations
  KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL
  KVM: Fix RTC interrupt coalescing tracking
  kvm: Add a tracepoint write_tsc_offset
  KVM: MMU: Inform users of mmio generation wraparound
  KVM: MMU: document fast invalidate all mmio sptes
  KVM: MMU: document fast invalidate all pages
  KVM: MMU: document fast page fault
  KVM: MMU: document mmio page fault
  KVM: MMU: document write_flooding_count
  KVM: MMU: document clear_spte_count
  KVM: MMU: drop kvm_mmu_zap_mmio_sptes
  KVM: MMU: init kvm generation close to mmio wrap-around value
  KVM: MMU: add tracepoint for check_mmio_spte
  KVM: MMU: fast invalidate all mmio sptes
  ...
-rw-r--r--  Documentation/virtual/kvm/api.txt | 8
-rw-r--r--  Documentation/virtual/kvm/mmu.txt | 91
-rw-r--r--  MAINTAINERS | 4
-rw-r--r--  arch/arm/include/asm/kvm_arm.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 24
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 5
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 13
-rw-r--r--  arch/arm/kvm/Kconfig | 8
-rw-r--r--  arch/arm/kvm/Makefile | 7
-rw-r--r--  arch/arm/kvm/arm.c | 8
-rw-r--r--  arch/arm/kvm/coproc.c | 4
-rw-r--r--  arch/arm/kvm/handle_exit.c | 3
-rw-r--r--  arch/arm/kvm/interrupts.S | 16
-rw-r--r--  arch/arm/kvm/interrupts_head.S | 10
-rw-r--r--  arch/arm/kvm/mmio.c | 6
-rw-r--r--  arch/arm/kvm/mmu.c | 3
-rw-r--r--  arch/arm/kvm/psci.c | 2
-rw-r--r--  arch/arm/kvm/reset.c | 12
-rw-r--r--  arch/ia64/kvm/Makefile | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 6
-rw-r--r--  arch/powerpc/kvm/Makefile | 13
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 81
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c | 21
-rw-r--r--  arch/powerpc/kvm/book3s_64_slb.S | 13
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 3
-rw-r--r--  arch/powerpc/kvm/booke.c | 2
-rw-r--r--  arch/powerpc/kvm/emulate.c | 3
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 18
-rw-r--r--  arch/s390/include/asm/perf_event.h | 10
-rw-r--r--  arch/s390/include/asm/pgtable.h | 83
-rw-r--r--  arch/s390/kernel/asm-offsets.c | 3
-rw-r--r--  arch/s390/kernel/entry64.S | 81
-rw-r--r--  arch/s390/kernel/perf_event.c | 52
-rw-r--r--  arch/s390/kernel/s390_ksyms.c | 1
-rw-r--r--  arch/s390/kvm/Makefile | 3
-rw-r--r--  arch/s390/kvm/diag.c | 3
-rw-r--r--  arch/s390/kvm/intercept.c | 124
-rw-r--r--  arch/s390/kvm/interrupt.c | 18
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 105
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 14
-rw-r--r--  arch/s390/kvm/priv.c | 274
-rw-r--r--  arch/s390/kvm/sigp.c | 19
-rw-r--r--  arch/s390/mm/pgtable.c | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 15
-rw-r--r--  arch/x86/kvm/Makefile | 13
-rw-r--r--  arch/x86/kvm/emulate.c | 391
-rw-r--r--  arch/x86/kvm/lapic.c | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 301
-rw-r--r--  arch/x86/kvm/mmu.h | 18
-rw-r--r--  arch/x86/kvm/mmutrace.h | 76
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 10
-rw-r--r--  arch/x86/kvm/svm.c | 10
-rw-r--r--  arch/x86/kvm/trace.h | 21
-rw-r--r--  arch/x86/kvm/vmx.c | 19
-rw-r--r--  arch/x86/kvm/x86.c | 80
-rw-r--r--  include/kvm/arm_arch_timer.h (renamed from arch/arm/include/asm/kvm_arch_timer.h) | 4
-rw-r--r--  include/kvm/arm_vgic.h (renamed from arch/arm/include/asm/kvm_vgic.h) | 0
-rw-r--r--  include/linux/kvm_host.h | 4
-rw-r--r--  virt/kvm/arm/arch_timer.c (renamed from arch/arm/kvm/arch_timer.c) | 33
-rw-r--r--  virt/kvm/arm/vgic.c (renamed from arch/arm/kvm/vgic.c) | 0
-rw-r--r--  virt/kvm/eventfd.c | 2
-rw-r--r--  virt/kvm/kvm_main.c | 4
62 files changed, 1382 insertions, 807 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9bfadeb8be31..66dd2aa53ba4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2278,7 +2278,7 @@ return indicates the attribute is implemented. It does not necessarily
 indicate that the attribute can be read or written in the device's
 current state. "addr" is ignored.
 
-4.77 KVM_ARM_VCPU_INIT
+4.82 KVM_ARM_VCPU_INIT
 
 Capability: basic
 Architectures: arm, arm64
@@ -2304,7 +2304,7 @@ Possible features:
 	  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 
 
-4.78 KVM_GET_REG_LIST
+4.83 KVM_GET_REG_LIST
 
 Capability: basic
 Architectures: arm, arm64
@@ -2324,7 +2324,7 @@ This ioctl returns the guest registers that are supported for the
 KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
 
 
-4.80 KVM_ARM_SET_DEVICE_ADDR
+4.84 KVM_ARM_SET_DEVICE_ADDR
 
 Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
 Architectures: arm, arm64
@@ -2362,7 +2362,7 @@ must be called after calling KVM_CREATE_IRQCHIP, but before calling
 KVM_RUN on any of the VCPUs.  Calling this ioctl twice for any of the
 base addresses will return -EEXIST.
 
-4.82 KVM_PPC_RTAS_DEFINE_TOKEN
+4.85 KVM_PPC_RTAS_DEFINE_TOKEN
 
 Capability: KVM_CAP_PPC_RTAS
 Architectures: ppc
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 43fcb761ed16..290894176142 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -191,12 +191,12 @@ Shadow pages contain the following information:
     A counter keeping track of how many hardware registers (guest cr3 or
     pdptrs) are now pointing at the page.  While this counter is nonzero, the
     page cannot be destroyed.  See role.invalid.
-  multimapped:
-    Whether there exist multiple sptes pointing at this page.
-  parent_pte/parent_ptes:
-    If multimapped is zero, parent_pte points at the single spte that points at
-    this page's spt.  Otherwise, parent_ptes points at a data structure
-    with a list of parent_ptes.
+  parent_ptes:
+    The reverse mapping for the pte/ptes pointing at this page's spt. If
+    parent_ptes bit 0 is zero, only one spte points at this pages and
+    parent_ptes points at this single spte, otherwise, there exists multiple
+    sptes pointing at this page and (parent_ptes & ~0x1) points at a data
+    structure with a list of parent_ptes.
   unsync:
     If true, then the translations in this page may not match the guest's
     translation.  This is equivalent to the state of the tlb when a pte is
@@ -210,6 +210,24 @@ Shadow pages contain the following information:
     A bitmap indicating which sptes in spt point (directly or indirectly) at
     pages that may be unsynchronized.  Used to quickly locate all unsychronized
     pages reachable from a given page.
+  mmu_valid_gen:
+    Generation number of the page.  It is compared with kvm->arch.mmu_valid_gen
+    during hash table lookup, and used to skip invalidated shadow pages (see
+    "Zapping all pages" below.)
+  clear_spte_count:
+    Only present on 32-bit hosts, where a 64-bit spte cannot be written
+    atomically.  The reader uses this while running out of the MMU lock
+    to detect in-progress updates and retry them until the writer has
+    finished the write.
+  write_flooding_count:
+    A guest may write to a page table many times, causing a lot of
+    emulations if the page needs to be write-protected (see "Synchronized
+    and unsynchronized pages" below).  Leaf pages can be unsynchronized
+    so that they do not trigger frequent emulation, but this is not
+    possible for non-leafs.  This field counts the number of emulations
+    since the last time the page table was actually used; if emulation
+    is triggered too frequently on this page, KVM will unmap the page
+    to avoid emulation in the future.
 
 Reverse map
 ===========
@@ -258,14 +276,26 @@ This is the most complicated event.  The cause of a page fault can be:
 
 Handling a page fault is performed as follows:
 
+ - if the RSV bit of the error code is set, the page fault is caused by guest
+   accessing MMIO and cached MMIO information is available.
+   - walk shadow page table
+   - check for valid generation number in the spte (see "Fast invalidation of
+     MMIO sptes" below)
+   - cache the information to vcpu->arch.mmio_gva, vcpu->arch.access and
+     vcpu->arch.mmio_gfn, and call the emulator
+ - If both P bit and R/W bit of error code are set, this could possibly
+   be handled as a "fast page fault" (fixed without taking the MMU lock).  See
+   the description in Documentation/virtual/kvm/locking.txt.
 - if needed, walk the guest page tables to determine the guest translation
   (gva->gpa or ngpa->gpa)
  - if permissions are insufficient, reflect the fault back to the guest
 - determine the host page
-  - if this is an mmio request, there is no host page; call the emulator
-    to emulate the instruction instead
+  - if this is an mmio request, there is no host page; cache the info to
+    vcpu->arch.mmio_gva, vcpu->arch.access and vcpu->arch.mmio_gfn
 - walk the shadow page table to find the spte for the translation,
   instantiating missing intermediate page tables as necessary
+  - If this is an mmio request, cache the mmio info to the spte and set some
+    reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask)
 - try to unsynchronize the page
   - if successful, we can let the guest continue and modify the gpte
 - emulate the instruction
@@ -351,6 +381,51 @@ causes its write_count to be incremented, thus preventing instantiation of
 a large spte.  The frames at the end of an unaligned memory slot have
 artificially inflated ->write_counts so they can never be instantiated.
 
+Zapping all pages (page generation count)
+=========================================
+
+For the large memory guests, walking and zapping all pages is really slow
+(because there are a lot of pages), and also blocks memory accesses of
+all VCPUs because it needs to hold the MMU lock.
+
+To make it be more scalable, kvm maintains a global generation number
+which is stored in kvm->arch.mmu_valid_gen.  Every shadow page stores
+the current global generation-number into sp->mmu_valid_gen when it
+is created.  Pages with a mismatching generation number are "obsolete".
+
+When KVM need zap all shadow pages sptes, it just simply increases the global
+generation-number then reload root shadow pages on all vcpus.  As the VCPUs
+create new shadow page tables, the old pages are not used because of the
+mismatching generation number.
+
+KVM then walks through all pages and zaps obsolete pages.  While the zap
+operation needs to take the MMU lock, the lock can be released periodically
+so that the VCPUs can make progress.
+
+Fast invalidation of MMIO sptes
+===============================
+
+As mentioned in "Reaction to events" above, kvm will cache MMIO
+information in leaf sptes.  When a new memslot is added or an existing
+memslot is changed, this information may become stale and needs to be
+invalidated.  This also needs to hold the MMU lock while walking all
+shadow pages, and is made more scalable with a similar technique.
+
+MMIO sptes have a few spare bits, which are used to store a
+generation number.  The global generation number is stored in
+kvm_memslots(kvm)->generation, and increased whenever guest memory info
+changes.  This generation number is distinct from the one described in
+the previous section.
+
+When KVM finds an MMIO spte, it checks the generation number of the spte.
+If the generation number of the spte does not equal the global generation
+number, it will ignore the cached MMIO information and handle the page
+fault through the slow path.
+
+Since only 19 bits are used to store generation-number on mmio spte, all
+pages are zapped when there is an overflow.
+
+
 Further reading
 ===============
 
diff --git a/MAINTAINERS b/MAINTAINERS
index c85bf69bb321..60c68fbee64a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4733,10 +4733,10 @@ F:	arch/s390/kvm/
 F:	drivers/s390/kvm/
 
 KERNEL VIRTUAL MACHINE (KVM) FOR ARM
-M:	Christoffer Dall <cdall@cs.columbia.edu>
+M:	Christoffer Dall <christoffer.dall@linaro.org>
 L:	kvmarm@lists.cs.columbia.edu
 W:	http://systems.cs.columbia.edu/projects/kvm-arm
-S:	Maintained
+S:	Supported
 F:	arch/arm/include/uapi/asm/kvm*
 F:	arch/arm/include/asm/kvm*
 F:	arch/arm/kvm/
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 124623e5ef14..64e96960de29 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -135,7 +135,6 @@
 #define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1ULL)
 #define PTRS_PER_S2_PGD	(1ULL << (KVM_PHYS_SHIFT - 30))
 #define S2_PGD_ORDER	get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
-#define S2_PGD_SIZE	(1 << S2_PGD_ORDER)
 
 /* Virtualization Translation Control Register (VTCR) bits */
 #define VTCR_SH0	(3 << 12)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 18d50322a9e2..a2f43ddcc300 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -37,16 +37,18 @@
 #define c5_AIFSR	15	/* Auxilary Instrunction Fault Status R */
 #define c6_DFAR		16	/* Data Fault Address Register */
 #define c6_IFAR		17	/* Instruction Fault Address Register */
-#define c9_L2CTLR	18	/* Cortex A15 L2 Control Register */
-#define c10_PRRR	19	/* Primary Region Remap Register */
-#define c10_NMRR	20	/* Normal Memory Remap Register */
-#define c12_VBAR	21	/* Vector Base Address Register */
-#define c13_CID		22	/* Context ID Register */
-#define c13_TID_URW	23	/* Thread ID, User R/W */
-#define c13_TID_URO	24	/* Thread ID, User R/O */
-#define c13_TID_PRIV	25	/* Thread ID, Privileged */
-#define c14_CNTKCTL	26	/* Timer Control Register (PL1) */
-#define NR_CP15_REGS	27	/* Number of regs (incl. invalid) */
+#define c7_PAR		18	/* Physical Address Register */
+#define c7_PAR_high	19	/* PAR top 32 bits */
+#define c9_L2CTLR	20	/* Cortex A15 L2 Control Register */
+#define c10_PRRR	21	/* Primary Region Remap Register */
+#define c10_NMRR	22	/* Normal Memory Remap Register */
+#define c12_VBAR	23	/* Vector Base Address Register */
+#define c13_CID		24	/* Context ID Register */
+#define c13_TID_URW	25	/* Thread ID, User R/W */
+#define c13_TID_URO	26	/* Thread ID, User R/O */
+#define c13_TID_PRIV	27	/* Thread ID, Privileged */
+#define c14_CNTKCTL	28	/* Timer Control Register (PL1) */
+#define NR_CP15_REGS	29	/* Number of regs (incl. invalid) */
 
 #define ARM_EXCEPTION_RESET	0
 #define ARM_EXCEPTION_UNDEFINED	1
@@ -72,8 +74,6 @@ extern char __kvm_hyp_vector[];
 extern char __kvm_hyp_code_start[];
 extern char __kvm_hyp_code_end[];
 
-extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 82b4babead2c..a464e8d7b6c5 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -65,11 +65,6 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
 	return cpsr_mode > USR_MODE;;
 }
 
-static inline bool kvm_vcpu_reg_is_pc(struct kvm_vcpu *vcpu, int reg)
-{
-	return reg == 15;
-}
-
 static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.fault.hsr;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 57cb786a6203..7d22517d8071 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -23,9 +23,14 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmio.h>
 #include <asm/fpstate.h>
-#include <asm/kvm_arch_timer.h>
+#include <kvm/arm_arch_timer.h>
 
+#if defined(CONFIG_KVM_ARM_MAX_VCPUS)
 #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
+#else
+#define KVM_MAX_VCPUS 0
+#endif
+
 #define KVM_USER_MEM_SLOTS 32
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -38,7 +43,7 @@
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
-#include <asm/kvm_vgic.h>
+#include <kvm/arm_vgic.h>
 
 struct kvm_vcpu;
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
@@ -190,8 +195,8 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
 
-static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr,
-				       unsigned long long pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
+				       phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
 {
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 370e1a8af6ac..ebf5015508b5 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -41,9 +41,9 @@ config KVM_ARM_HOST
 	  Provides host support for ARM processors.
 
 config KVM_ARM_MAX_VCPUS
-	int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
-	default 4 if KVM_ARM_HOST
-	default 0
+	int "Number maximum supported virtual CPUs per VM"
+	depends on KVM_ARM_HOST
+	default 4
 	help
 	  Static number of max supported virtual CPUs per VM.
 
@@ -67,6 +67,4 @@ config KVM_ARM_TIMER
 	---help---
 	  Adds support for the Architected Timers in virtual machines
 
-source drivers/virtio/Kconfig
-
 endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 53c5ed83d16f..d99bee4950e5 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -14,10 +14,11 @@ CFLAGS_mmu.o := -I.
 AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 
-kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+KVM := ../../../virt/kvm
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
-obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
-obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
+obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
+obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index ef1703b9587b..741f66a2edbd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -800,8 +800,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-	unsigned long long boot_pgd_ptr;
-	unsigned long long pgd_ptr;
+	phys_addr_t boot_pgd_ptr;
+	phys_addr_t pgd_ptr;
 	unsigned long hyp_stack_ptr;
 	unsigned long stack_page;
 	unsigned long vector_ptr;
@@ -809,8 +809,8 @@ static void cpu_init_hyp_mode(void *dummy)
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
 
-	boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
-	pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
+	boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+	pgd_ptr = kvm_mmu_get_httbr();
 	stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 8eea97be1ed5..4a5199070430 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -180,6 +180,10 @@ static const struct coproc_reg cp15_regs[] = {
 			NULL, reset_unknown, c6_DFAR },
 	{ CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
 			NULL, reset_unknown, c6_IFAR },
+
+	/* PAR swapped by interrupt.S */
+	{ CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
+
 	/*
 	 * DC{C,I,CI}SW operations:
 	 */
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 3d74a0be47db..df4c82d47ad7 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -52,9 +52,6 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	if (kvm_psci_call(vcpu))
-		return 1;
-
 	kvm_inject_undefined(vcpu);
 	return 1;
 }
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index f7793df62f58..16cd4ba5d7fd 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -49,6 +49,7 @@ __kvm_hyp_code_start:
 ENTRY(__kvm_tlb_flush_vmid_ipa)
 	push	{r2, r3}
 
+	dsb	ishst
 	add	r0, r0, #KVM_VTTBR
 	ldrd	r2, r3, [r0]
 	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
@@ -291,6 +292,7 @@ THUMB(	orr	r2, r2, #PSR_T_BIT	)
 	ldr	r2, =BSYM(panic)
 	msr	ELR_hyp, r2
 	ldr	r0, =\panic_str
+	clrex				@ Clear exclusive monitor
 	eret
 .endm
 
@@ -414,6 +416,10 @@ guest_trap:
 	mrcne	p15, 4, r2, c6, c0, 4	@ HPFAR
 	bne	3f
 
+	/* Preserve PAR */
+	mrrc	p15, 0, r0, r1, c7	@ PAR
+	push	{r0, r1}
+
 	/* Resolve IPA using the xFAR */
 	mcr	p15, 0, r2, c7, c8, 0	@ ATS1CPR
 	isb
@@ -424,13 +430,20 @@ guest_trap:
 	lsl	r2, r2, #4
 	orr	r2, r2, r1, lsl #24
 
+	/* Restore PAR */
+	pop	{r0, r1}
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+
 3:	load_vcpu			@ Load VCPU pointer to r0
 	str	r2, [r0, #VCPU_HPFAR]
 
 1:	mov	r1, #ARM_EXCEPTION_HVC
 	b	__kvm_vcpu_return
 
-4:	pop	{r0, r1, r2}		@ Failed translation, return to guest
+4:	pop	{r0, r1}	@ Failed translation, return to guest
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+	clrex
+	pop	{r0, r1, r2}
 	eret
 
 /*
@@ -456,6 +469,7 @@ switch_to_guest_vfp:
 
 	pop	{r3-r7}
 	pop	{r0-r2}
+	clrex
 	eret
 #endif
 
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index d43cfb5b37c4..6f18695a09cb 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -302,11 +302,14 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 	.endif
 
 	mrc	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mrrc	p15, 0, r4, r5, c7	@ PAR
 
 	.if \store_to_vcpu == 0
-	push	{r2}
+	push	{r2,r4-r5}
 	.else
 	str	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	strd	r4, r5, [r12]
 	.endif
 .endm
 
@@ -319,12 +322,15 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 */
 .macro write_cp15_state read_from_vcpu
 	.if \read_from_vcpu == 0
-	pop	{r2}
+	pop	{r2,r4-r5}
 	.else
 	ldr	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	ldrd	r4, r5, [r12]
 	.endif
 
 	mcr	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mcrr	p15, 0, r4, r5, c7	@ PAR
 
 	.if \read_from_vcpu == 0
 	pop	{r2-r12}
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 72a12f2171b2..b8e06b7a2833 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -86,12 +86,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	sign_extend = kvm_vcpu_dabt_issext(vcpu);
 	rt = kvm_vcpu_dabt_get_rd(vcpu);
 
-	if (kvm_vcpu_reg_is_pc(vcpu, rt)) {
-		/* IO memory trying to read/write pc */
-		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
 	mmio->is_write = is_write;
 	mmio->phys_addr = fault_ipa;
 	mmio->len = len;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 84ba67b982c0..ca6bea4859b4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -382,9 +382,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 	if (!pgd)
 		return -ENOMEM;
 
-	/* stage-2 pgd must be aligned to its size */
-	VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1));
-
 	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 7ee5bb7a3667..86a693a02ba3 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -75,7 +75,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 * kvm_psci_call - handle PSCI call if r0 value is in range
 * @vcpu: Pointer to the VCPU struct
 *
-* Handle PSCI calls from guests through traps from HVC or SMC instructions.
+* Handle PSCI calls from guests through traps from HVC instructions.
 * The calling convention is similar to SMC calls to the secure world where
 * the function number is placed in r0 and this function returns true if the
 * function number specified in r0 is withing the PSCI range, and false
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b80256b554cd..b7840e7aa452 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -27,6 +27,8 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_coproc.h>
 
+#include <kvm/arm_arch_timer.h>
+
 /******************************************************************************
 * Cortex-A15 Reset Values
 */
@@ -37,6 +39,11 @@ static struct kvm_regs a15_regs_reset = {
 	.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
 
+static const struct kvm_irq_level a15_vtimer_irq = {
+	.irq = 27,
+	.level = 1,
+};
+
 
 /*******************************************************************************
 * Exported reset function
@@ -52,6 +59,7 @@ static struct kvm_regs a15_regs_reset = {
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_regs *cpu_reset;
+	const struct kvm_irq_level *cpu_vtimer_irq;
 
 	switch (vcpu->arch.target) {
 	case KVM_ARM_TARGET_CORTEX_A15:
@@ -59,6 +67,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 			return -EINVAL;
 		cpu_reset = &a15_regs_reset;
 		vcpu->arch.midr = read_cpuid_id();
+		cpu_vtimer_irq = &a15_vtimer_irq;
 		break;
 	default:
 		return -ENODEV;
@@ -70,5 +79,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset CP15 registers */
 	kvm_reset_coprocs(vcpu);
 
+	/* Reset arch_timer context */
+	kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
+
 	return 0;
 }
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 1a4053789d01..18e45ec49bbf 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -47,12 +47,13 @@ FORCE : $(obj)/$(offsets-file)
 
 ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
+KVM := ../../../virt/kvm
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o irq_comm.o)
+common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+		$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o
 
 ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
-common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o)
+common-objs += $(KVM)/assigned-dev.o $(KVM)/iommu.o
 endif
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 349ed85c7d61..08891d07aeb6 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -107,8 +107,9 @@ struct kvmppc_vcpu_book3s {
 #define CONTEXT_GUEST		1
 #define CONTEXT_GUEST_END	2
 
-#define VSID_REAL	0x1fffffffffc00000ULL
-#define VSID_BAT	0x1fffffffffb00000ULL
+#define VSID_REAL	0x0fffffffffc00000ULL
+#define VSID_BAT	0x0fffffffffb00000ULL
+#define VSID_1T		0x1000000000000000ULL
 #define VSID_REAL_DR	0x2000000000000000ULL
 #define VSID_REAL_IR	0x4000000000000000ULL
 #define VSID_PR		0x8000000000000000ULL
@@ -123,6 +124,7 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
+extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
 extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
 			struct kvm_vcpu *vcpu, unsigned long addr,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 422de3f4d46c..008cd856c5b5 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -5,9 +5,10 @@
 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
+KVM := ../../../virt/kvm
 
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
-		eventfd.o)
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
+		$(KVM)/eventfd.o
 
 CFLAGS_44x_tlb.o := -I.
 CFLAGS_e500_mmu.o := -I.
@@ -53,7 +54,7 @@ kvm-e500mc-objs := \
 kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
 
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
-	../../../virt/kvm/coalesced_mmio.o \
+	$(KVM)/coalesced_mmio.o \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s_pr.o \
@@ -86,8 +87,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
 	book3s_xics.o
 
 kvm-book3s_64-module-objs := \
-	../../../virt/kvm/kvm_main.o \
-	../../../virt/kvm/eventfd.o \
+	$(KVM)/kvm_main.o \
+	$(KVM)/eventfd.o \
 	powerpc.o \
 	emulate.o \
 	book3s.o \
@@ -111,7 +112,7 @@ kvm-book3s_32-objs := \
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
 kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
-kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b871721c0050..739bfbadb85e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -26,6 +26,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
 
 /* #define DEBUG_MMU */
 
@@ -76,6 +77,24 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
 	return NULL;
 }
 
+static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe)
+{
+	return slbe->tb ? SID_SHIFT_1T : SID_SHIFT;
+}
+
+static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe)
+{
+	return (1ul << kvmppc_slb_sid_shift(slbe)) - 1;
+}
+
+static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr)
+{
+	eaddr &= kvmppc_slb_offset_mask(slb);
+
+	return (eaddr >> VPN_SHIFT) |
+		((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT));
+}
+
 static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 					 bool data)
 {
@@ -85,11 +104,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 	if (!slb)
 		return 0;
 
-	if (slb->tb)
-		return (((u64)eaddr >> 12) & 0xfffffff) |
-			(((u64)slb->vsid) << 28);
-
-	return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16);
+	return kvmppc_slb_calc_vpn(slb, eaddr);
 }
 
 static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
@@ -100,7 +115,8 @@ static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
 static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
 {
 	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
-	return ((eaddr & 0xfffffff) >> p);
+
+	return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
 }
 
 static hva_t kvmppc_mmu_book3s_64_get_pteg(
@@ -109,13 +125,15 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
 	bool second)
 {
 	u64 hash, pteg, htabsize;
-	u32 page;
+	u32 ssize;
 	hva_t r;
+	u64 vpn;
 
-	page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
 	htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
 
-	hash = slbe->vsid ^ page;
+	vpn = kvmppc_slb_calc_vpn(slbe, eaddr);
+	ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M;
+	hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);
 	if (second)
 		hash = ~hash;
 	hash &= ((1ULL << 39ULL) - 1ULL);
@@ -146,7 +164,7 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
 	u64 avpn;
 
 	avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
-	avpn |= slbe->vsid << (28 - p);
+	avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
 
 	if (p < 24)
 		avpn >>= ((80 - p) - 56) - 8;
@@ -167,7 +185,6 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int i;
 	u8 key = 0;
 	bool found = false;
-	bool perm_err = false;
 	int second = 0;
 	ulong mp_ea = vcpu->arch.magic_page_ea;
 
@@ -190,13 +207,15 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	if (!slbe)
 		goto no_seg_found;
 
+	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+	if (slbe->tb)
+		avpn |= SLB_VSID_B_1T;
+
 do_second:
 	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
 	if (kvm_is_error_hva(ptegp))
 		goto no_page_found;
 
-	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
-
 	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
 		printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
 		goto no_page_found;
@@ -219,7 +238,7 @@ do_second:
 			continue;
 
 		/* AVPN compare */
-		if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) {
+		if (HPTE_V_COMPARE(avpn, v)) {
 			u8 pp = (r & HPTE_R_PP) | key;
 			int eaddr_mask = 0xFFF;
 
@@ -248,11 +267,6 @@ do_second:
 			break;
 		}
 
-		if (!gpte->may_read) {
-			perm_err = true;
-			continue;
-		}
-
 		dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
 			"-> 0x%lx\n",
 			eaddr, avpn, gpte->vpage, gpte->raddr);
@@ -281,6 +295,8 @@ do_second:
 		if (pteg[i+1] != oldr)
 			copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
 
+		if (!gpte->may_read)
+			return -EPERM;
 		return 0;
 	} else {
 		dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
@@ -296,13 +312,7 @@ do_second:
 	}
 	}
 
-
 no_page_found:
-
-
-	if (perm_err)
-		return -EPERM;
-
 	return -ENOENT;
 
 no_seg_found:
@@ -334,7 +344,7 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
 	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
 	slbe->esid  = slbe->tb ? esid_1t : esid;
-	slbe->vsid  = rs >> 12;
+	slbe->vsid  = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);
 	slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
 	slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;
 	slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;
@@ -375,6 +385,7 @@ static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
 	struct kvmppc_slb *slbe;
+	u64 seg_size;
 
 	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
@@ -386,8 +397,11 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 	dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
 
 	slbe->valid = false;
+	slbe->orige = 0;
+	slbe->origv = 0;
 
-	kvmppc_mmu_map_segment(vcpu, ea);
+	seg_size = 1ull << kvmppc_slb_sid_shift(slbe);
+	kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);
 }
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
@@ -396,8 +410,11 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 
 	dprintk("KVM MMU: slbia()\n");
 
-	for (i = 1; i < vcpu->arch.slb_nr; i++)
+	for (i = 1; i < vcpu->arch.slb_nr; i++) {
 		vcpu->arch.slb[i].valid = false;
+		vcpu->arch.slb[i].orige = 0;
+		vcpu->arch.slb[i].origv = 0;
+	}
 
 	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
@@ -467,8 +484,14 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
 		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
-		if (slb)
+		if (slb) {
 			gvsid = slb->vsid;
+			if (slb->tb) {
+				gvsid <<= SID_SHIFT_1T - SID_SHIFT;
+				gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
+				gvsid |= VSID_1T;
+			}
+		}
 	}
 
 	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 3a9a1aceb14f..b350d9494b26 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -301,6 +301,23 @@ out:
 	return r;
 }
 
+void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong seg_mask = -seg_size;
+	int i;
+
+	for (i = 1; i < svcpu->slb_max; i++) {
+		if ((svcpu->slb[i].esid & SLB_ESID_V) &&
+		    (svcpu->slb[i].esid & seg_mask) == ea) {
+			/* Invalidate this entry */
+			svcpu->slb[i].esid = 0;
+		}
+	}
+
+	svcpu_put(svcpu);
+}
+
 void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -325,9 +342,9 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 		return -1;
 	vcpu3s->context_id[0] = err;
 
-	vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1)
+	vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1)
 				 << ESID_BITS) - 1;
-	vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << ESID_BITS;
+	vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS;
 	vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;
 
 	kvmppc_mmu_hpte_init(vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 56b983e7b738..4f0caecc0f9d 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -66,10 +66,6 @@ slb_exit_skip_ ## num:
 
 	ld	r12, PACA_SLBSHADOWPTR(r13)
 
-	/* Save off the first entry so we can slbie it later */
-	ld	r10, SHADOW_SLB_ESID(0)(r12)
-	ld	r11, SHADOW_SLB_VSID(0)(r12)
-
 	/* Remove bolted entries */
 	UNBOLT_SLB_ENTRY(0)
 	UNBOLT_SLB_ENTRY(1)
@@ -81,15 +77,10 @@ slb_exit_skip_ ## num:
 
 	/* Flush SLB */
 
+	li	r10, 0
+	slbmte	r10, r10
 	slbia
 
-	/* r0 = esid & ESID_MASK */
-	rldicr  r10, r10, 0, 35
-	/* r0 |= CLASS_BIT(VSID) */
-	rldic   r12, r11, 56 - 36, 36
-	or      r10, r10, r12
-	slbie	r10
-
 	/* Fill SLB with our shadow */
 
 	lbz	r12, SVCPU_SLB_MAX(r3)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index bdc40b8e77d9..19498a567a81 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1239,8 +1239,7 @@ out:
 #ifdef CONFIG_PPC64
 int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 {
-	/* No flags */
-	info->flags = 0;
+	info->flags = KVM_PPC_1T_SEGMENTS;
 
 	/* SLB is always 64 entries */
 	info->slb_size = 64;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1a1b51189773..dcc94f016007 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -796,7 +796,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 		kvmppc_fill_pt_regs(&regs);
 		timer_interrupt(&regs);
 		break;
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+#if defined(CONFIG_PPC_DOORBELL)
 	case BOOKE_INTERRUPT_DOORBELL:
 		kvmppc_fill_pt_regs(&regs);
 		doorbell_exception(&regs);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 631a2650e4e4..2c52ada30775 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -169,6 +169,9 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu->arch.shared->sprg3 = spr_val;
 		break;
 
+	/* PIR can legally be written, but we ignore it */
+	case SPRN_PIR: break;
+
 	default:
 		emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
 						     spr_val);
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 16bd5d169cdb..3238d4004e84 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -62,13 +62,20 @@ struct sca_block {
 #define CPUSTAT_MCDS	0x00000100
 #define CPUSTAT_SM	0x00000080
 #define CPUSTAT_G	0x00000008
+#define CPUSTAT_GED	0x00000004
 #define CPUSTAT_J	0x00000002
 #define CPUSTAT_P	0x00000001
 
 struct kvm_s390_sie_block {
 	atomic_t cpuflags;		/* 0x0000 */
 	__u32	prefix;			/* 0x0004 */
-	__u8	reserved8[32];		/* 0x0008 */
+	__u8	reserved08[4];		/* 0x0008 */
+#define PROG_IN_SIE (1<<0)
+	__u32	prog0c;			/* 0x000c */
+	__u8	reserved10[16];		/* 0x0010 */
+#define PROG_BLOCK_SIE 0x00000001
+	atomic_t prog20;		/* 0x0020 */
+	__u8	reserved24[4];		/* 0x0024 */
 	__u64	cputm;			/* 0x0028 */
 	__u64	ckc;			/* 0x0030 */
 	__u64	epoch;			/* 0x0038 */
@@ -90,7 +97,8 @@ struct kvm_s390_sie_block {
 	__u32	scaoh;			/* 0x005c */
 	__u8	reserved60;		/* 0x0060 */
 	__u8	ecb;			/* 0x0061 */
-	__u8	reserved62[2];		/* 0x0062 */
+	__u8	ecb2;			/* 0x0062 */
+	__u8	reserved63[1];		/* 0x0063 */
 	__u32	scaol;			/* 0x0064 */
 	__u8	reserved68[4];		/* 0x0068 */
 	__u32	todpr;			/* 0x006c */
@@ -130,6 +138,7 @@ struct kvm_vcpu_stat {
 	u32 deliver_program_int;
 	u32 deliver_io_int;
 	u32 exit_wait_state;
+	u32 instruction_pfmf;
 	u32 instruction_stidp;
 	u32 instruction_spx;
 	u32 instruction_stpx;
@@ -166,7 +175,7 @@ struct kvm_s390_ext_info {
 };
 
 #define PGM_OPERATION            0x01
-#define PGM_PRIVILEGED_OPERATION 0x02
+#define PGM_PRIVILEGED_OP	 0x02
 #define PGM_EXECUTE              0x03
 #define PGM_PROTECTION           0x04
 #define PGM_ADDRESSING           0x05
@@ -219,7 +228,7 @@ struct kvm_s390_local_interrupt {
 	atomic_t active;
 	struct kvm_s390_float_interrupt *float_int;
 	int timer_due; /* event indicator for waitqueue below */
-	wait_queue_head_t wq;
+	wait_queue_head_t *wq;
 	atomic_t *cpuflags;
 	unsigned int action_bits;
 };
@@ -266,4 +275,5 @@ struct kvm_arch{
 };
 
 extern int sie64a(struct kvm_s390_sie_block *, u64 *);
+extern char sie_exit;
 #endif
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 5f0173a31693..1141fb3e7b21 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -14,3 +14,13 @@
 /* Per-CPU flags for PMU states */
 #define PMU_F_RESERVED			0x1000
 #define PMU_F_ENABLED			0x2000
+
+#ifdef CONFIG_64BIT
+
+/* Perf callbacks */
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
+#endif /* CONFIG_64BIT */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9aefa3c64eb2..0ea4e591fa78 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -296,18 +296,16 @@ extern unsigned long MODULES_END;
 #define _SEGMENT_ENTRY_EMPTY	(_SEGMENT_ENTRY_INV)
 
 /* Page status table bits for virtualization */
-#define RCP_ACC_BITS	0xf0000000UL
-#define RCP_FP_BIT	0x08000000UL
-#define RCP_PCL_BIT	0x00800000UL
-#define RCP_HR_BIT	0x00400000UL
-#define RCP_HC_BIT	0x00200000UL
-#define RCP_GR_BIT	0x00040000UL
-#define RCP_GC_BIT	0x00020000UL
-#define RCP_IN_BIT	0x00002000UL	/* IPTE notify bit */
-
-/* User dirty / referenced bit for KVM's migration feature */
-#define KVM_UR_BIT	0x00008000UL
-#define KVM_UC_BIT	0x00004000UL
+#define PGSTE_ACC_BITS	0xf0000000UL
+#define PGSTE_FP_BIT	0x08000000UL
+#define PGSTE_PCL_BIT	0x00800000UL
+#define PGSTE_HR_BIT	0x00400000UL
+#define PGSTE_HC_BIT	0x00200000UL
+#define PGSTE_GR_BIT	0x00040000UL
+#define PGSTE_GC_BIT	0x00020000UL
+#define PGSTE_UR_BIT	0x00008000UL
+#define PGSTE_UC_BIT	0x00004000UL	/* user dirty (migration) */
+#define PGSTE_IN_BIT	0x00002000UL	/* IPTE notify bit */
 
 #else /* CONFIG_64BIT */
 
@@ -364,18 +362,16 @@ extern unsigned long MODULES_END;
 			 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
 
 /* Page status table bits for virtualization */
-#define RCP_ACC_BITS	0xf000000000000000UL
-#define RCP_FP_BIT	0x0800000000000000UL
-#define RCP_PCL_BIT	0x0080000000000000UL
-#define RCP_HR_BIT	0x0040000000000000UL
-#define RCP_HC_BIT	0x0020000000000000UL
-#define RCP_GR_BIT	0x0004000000000000UL
-#define RCP_GC_BIT	0x0002000000000000UL
-#define RCP_IN_BIT	0x0000200000000000UL	/* IPTE notify bit */
-
-/* User dirty / referenced bit for KVM's migration feature */
-#define KVM_UR_BIT	0x0000800000000000UL
-#define KVM_UC_BIT	0x0000400000000000UL
+#define PGSTE_ACC_BITS	0xf000000000000000UL
+#define PGSTE_FP_BIT	0x0800000000000000UL
+#define PGSTE_PCL_BIT	0x0080000000000000UL
+#define PGSTE_HR_BIT	0x0040000000000000UL
+#define PGSTE_HC_BIT	0x0020000000000000UL
+#define PGSTE_GR_BIT	0x0004000000000000UL
+#define PGSTE_GC_BIT	0x0002000000000000UL
+#define PGSTE_UR_BIT	0x0000800000000000UL
+#define PGSTE_UC_BIT	0x0000400000000000UL	/* user dirty (migration) */
+#define PGSTE_IN_BIT	0x0000200000000000UL	/* IPTE notify bit */
 
 #endif /* CONFIG_64BIT */
 
@@ -615,8 +611,8 @@ static inline pgste_t pgste_get_lock(pte_t *ptep)
 	asm(
 		"	lg	%0,%2\n"
 		"0:	lgr	%1,%0\n"
-		"	nihh	%0,0xff7f\n"	/* clear RCP_PCL_BIT in old */
-		"	oihh	%1,0x0080\n"	/* set RCP_PCL_BIT in new */
+		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
+		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
 		"	csg	%0,%1,%2\n"
 		"	jl	0b\n"
 		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
@@ -629,7 +625,7 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
 {
 #ifdef CONFIG_PGSTE
 	asm(
-		"	nihh	%1,0xff7f\n"	/* clear RCP_PCL_BIT */
+		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
 		"	stg	%1,%0\n"
 		: "=Q" (ptep[PTRS_PER_PTE])
 		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
@@ -662,14 +658,14 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
 	else if (bits)
 		page_reset_referenced(address);
 	/* Transfer page changed & referenced bit to guest bits in pgste */
-	pgste_val(pgste) |= bits << 48;		/* RCP_GR_BIT & RCP_GC_BIT */
+	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
666 /* Get host changed & referenced bits from pgste */ 662 /* Get host changed & referenced bits from pgste */
667 bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52; 663 bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52;
668 /* Transfer page changed & referenced bit to kvm user bits */ 664 /* Transfer page changed & referenced bit to kvm user bits */
669 pgste_val(pgste) |= bits << 45; /* KVM_UR_BIT & KVM_UC_BIT */ 665 pgste_val(pgste) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */
670 /* Clear relevant host bits in pgste. */ 666 /* Clear relevant host bits in pgste. */
671 pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT); 667 pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT);
672 pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT); 668 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
673 /* Copy page access key and fetch protection bit to pgste */ 669 /* Copy page access key and fetch protection bit to pgste */
674 pgste_val(pgste) |= 670 pgste_val(pgste) |=
675 (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; 671 (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
@@ -690,15 +686,15 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
690 /* Get referenced bit from storage key */ 686 /* Get referenced bit from storage key */
691 young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); 687 young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
692 if (young) 688 if (young)
693 pgste_val(pgste) |= RCP_GR_BIT; 689 pgste_val(pgste) |= PGSTE_GR_BIT;
694 /* Get host referenced bit from pgste */ 690 /* Get host referenced bit from pgste */
695 if (pgste_val(pgste) & RCP_HR_BIT) { 691 if (pgste_val(pgste) & PGSTE_HR_BIT) {
696 pgste_val(pgste) &= ~RCP_HR_BIT; 692 pgste_val(pgste) &= ~PGSTE_HR_BIT;
697 young = 1; 693 young = 1;
698 } 694 }
699 /* Transfer referenced bit to kvm user bits and pte */ 695 /* Transfer referenced bit to kvm user bits and pte */
700 if (young) { 696 if (young) {
701 pgste_val(pgste) |= KVM_UR_BIT; 697 pgste_val(pgste) |= PGSTE_UR_BIT;
702 pte_val(*ptep) |= _PAGE_SWR; 698 pte_val(*ptep) |= _PAGE_SWR;
703 } 699 }
704#endif 700#endif
@@ -720,7 +716,7 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
720 * The guest C/R information is still in the PGSTE, set real 716 * The guest C/R information is still in the PGSTE, set real
721 * key C/R to 0. 717 * key C/R to 0.
722 */ 718 */
723 nkey = (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56; 719 nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
724 page_set_storage_key(address, nkey, 0); 720 page_set_storage_key(address, nkey, 0);
725#endif 721#endif
726} 722}
@@ -750,6 +746,7 @@ struct gmap {
750 struct mm_struct *mm; 746 struct mm_struct *mm;
751 unsigned long *table; 747 unsigned long *table;
752 unsigned long asce; 748 unsigned long asce;
749 void *private;
753 struct list_head crst_list; 750 struct list_head crst_list;
754}; 751};
755 752
@@ -808,8 +805,8 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
808 pte_t *ptep, pgste_t pgste) 805 pte_t *ptep, pgste_t pgste)
809{ 806{
810#ifdef CONFIG_PGSTE 807#ifdef CONFIG_PGSTE
811 if (pgste_val(pgste) & RCP_IN_BIT) { 808 if (pgste_val(pgste) & PGSTE_IN_BIT) {
812 pgste_val(pgste) &= ~RCP_IN_BIT; 809 pgste_val(pgste) &= ~PGSTE_IN_BIT;
813 gmap_do_ipte_notify(mm, addr, ptep); 810 gmap_do_ipte_notify(mm, addr, ptep);
814 } 811 }
815#endif 812#endif
@@ -977,8 +974,8 @@ static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm,
977 if (mm_has_pgste(mm)) { 974 if (mm_has_pgste(mm)) {
978 pgste = pgste_get_lock(ptep); 975 pgste = pgste_get_lock(ptep);
979 pgste = pgste_update_all(ptep, pgste); 976 pgste = pgste_update_all(ptep, pgste);
980 dirty = !!(pgste_val(pgste) & KVM_UC_BIT); 977 dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
981 pgste_val(pgste) &= ~KVM_UC_BIT; 978 pgste_val(pgste) &= ~PGSTE_UC_BIT;
982 pgste_set_unlock(ptep, pgste); 979 pgste_set_unlock(ptep, pgste);
983 return dirty; 980 return dirty;
984 } 981 }
@@ -997,8 +994,8 @@ static inline int ptep_test_and_clear_user_young(struct mm_struct *mm,
997 if (mm_has_pgste(mm)) { 994 if (mm_has_pgste(mm)) {
998 pgste = pgste_get_lock(ptep); 995 pgste = pgste_get_lock(ptep);
999 pgste = pgste_update_young(ptep, pgste); 996 pgste = pgste_update_young(ptep, pgste);
1000 young = !!(pgste_val(pgste) & KVM_UR_BIT); 997 young = !!(pgste_val(pgste) & PGSTE_UR_BIT);
1001 pgste_val(pgste) &= ~KVM_UR_BIT; 998 pgste_val(pgste) &= ~PGSTE_UR_BIT;
1002 pgste_set_unlock(ptep, pgste); 999 pgste_set_unlock(ptep, pgste);
1003 } 1000 }
1004 return young; 1001 return young;
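
The rename above also shows how the user dirty bit is consumed: ptep_test_and_clear_user_dirty() reports PGSTE_UC_BIT and then clears it under the per-PTE lock. A minimal standalone model of that read-and-clear step (plain user-space C, a bare 64-bit word standing in for the real pgste, no locking):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PGSTE_UC_BIT 0x0000400000000000UL   /* user dirty, 64-bit layout as above */

/* Model of ptep_test_and_clear_user_dirty(): report the bit, then clear it. */
static bool test_and_clear_user_dirty(uint64_t *pgste)
{
    bool dirty = (*pgste & PGSTE_UC_BIT) != 0;
    *pgste &= ~PGSTE_UC_BIT;
    return dirty;
}

int main(void)
{
    uint64_t pgste = PGSTE_UC_BIT;
    printf("dirty=%d\n", test_and_clear_user_dirty(&pgste)); /* 1 */
    printf("dirty=%d\n", test_and_clear_user_dirty(&pgste)); /* 0 */
    return 0;
}
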
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index d6de844bc30a..2416138ebd3e 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
7#define ASM_OFFSETS_C 7#define ASM_OFFSETS_C
8 8
9#include <linux/kbuild.h> 9#include <linux/kbuild.h>
10#include <linux/kvm_host.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <asm/cputime.h> 12#include <asm/cputime.h>
12#include <asm/vdso.h> 13#include <asm/vdso.h>
@@ -162,6 +163,8 @@ int main(void)
162 DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb)); 163 DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb));
163 DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); 164 DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb));
164 DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce)); 165 DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce));
166 DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c));
167 DEFINE(__SIE_PROG20, offsetof(struct kvm_s390_sie_block, prog20));
165#endif /* CONFIG_32BIT */ 168#endif /* CONFIG_32BIT */
166 return 0; 169 return 0;
167} 170}
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index bc5864c5148b..1c039d0c24c7 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -47,7 +47,6 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
47 _TIF_MCCK_PENDING) 47 _TIF_MCCK_PENDING)
48_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ 48_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
49 _TIF_SYSCALL_TRACEPOINT) 49 _TIF_SYSCALL_TRACEPOINT)
50_TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
51 50
52#define BASED(name) name-system_call(%r13) 51#define BASED(name) name-system_call(%r13)
53 52
@@ -81,23 +80,27 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
81#endif 80#endif
82 .endm 81 .endm
83 82
84 .macro HANDLE_SIE_INTERCEPT scratch,pgmcheck 83 .macro HANDLE_SIE_INTERCEPT scratch,reason
85#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) 84#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
86 tmhh %r8,0x0001 # interrupting from user ? 85 tmhh %r8,0x0001 # interrupting from user ?
87 jnz .+42 86 jnz .+62
88 lgr \scratch,%r9 87 lgr \scratch,%r9
89 slg \scratch,BASED(.Lsie_loop) 88 slg \scratch,BASED(.Lsie_critical)
90 clg \scratch,BASED(.Lsie_length) 89 clg \scratch,BASED(.Lsie_critical_length)
91 .if \pgmcheck 90 .if \reason==1
92 # Some program interrupts are suppressing (e.g. protection). 91 # Some program interrupts are suppressing (e.g. protection).
93 # We must also check the instruction after SIE in that case. 92 # We must also check the instruction after SIE in that case.
94 # do_protection_exception will rewind to rewind_pad 93 # do_protection_exception will rewind to rewind_pad
95 jh .+22 94 jh .+42
96 .else 95 .else
97 jhe .+22 96 jhe .+42
98 .endif 97 .endif
99 lg %r9,BASED(.Lsie_loop) 98 lg %r14,__SF_EMPTY(%r15) # get control block pointer
100 LPP BASED(.Lhost_id) # set host id 99 LPP __SF_EMPTY+16(%r15) # set host id
100 ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
101 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
102 larl %r9,sie_exit # skip forward to sie_exit
103 mvi __SF_EMPTY+31(%r15),\reason # set exit reason
101#endif 104#endif
102 .endm 105 .endm
103 106
@@ -450,7 +453,7 @@ ENTRY(io_int_handler)
450 lg %r12,__LC_THREAD_INFO 453 lg %r12,__LC_THREAD_INFO
451 larl %r13,system_call 454 larl %r13,system_call
452 lmg %r8,%r9,__LC_IO_OLD_PSW 455 lmg %r8,%r9,__LC_IO_OLD_PSW
453 HANDLE_SIE_INTERCEPT %r14,0 456 HANDLE_SIE_INTERCEPT %r14,2
454 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT 457 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
455 tmhh %r8,0x0001 # interrupting from user? 458 tmhh %r8,0x0001 # interrupting from user?
456 jz io_skip 459 jz io_skip
@@ -603,7 +606,7 @@ ENTRY(ext_int_handler)
603 lg %r12,__LC_THREAD_INFO 606 lg %r12,__LC_THREAD_INFO
604 larl %r13,system_call 607 larl %r13,system_call
605 lmg %r8,%r9,__LC_EXT_OLD_PSW 608 lmg %r8,%r9,__LC_EXT_OLD_PSW
606 HANDLE_SIE_INTERCEPT %r14,0 609 HANDLE_SIE_INTERCEPT %r14,3
607 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT 610 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
608 tmhh %r8,0x0001 # interrupting from user ? 611 tmhh %r8,0x0001 # interrupting from user ?
609 jz ext_skip 612 jz ext_skip
@@ -651,7 +654,7 @@ ENTRY(mcck_int_handler)
651 lg %r12,__LC_THREAD_INFO 654 lg %r12,__LC_THREAD_INFO
652 larl %r13,system_call 655 larl %r13,system_call
653 lmg %r8,%r9,__LC_MCK_OLD_PSW 656 lmg %r8,%r9,__LC_MCK_OLD_PSW
654 HANDLE_SIE_INTERCEPT %r14,0 657 HANDLE_SIE_INTERCEPT %r14,4
655 tm __LC_MCCK_CODE,0x80 # system damage? 658 tm __LC_MCCK_CODE,0x80 # system damage?
656 jo mcck_panic # yes -> rest of mcck code invalid 659 jo mcck_panic # yes -> rest of mcck code invalid
657 lghi %r14,__LC_CPU_TIMER_SAVE_AREA 660 lghi %r14,__LC_CPU_TIMER_SAVE_AREA
@@ -945,56 +948,50 @@ ENTRY(sie64a)
945 stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers 948 stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
946 stg %r2,__SF_EMPTY(%r15) # save control block pointer 949 stg %r2,__SF_EMPTY(%r15) # save control block pointer
947 stg %r3,__SF_EMPTY+8(%r15) # save guest register save area 950 stg %r3,__SF_EMPTY+8(%r15) # save guest register save area
948 xc __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # host id == 0 951 xc __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
949 lmg %r0,%r13,0(%r3) # load guest gprs 0-13 952 lmg %r0,%r13,0(%r3) # load guest gprs 0-13
950# some program checks are suppressing. C code (e.g. do_protection_exception)
951# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
952# instructions in the sie_loop should not cause program interrupts. So
953# lets use a nop (47 00 00 00) as a landing pad.
954# See also HANDLE_SIE_INTERCEPT
955rewind_pad:
956 nop 0
957sie_loop:
958 lg %r14,__LC_THREAD_INFO # pointer thread_info struct
959 tm __TI_flags+7(%r14),_TIF_EXIT_SIE
960 jnz sie_exit
961 lg %r14,__LC_GMAP # get gmap pointer 953 lg %r14,__LC_GMAP # get gmap pointer
962 ltgr %r14,%r14 954 ltgr %r14,%r14
963 jz sie_gmap 955 jz sie_gmap
964 lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce 956 lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce
965sie_gmap: 957sie_gmap:
966 lg %r14,__SF_EMPTY(%r15) # get control block pointer 958 lg %r14,__SF_EMPTY(%r15) # get control block pointer
959 oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
960 tm __SIE_PROG20+3(%r14),1 # last exit...
961 jnz sie_done
967 LPP __SF_EMPTY(%r15) # set guest id 962 LPP __SF_EMPTY(%r15) # set guest id
968 sie 0(%r14) 963 sie 0(%r14)
969sie_done: 964sie_done:
970 LPP __SF_EMPTY+16(%r15) # set host id 965 LPP __SF_EMPTY+16(%r15) # set host id
971 lg %r14,__LC_THREAD_INFO # pointer thread_info struct 966 ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
972sie_exit:
973 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce 967 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
968# some program checks are suppressing. C code (e.g. do_protection_exception)
969# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
 970# instructions between sie64a and sie_done should not cause program
 971# interrupts. So let's use a nop (47 00 00 00) as a landing pad.
972# See also HANDLE_SIE_INTERCEPT
973rewind_pad:
974 nop 0
975 .globl sie_exit
976sie_exit:
974 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area 977 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area
975 stmg %r0,%r13,0(%r14) # save guest gprs 0-13 978 stmg %r0,%r13,0(%r14) # save guest gprs 0-13
976 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers 979 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
977 lghi %r2,0 980 lg %r2,__SF_EMPTY+24(%r15) # return exit reason code
978 br %r14 981 br %r14
979sie_fault: 982sie_fault:
980 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce 983 lghi %r14,-EFAULT
981 lg %r14,__LC_THREAD_INFO # pointer thread_info struct 984 stg %r14,__SF_EMPTY+24(%r15) # set exit reason code
982 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area 985 j sie_exit
983 stmg %r0,%r13,0(%r14) # save guest gprs 0-13
984 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
985 lghi %r2,-EFAULT
986 br %r14
987 986
988 .align 8 987 .align 8
989.Lsie_loop: 988.Lsie_critical:
990 .quad sie_loop 989 .quad sie_gmap
991.Lsie_length: 990.Lsie_critical_length:
992 .quad sie_done - sie_loop 991 .quad sie_done - sie_gmap
993.Lhost_id:
994 .quad 0
995 992
996 EX_TABLE(rewind_pad,sie_fault) 993 EX_TABLE(rewind_pad,sie_fault)
997 EX_TABLE(sie_loop,sie_fault) 994 EX_TABLE(sie_exit,sie_fault)
998#endif 995#endif
999 996
1000 .section .rodata, "a" 997 .section .rodata, "a"
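
The reworked exit path gives sie64a() a single return slot (__SF_EMPTY+24 on the stack frame): it is zeroed on entry, the HANDLE_SIE_INTERCEPT macro stores a small positive reason byte into it, and sie_fault stores -EFAULT. A rough sketch of how a caller would normalize that value, mirroring the rc > 0 / rc < 0 handling in the kvm-s390.c hunk further down (standalone C, not the kernel code itself):

#include <errno.h>
#include <stdio.h>

/* Model of the caller-side handling of sie64a()'s new return value:
 * positive reason codes from host interrupts are treated like a normal
 * interception (0), negative values are real errors. */
static int normalize_sie_rc(long exit_reason_slot)
{
    return exit_reason_slot > 0 ? 0 : (int)exit_reason_slot;
}

int main(void)
{
    printf("%d\n", normalize_sie_rc(0));        /* normal interception */
    printf("%d\n", normalize_sie_rc(2));        /* kicked out by an I/O interrupt */
    printf("%d\n", normalize_sie_rc(-EFAULT));  /* sie_fault path */
    return 0;
}
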
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index f58f37f66824..a6fc037671b1 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16#include <linux/kvm_host.h>
16#include <linux/percpu.h> 17#include <linux/percpu.h>
17#include <linux/export.h> 18#include <linux/export.h>
18#include <asm/irq.h> 19#include <asm/irq.h>
@@ -39,6 +40,57 @@ int perf_num_counters(void)
39} 40}
40EXPORT_SYMBOL(perf_num_counters); 41EXPORT_SYMBOL(perf_num_counters);
41 42
43static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
44{
45 struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
46
47 if (!stack)
48 return NULL;
49
50 return (struct kvm_s390_sie_block *) stack->empty1[0];
51}
52
53static bool is_in_guest(struct pt_regs *regs)
54{
55 unsigned long ip = instruction_pointer(regs);
56
57 if (user_mode(regs))
58 return false;
59
60 return ip == (unsigned long) &sie_exit;
61}
62
63static unsigned long guest_is_user_mode(struct pt_regs *regs)
64{
65 return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE;
66}
67
68static unsigned long instruction_pointer_guest(struct pt_regs *regs)
69{
70 return sie_block(regs)->gpsw.addr & PSW_ADDR_INSN;
71}
72
73unsigned long perf_instruction_pointer(struct pt_regs *regs)
74{
75 return is_in_guest(regs) ? instruction_pointer_guest(regs)
76 : instruction_pointer(regs);
77}
78
79static unsigned long perf_misc_guest_flags(struct pt_regs *regs)
80{
81 return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER
82 : PERF_RECORD_MISC_GUEST_KERNEL;
83}
84
85unsigned long perf_misc_flags(struct pt_regs *regs)
86{
87 if (is_in_guest(regs))
88 return perf_misc_guest_flags(regs);
89
90 return user_mode(regs) ? PERF_RECORD_MISC_USER
91 : PERF_RECORD_MISC_KERNEL;
92}
93
42void perf_event_print_debug(void) 94void perf_event_print_debug(void)
43{ 95{
44 struct cpumf_ctr_info cf_info; 96 struct cpumf_ctr_info cf_info;
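
The new callbacks tag a sample as guest or host by checking whether the interrupted instruction pointer sits on sie_exit, then pick user vs. kernel from the guest PSW. A standalone sketch of that classification (the MISC_* values below are local stand-ins, not the real perf ABI constants):

#include <stdbool.h>
#include <stdio.h>

enum sample_origin { MISC_KERNEL, MISC_USER, MISC_GUEST_KERNEL, MISC_GUEST_USER };

/* Mirrors perf_misc_flags() above: a kernel-mode sample taken on the sie_exit
 * label is attributed to the guest; the guest PSW problem-state bit then
 * decides between guest user and guest kernel. */
static enum sample_origin classify(bool host_user_mode, bool at_sie_exit,
                                   bool guest_problem_state)
{
    if (!host_user_mode && at_sie_exit)
        return guest_problem_state ? MISC_GUEST_USER : MISC_GUEST_KERNEL;
    return host_user_mode ? MISC_USER : MISC_KERNEL;
}

int main(void)
{
    printf("%d\n", classify(false, true, false));  /* guest kernel sample */
    printf("%d\n", classify(false, false, false)); /* host kernel sample */
    return 0;
}
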
diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c
index 9bdbcef1da9e..3bac589844a7 100644
--- a/arch/s390/kernel/s390_ksyms.c
+++ b/arch/s390/kernel/s390_ksyms.c
@@ -7,6 +7,7 @@ EXPORT_SYMBOL(_mcount);
7#endif 7#endif
8#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) 8#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
9EXPORT_SYMBOL(sie64a); 9EXPORT_SYMBOL(sie64a);
10EXPORT_SYMBOL(sie_exit);
10#endif 11#endif
11EXPORT_SYMBOL(memcpy); 12EXPORT_SYMBOL(memcpy);
12EXPORT_SYMBOL(memset); 13EXPORT_SYMBOL(memset);
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 8fe9d65a4585..40b4c6470f88 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -6,7 +6,8 @@
6# it under the terms of the GNU General Public License (version 2 only) 6# it under the terms of the GNU General Public License (version 2 only)
7# as published by the Free Software Foundation. 7# as published by the Free Software Foundation.
8 8
9common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o) 9KVM := ../../../virt/kvm
10common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
10 11
11ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 12ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
12 13
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1c01a9912989..3074475c8ae0 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -132,6 +132,9 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
132{ 132{
133 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; 133 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
134 134
135 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
136 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
137
135 trace_kvm_s390_handle_diag(vcpu, code); 138 trace_kvm_s390_handle_diag(vcpu, code);
136 switch (code) { 139 switch (code) {
137 case 0x10: 140 case 0x10:
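
This is the first of many handlers in the series that gain the same guard: if the guest PSW is in problem state, inject a privileged-operation program interrupt instead of emulating. A standalone sketch of the guard (the bit value and the require_supervisor_state() helper are illustrative, not kernel definitions):

#include <stdio.h>

#define PSTATE_BIT 0x0001000000000000ULL   /* stand-in for PSW_MASK_PSTATE */
#define PGM_PRIVILEGED_OP 0x02             /* matches the renamed constant above */

struct vcpu_model { unsigned long long psw_mask; };

/* Return the program interruption code the caller would inject, or 0. */
static int require_supervisor_state(const struct vcpu_model *vcpu)
{
    if (vcpu->psw_mask & PSTATE_BIT)
        return PGM_PRIVILEGED_OP;
    return 0;
}

int main(void)
{
    struct vcpu_model problem = { PSTATE_BIT }, supervisor = { 0 };
    printf("problem state -> pgm 0x%x\n", require_supervisor_state(&problem));
    printf("supervisor    -> pgm 0x%x\n", require_supervisor_state(&supervisor));
    return 0;
}
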
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index b7d1b2edeeb3..5ee56e5acc23 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -22,87 +22,6 @@
22#include "trace.h" 22#include "trace.h"
23#include "trace-s390.h" 23#include "trace-s390.h"
24 24
25static int handle_lctlg(struct kvm_vcpu *vcpu)
26{
27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
29 u64 useraddr;
30 int reg, rc;
31
32 vcpu->stat.instruction_lctlg++;
33
34 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
35
36 if (useraddr & 7)
37 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
38
39 reg = reg1;
40
41 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
42 useraddr);
43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
44
45 do {
46 rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
47 (u64 __user *) useraddr);
48 if (rc)
49 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
50 useraddr += 8;
51 if (reg == reg3)
52 break;
53 reg = (reg + 1) % 16;
54 } while (1);
55 return 0;
56}
57
58static int handle_lctl(struct kvm_vcpu *vcpu)
59{
60 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
61 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
62 u64 useraddr;
63 u32 val = 0;
64 int reg, rc;
65
66 vcpu->stat.instruction_lctl++;
67
68 useraddr = kvm_s390_get_base_disp_rs(vcpu);
69
70 if (useraddr & 3)
71 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
72
73 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
74 useraddr);
75 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
76
77 reg = reg1;
78 do {
79 rc = get_guest(vcpu, val, (u32 __user *) useraddr);
80 if (rc)
81 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
82 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
83 vcpu->arch.sie_block->gcr[reg] |= val;
84 useraddr += 4;
85 if (reg == reg3)
86 break;
87 reg = (reg + 1) % 16;
88 } while (1);
89 return 0;
90}
91
92static const intercept_handler_t eb_handlers[256] = {
93 [0x2f] = handle_lctlg,
94 [0x8a] = kvm_s390_handle_priv_eb,
95};
96
97static int handle_eb(struct kvm_vcpu *vcpu)
98{
99 intercept_handler_t handler;
100
101 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
102 if (handler)
103 return handler(vcpu);
104 return -EOPNOTSUPP;
105}
106 25
107static const intercept_handler_t instruction_handlers[256] = { 26static const intercept_handler_t instruction_handlers[256] = {
108 [0x01] = kvm_s390_handle_01, 27 [0x01] = kvm_s390_handle_01,
@@ -110,10 +29,10 @@ static const intercept_handler_t instruction_handlers[256] = {
110 [0x83] = kvm_s390_handle_diag, 29 [0x83] = kvm_s390_handle_diag,
111 [0xae] = kvm_s390_handle_sigp, 30 [0xae] = kvm_s390_handle_sigp,
112 [0xb2] = kvm_s390_handle_b2, 31 [0xb2] = kvm_s390_handle_b2,
113 [0xb7] = handle_lctl, 32 [0xb7] = kvm_s390_handle_lctl,
114 [0xb9] = kvm_s390_handle_b9, 33 [0xb9] = kvm_s390_handle_b9,
115 [0xe5] = kvm_s390_handle_e5, 34 [0xe5] = kvm_s390_handle_e5,
116 [0xeb] = handle_eb, 35 [0xeb] = kvm_s390_handle_eb,
117}; 36};
118 37
119static int handle_noop(struct kvm_vcpu *vcpu) 38static int handle_noop(struct kvm_vcpu *vcpu)
@@ -174,47 +93,12 @@ static int handle_stop(struct kvm_vcpu *vcpu)
174 93
175static int handle_validity(struct kvm_vcpu *vcpu) 94static int handle_validity(struct kvm_vcpu *vcpu)
176{ 95{
177 unsigned long vmaddr;
178 int viwhy = vcpu->arch.sie_block->ipb >> 16; 96 int viwhy = vcpu->arch.sie_block->ipb >> 16;
179 int rc;
180 97
181 vcpu->stat.exit_validity++; 98 vcpu->stat.exit_validity++;
182 trace_kvm_s390_intercept_validity(vcpu, viwhy); 99 trace_kvm_s390_intercept_validity(vcpu, viwhy);
183 if (viwhy == 0x37) { 100 WARN_ONCE(true, "kvm: unhandled validity intercept 0x%x\n", viwhy);
184 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, 101 return -EOPNOTSUPP;
185 vcpu->arch.gmap);
186 if (IS_ERR_VALUE(vmaddr)) {
187 rc = -EOPNOTSUPP;
188 goto out;
189 }
190 rc = fault_in_pages_writeable((char __user *) vmaddr,
191 PAGE_SIZE);
192 if (rc) {
193 /* user will receive sigsegv, exit to user */
194 rc = -EOPNOTSUPP;
195 goto out;
196 }
197 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix + PAGE_SIZE,
198 vcpu->arch.gmap);
199 if (IS_ERR_VALUE(vmaddr)) {
200 rc = -EOPNOTSUPP;
201 goto out;
202 }
203 rc = fault_in_pages_writeable((char __user *) vmaddr,
204 PAGE_SIZE);
205 if (rc) {
206 /* user will receive sigsegv, exit to user */
207 rc = -EOPNOTSUPP;
208 goto out;
209 }
210 } else
211 rc = -EOPNOTSUPP;
212
213out:
214 if (rc)
215 VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
216 viwhy);
217 return rc;
218} 102}
219 103
220static int handle_instruction(struct kvm_vcpu *vcpu) 104static int handle_instruction(struct kvm_vcpu *vcpu)
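
With handle_lctl and handle_eb moved to priv.c, intercept.c keeps only the dispatch step: one opcode byte indexes a 256-entry handler table, and a missing entry falls through to -EOPNOTSUPP so the exit reaches userspace. A standalone model of that dispatch (the names and the sample handler are illustrative):

#include <errno.h>
#include <stdio.h>

typedef int (*intercept_handler)(void);

static int handle_b2(void) { return 0; }

/* Sparse table indexed by the instruction's opcode byte, as above. */
static const intercept_handler handlers[256] = {
    [0xb2] = handle_b2,
};

static int dispatch(unsigned char opcode)
{
    intercept_handler h = handlers[opcode];
    return h ? h() : -EOPNOTSUPP;
}

int main(void)
{
    printf("0xb2 -> %d\n", dispatch(0xb2));   /* handled in the kernel */
    printf("0xb3 -> %d\n", dispatch(0xb3));   /* -EOPNOTSUPP: go to userspace */
    return 0;
}
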
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5c948177529e..7f35cb33e510 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -438,7 +438,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
438no_timer: 438no_timer:
439 spin_lock(&vcpu->arch.local_int.float_int->lock); 439 spin_lock(&vcpu->arch.local_int.float_int->lock);
440 spin_lock_bh(&vcpu->arch.local_int.lock); 440 spin_lock_bh(&vcpu->arch.local_int.lock);
441 add_wait_queue(&vcpu->arch.local_int.wq, &wait); 441 add_wait_queue(&vcpu->wq, &wait);
442 while (list_empty(&vcpu->arch.local_int.list) && 442 while (list_empty(&vcpu->arch.local_int.list) &&
443 list_empty(&vcpu->arch.local_int.float_int->list) && 443 list_empty(&vcpu->arch.local_int.float_int->list) &&
444 (!vcpu->arch.local_int.timer_due) && 444 (!vcpu->arch.local_int.timer_due) &&
@@ -452,7 +452,7 @@ no_timer:
452 } 452 }
453 __unset_cpu_idle(vcpu); 453 __unset_cpu_idle(vcpu);
454 __set_current_state(TASK_RUNNING); 454 __set_current_state(TASK_RUNNING);
455 remove_wait_queue(&vcpu->arch.local_int.wq, &wait); 455 remove_wait_queue(&vcpu->wq, &wait);
456 spin_unlock_bh(&vcpu->arch.local_int.lock); 456 spin_unlock_bh(&vcpu->arch.local_int.lock);
457 spin_unlock(&vcpu->arch.local_int.float_int->lock); 457 spin_unlock(&vcpu->arch.local_int.float_int->lock);
458 hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); 458 hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
@@ -465,8 +465,8 @@ void kvm_s390_tasklet(unsigned long parm)
465 465
466 spin_lock(&vcpu->arch.local_int.lock); 466 spin_lock(&vcpu->arch.local_int.lock);
467 vcpu->arch.local_int.timer_due = 1; 467 vcpu->arch.local_int.timer_due = 1;
468 if (waitqueue_active(&vcpu->arch.local_int.wq)) 468 if (waitqueue_active(&vcpu->wq))
469 wake_up_interruptible(&vcpu->arch.local_int.wq); 469 wake_up_interruptible(&vcpu->wq);
470 spin_unlock(&vcpu->arch.local_int.lock); 470 spin_unlock(&vcpu->arch.local_int.lock);
471} 471}
472 472
@@ -613,7 +613,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
613 spin_lock_bh(&li->lock); 613 spin_lock_bh(&li->lock);
614 list_add(&inti->list, &li->list); 614 list_add(&inti->list, &li->list);
615 atomic_set(&li->active, 1); 615 atomic_set(&li->active, 1);
616 BUG_ON(waitqueue_active(&li->wq)); 616 BUG_ON(waitqueue_active(li->wq));
617 spin_unlock_bh(&li->lock); 617 spin_unlock_bh(&li->lock);
618 return 0; 618 return 0;
619} 619}
@@ -746,8 +746,8 @@ int kvm_s390_inject_vm(struct kvm *kvm,
746 li = fi->local_int[sigcpu]; 746 li = fi->local_int[sigcpu];
747 spin_lock_bh(&li->lock); 747 spin_lock_bh(&li->lock);
748 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 748 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
749 if (waitqueue_active(&li->wq)) 749 if (waitqueue_active(li->wq))
750 wake_up_interruptible(&li->wq); 750 wake_up_interruptible(li->wq);
751 spin_unlock_bh(&li->lock); 751 spin_unlock_bh(&li->lock);
752 spin_unlock(&fi->lock); 752 spin_unlock(&fi->lock);
753 mutex_unlock(&kvm->lock); 753 mutex_unlock(&kvm->lock);
@@ -832,8 +832,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
832 if (inti->type == KVM_S390_SIGP_STOP) 832 if (inti->type == KVM_S390_SIGP_STOP)
833 li->action_bits |= ACTION_STOP_ON_STOP; 833 li->action_bits |= ACTION_STOP_ON_STOP;
834 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 834 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
835 if (waitqueue_active(&li->wq)) 835 if (waitqueue_active(&vcpu->wq))
836 wake_up_interruptible(&vcpu->arch.local_int.wq); 836 wake_up_interruptible(&vcpu->wq);
837 spin_unlock_bh(&li->lock); 837 spin_unlock_bh(&li->lock);
838 mutex_unlock(&vcpu->kvm->lock); 838 mutex_unlock(&vcpu->kvm->lock);
839 return 0; 839 return 0;
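
The local interrupt structure now holds only a pointer to the vcpu's common wait queue (the assignment lives in the kvm-s390.c hunk below), so wakeups from the injection paths and from the timer tasklet reach the same sleeper. A standalone sketch of that aliasing (all types here are stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct waitqueue_model { bool has_waiter; int wakeups; };
struct local_int_model { struct waitqueue_model *wq; };
struct vcpu_model { struct waitqueue_model wq; struct local_int_model local_int; };

static void wake_up(struct waitqueue_model *wq)
{
    if (wq->has_waiter)
        wq->wakeups++;
}

int main(void)
{
    struct vcpu_model vcpu = { { true, 0 }, { 0 } };
    vcpu.local_int.wq = &vcpu.wq;            /* mirrors local_int.wq = &vcpu->wq */
    wake_up(vcpu.local_int.wq);              /* interrupt injection path */
    wake_up(&vcpu.wq);                       /* timer tasklet path */
    printf("wakeups=%d\n", vcpu.wq.wakeups); /* 2 -- both paths hit one queue */
    return 0;
}
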
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c1c7c683fa26..ba694d2ba51e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -59,6 +59,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
59 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, 59 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
60 { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, 60 { "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
61 { "exit_wait_state", VCPU_STAT(exit_wait_state) }, 61 { "exit_wait_state", VCPU_STAT(exit_wait_state) },
62 { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
62 { "instruction_stidp", VCPU_STAT(instruction_stidp) }, 63 { "instruction_stidp", VCPU_STAT(instruction_stidp) },
63 { "instruction_spx", VCPU_STAT(instruction_spx) }, 64 { "instruction_spx", VCPU_STAT(instruction_spx) },
64 { "instruction_stpx", VCPU_STAT(instruction_stpx) }, 65 { "instruction_stpx", VCPU_STAT(instruction_stpx) },
@@ -84,6 +85,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
84}; 85};
85 86
86static unsigned long long *facilities; 87static unsigned long long *facilities;
88static struct gmap_notifier gmap_notifier;
87 89
88/* Section: not file related */ 90/* Section: not file related */
89int kvm_arch_hardware_enable(void *garbage) 91int kvm_arch_hardware_enable(void *garbage)
@@ -96,13 +98,18 @@ void kvm_arch_hardware_disable(void *garbage)
96{ 98{
97} 99}
98 100
101static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
102
99int kvm_arch_hardware_setup(void) 103int kvm_arch_hardware_setup(void)
100{ 104{
105 gmap_notifier.notifier_call = kvm_gmap_notifier;
106 gmap_register_ipte_notifier(&gmap_notifier);
101 return 0; 107 return 0;
102} 108}
103 109
104void kvm_arch_hardware_unsetup(void) 110void kvm_arch_hardware_unsetup(void)
105{ 111{
112 gmap_unregister_ipte_notifier(&gmap_notifier);
106} 113}
107 114
108void kvm_arch_check_processor_compat(void *rtn) 115void kvm_arch_check_processor_compat(void *rtn)
@@ -239,6 +246,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
239 kvm->arch.gmap = gmap_alloc(current->mm); 246 kvm->arch.gmap = gmap_alloc(current->mm);
240 if (!kvm->arch.gmap) 247 if (!kvm->arch.gmap)
241 goto out_nogmap; 248 goto out_nogmap;
249 kvm->arch.gmap->private = kvm;
242 } 250 }
243 251
244 kvm->arch.css_support = 0; 252 kvm->arch.css_support = 0;
@@ -270,7 +278,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
270 278
271 free_page((unsigned long)(vcpu->arch.sie_block)); 279 free_page((unsigned long)(vcpu->arch.sie_block));
272 kvm_vcpu_uninit(vcpu); 280 kvm_vcpu_uninit(vcpu);
273 kfree(vcpu); 281 kmem_cache_free(kvm_vcpu_cache, vcpu);
274} 282}
275 283
276static void kvm_free_vcpus(struct kvm *kvm) 284static void kvm_free_vcpus(struct kvm *kvm)
@@ -309,6 +317,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
309 vcpu->arch.gmap = gmap_alloc(current->mm); 317 vcpu->arch.gmap = gmap_alloc(current->mm);
310 if (!vcpu->arch.gmap) 318 if (!vcpu->arch.gmap)
311 return -ENOMEM; 319 return -ENOMEM;
320 vcpu->arch.gmap->private = vcpu->kvm;
312 return 0; 321 return 0;
313 } 322 }
314 323
@@ -373,8 +382,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
373{ 382{
374 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 383 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
375 CPUSTAT_SM | 384 CPUSTAT_SM |
376 CPUSTAT_STOPPED); 385 CPUSTAT_STOPPED |
386 CPUSTAT_GED);
377 vcpu->arch.sie_block->ecb = 6; 387 vcpu->arch.sie_block->ecb = 6;
388 vcpu->arch.sie_block->ecb2 = 8;
378 vcpu->arch.sie_block->eca = 0xC1002001U; 389 vcpu->arch.sie_block->eca = 0xC1002001U;
379 vcpu->arch.sie_block->fac = (int) (long) facilities; 390 vcpu->arch.sie_block->fac = (int) (long) facilities;
380 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 391 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -397,7 +408,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
397 408
398 rc = -ENOMEM; 409 rc = -ENOMEM;
399 410
400 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 411 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
401 if (!vcpu) 412 if (!vcpu)
402 goto out; 413 goto out;
403 414
@@ -427,7 +438,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
427 vcpu->arch.local_int.float_int = &kvm->arch.float_int; 438 vcpu->arch.local_int.float_int = &kvm->arch.float_int;
428 spin_lock(&kvm->arch.float_int.lock); 439 spin_lock(&kvm->arch.float_int.lock);
429 kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; 440 kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
430 init_waitqueue_head(&vcpu->arch.local_int.wq); 441 vcpu->arch.local_int.wq = &vcpu->wq;
431 vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; 442 vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
432 spin_unlock(&kvm->arch.float_int.lock); 443 spin_unlock(&kvm->arch.float_int.lock);
433 444
@@ -442,7 +453,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
442out_free_sie_block: 453out_free_sie_block:
443 free_page((unsigned long)(vcpu->arch.sie_block)); 454 free_page((unsigned long)(vcpu->arch.sie_block));
444out_free_cpu: 455out_free_cpu:
445 kfree(vcpu); 456 kmem_cache_free(kvm_vcpu_cache, vcpu);
446out: 457out:
447 return ERR_PTR(rc); 458 return ERR_PTR(rc);
448} 459}
@@ -454,6 +465,50 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
454 return 0; 465 return 0;
455} 466}
456 467
468void s390_vcpu_block(struct kvm_vcpu *vcpu)
469{
470 atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
471}
472
473void s390_vcpu_unblock(struct kvm_vcpu *vcpu)
474{
475 atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
476}
477
478/*
479 * Kick a guest cpu out of SIE and wait until SIE is not running.
480 * If the CPU is not running (e.g. waiting as idle) the function will
481 * return immediately. */
482void exit_sie(struct kvm_vcpu *vcpu)
483{
484 atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
485 while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
486 cpu_relax();
487}
488
489/* Kick a guest cpu out of SIE and prevent SIE-reentry */
490void exit_sie_sync(struct kvm_vcpu *vcpu)
491{
492 s390_vcpu_block(vcpu);
493 exit_sie(vcpu);
494}
495
496static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
497{
498 int i;
499 struct kvm *kvm = gmap->private;
500 struct kvm_vcpu *vcpu;
501
502 kvm_for_each_vcpu(i, vcpu, kvm) {
503 /* match against both prefix pages */
504 if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) {
505 VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
506 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
507 exit_sie_sync(vcpu);
508 }
509 }
510}
511
457int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 512int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
458{ 513{
459 /* kvm common code refers to this, but never calls it */ 514 /* kvm common code refers to this, but never calls it */
@@ -606,6 +661,27 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
606 return -EINVAL; /* not implemented yet */ 661 return -EINVAL; /* not implemented yet */
607} 662}
608 663
664static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
665{
666 /*
667 * We use MMU_RELOAD just to re-arm the ipte notifier for the
668 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
669 * This ensures that the ipte instruction for this request has
670 * already finished. We might race against a second unmapper that
 671 * wants to set the blocking bit. Let's just retry the request loop.
672 */
673 while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
674 int rc;
675 rc = gmap_ipte_notify(vcpu->arch.gmap,
676 vcpu->arch.sie_block->prefix,
677 PAGE_SIZE * 2);
678 if (rc)
679 return rc;
680 s390_vcpu_unblock(vcpu);
681 }
682 return 0;
683}
684
609static int __vcpu_run(struct kvm_vcpu *vcpu) 685static int __vcpu_run(struct kvm_vcpu *vcpu)
610{ 686{
611 int rc; 687 int rc;
@@ -621,6 +697,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
621 if (!kvm_is_ucontrol(vcpu->kvm)) 697 if (!kvm_is_ucontrol(vcpu->kvm))
622 kvm_s390_deliver_pending_interrupts(vcpu); 698 kvm_s390_deliver_pending_interrupts(vcpu);
623 699
700 rc = kvm_s390_handle_requests(vcpu);
701 if (rc)
702 return rc;
703
624 vcpu->arch.sie_block->icptcode = 0; 704 vcpu->arch.sie_block->icptcode = 0;
625 preempt_disable(); 705 preempt_disable();
626 kvm_guest_enter(); 706 kvm_guest_enter();
@@ -630,7 +710,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
630 trace_kvm_s390_sie_enter(vcpu, 710 trace_kvm_s390_sie_enter(vcpu,
631 atomic_read(&vcpu->arch.sie_block->cpuflags)); 711 atomic_read(&vcpu->arch.sie_block->cpuflags));
632 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); 712 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
633 if (rc) { 713 if (rc > 0)
714 rc = 0;
715 if (rc < 0) {
634 if (kvm_is_ucontrol(vcpu->kvm)) { 716 if (kvm_is_ucontrol(vcpu->kvm)) {
635 rc = SIE_INTERCEPT_UCONTROL; 717 rc = SIE_INTERCEPT_UCONTROL;
636 } else { 718 } else {
@@ -1046,7 +1128,7 @@ static int __init kvm_s390_init(void)
1046 return -ENOMEM; 1128 return -ENOMEM;
1047 } 1129 }
1048 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 1130 memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
1049 facilities[0] &= 0xff00fff3f47c0000ULL; 1131 facilities[0] &= 0xff82fff3f47c0000ULL;
1050 facilities[1] &= 0x001c000000000000ULL; 1132 facilities[1] &= 0x001c000000000000ULL;
1051 return 0; 1133 return 0;
1052} 1134}
@@ -1059,3 +1141,12 @@ static void __exit kvm_s390_exit(void)
1059 1141
1060module_init(kvm_s390_init); 1142module_init(kvm_s390_init);
1061module_exit(kvm_s390_exit); 1143module_exit(kvm_s390_exit);
1144
1145/*
1146 * Enable autoloading of the kvm module.
1147 * Note that we add the module alias here instead of virt/kvm/kvm_main.c
1148 * since x86 takes a different approach.
1149 */
1150#include <linux/miscdevice.h>
1151MODULE_ALIAS_MISCDEV(KVM_MINOR);
1152MODULE_ALIAS("devname:kvm");
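
The new block/kick helpers pair with the entry64.S changes above: s390_vcpu_block() sets PROG_BLOCK_SIE in prog20, exit_sie() requests a stop interrupt and spins until the in-SIE bit in prog0c clears, and the assembly refuses to re-enter SIE while the block bit is set. A single-threaded standalone model of that flag handshake (the PROG_IN_SIE value is assumed to be the low bit here; the real constant is defined in the header, outside this hunk):

#include <stdbool.h>
#include <stdio.h>

#define PROG_IN_SIE     0x1   /* assumed low bit of prog0c: set while running SIE */
#define PROG_BLOCK_SIE  0x1   /* low bit of prog20, as defined above */

struct sie_block_model {
    unsigned int prog0c;
    unsigned int prog20;
};

/* Entry side: refuse to enter SIE while blocked, otherwise mark "in SIE". */
static bool try_enter_sie(struct sie_block_model *sb)
{
    if (sb->prog20 & PROG_BLOCK_SIE)
        return false;               /* tm __SIE_PROG20+3: jump straight to sie_done */
    sb->prog0c |= PROG_IN_SIE;      /* oi __SIE_PROG0C+3(%r14),1 */
    return true;
}

/* Exit side: clear the in-SIE marker, as the interrupt macros and sie_done do. */
static void leave_sie(struct sie_block_model *sb)
{
    sb->prog0c &= ~PROG_IN_SIE;     /* ni __SIE_PROG0C+3(%r14),0xfe */
}

int main(void)
{
    struct sie_block_model sb = { 0, 0 };
    printf("enter: %d\n", try_enter_sie(&sb));  /* 1 */
    leave_sie(&sb);
    sb.prog20 |= PROG_BLOCK_SIE;                /* s390_vcpu_block() */
    printf("enter: %d\n", try_enter_sie(&sb));  /* 0 -- blocked until unblock */
    return 0;
}
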
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index efc14f687265..028ca9fd2158 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -63,6 +63,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
63{ 63{
64 vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; 64 vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
65 vcpu->arch.sie_block->ihcpu = 0xffff; 65 vcpu->arch.sie_block->ihcpu = 0xffff;
66 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
66} 67}
67 68
68static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) 69static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
@@ -85,6 +86,12 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
85 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 86 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
86} 87}
87 88
89static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2)
90{
91 *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
92 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
93}
94
88static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) 95static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
89{ 96{
90 u32 base2 = vcpu->arch.sie_block->ipb >> 28; 97 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
@@ -125,7 +132,8 @@ int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
125int kvm_s390_handle_01(struct kvm_vcpu *vcpu); 132int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
126int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); 133int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
127int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); 134int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
128int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu); 135int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
136int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
129 137
130/* implemented in sigp.c */ 138/* implemented in sigp.c */
131int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 139int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -133,6 +141,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
133/* implemented in kvm-s390.c */ 141/* implemented in kvm-s390.c */
134int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 142int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
135 unsigned long addr); 143 unsigned long addr);
144void s390_vcpu_block(struct kvm_vcpu *vcpu);
145void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
146void exit_sie(struct kvm_vcpu *vcpu);
147void exit_sie_sync(struct kvm_vcpu *vcpu);
136/* implemented in diag.c */ 148/* implemented in diag.c */
137int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 149int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
138 150
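
kvm_s390_get_regs_rre() pulls the two register numbers of an RRE-format instruction out of the interception parameter word. The same masks and shifts, as a standalone check with a sample ipb value:

#include <stdint.h>
#include <stdio.h>

/* Standalone copy of the bit extraction in kvm_s390_get_regs_rre(). */
static void get_regs_rre(uint32_t ipb, int *r1, int *r2)
{
    *r1 = (ipb & 0x00f00000) >> 20;
    *r2 = (ipb & 0x000f0000) >> 16;
}

int main(void)
{
    int r1, r2;
    get_regs_rre(0x00340000, &r1, &r2);   /* sample ipb: expects r1 = 3, r2 = 4 */
    printf("r1=%d r2=%d\n", r1, r2);
    return 0;
}
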
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 6bbd7b5a0bbe..0da3e6eb6be6 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * handling privileged instructions 2 * handling privileged instructions
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008, 2013
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -20,6 +20,9 @@
20#include <asm/debug.h> 20#include <asm/debug.h>
21#include <asm/ebcdic.h> 21#include <asm/ebcdic.h>
22#include <asm/sysinfo.h> 22#include <asm/sysinfo.h>
23#include <asm/pgtable.h>
24#include <asm/pgalloc.h>
25#include <asm/io.h>
23#include <asm/ptrace.h> 26#include <asm/ptrace.h>
24#include <asm/compat.h> 27#include <asm/compat.h>
25#include "gaccess.h" 28#include "gaccess.h"
@@ -34,6 +37,9 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
34 37
35 vcpu->stat.instruction_spx++; 38 vcpu->stat.instruction_spx++;
36 39
40 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
41 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
42
37 operand2 = kvm_s390_get_base_disp_s(vcpu); 43 operand2 = kvm_s390_get_base_disp_s(vcpu);
38 44
39 /* must be word boundary */ 45 /* must be word boundary */
@@ -65,6 +71,9 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
65 71
66 vcpu->stat.instruction_stpx++; 72 vcpu->stat.instruction_stpx++;
67 73
74 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
75 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
76
68 operand2 = kvm_s390_get_base_disp_s(vcpu); 77 operand2 = kvm_s390_get_base_disp_s(vcpu);
69 78
70 /* must be word boundary */ 79 /* must be word boundary */
@@ -89,6 +98,9 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
89 98
90 vcpu->stat.instruction_stap++; 99 vcpu->stat.instruction_stap++;
91 100
101 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
102 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
103
92 useraddr = kvm_s390_get_base_disp_s(vcpu); 104 useraddr = kvm_s390_get_base_disp_s(vcpu);
93 105
94 if (useraddr & 1) 106 if (useraddr & 1)
@@ -105,7 +117,12 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
105static int handle_skey(struct kvm_vcpu *vcpu) 117static int handle_skey(struct kvm_vcpu *vcpu)
106{ 118{
107 vcpu->stat.instruction_storage_key++; 119 vcpu->stat.instruction_storage_key++;
108 vcpu->arch.sie_block->gpsw.addr -= 4; 120
121 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
122 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
123
124 vcpu->arch.sie_block->gpsw.addr =
125 __rewind_psw(vcpu->arch.sie_block->gpsw, 4);
109 VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); 126 VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
110 return 0; 127 return 0;
111} 128}
@@ -129,9 +146,10 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
129 * Store the two-word I/O interruption code into the 146 * Store the two-word I/O interruption code into the
130 * provided area. 147 * provided area.
131 */ 148 */
132 put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr); 149 if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr)
133 put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2)); 150 || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2))
134 put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4)); 151 || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4)))
152 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
135 } else { 153 } else {
136 /* 154 /*
137 * Store the three-word I/O interruption code into 155 * Store the three-word I/O interruption code into
@@ -182,6 +200,9 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
182{ 200{
183 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction"); 201 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
184 202
203 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
204 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
205
185 if (vcpu->kvm->arch.css_support) { 206 if (vcpu->kvm->arch.css_support) {
186 /* 207 /*
187 * Most I/O instructions will be handled by userspace. 208 * Most I/O instructions will be handled by userspace.
@@ -210,8 +231,12 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
210 int rc; 231 int rc;
211 232
212 vcpu->stat.instruction_stfl++; 233 vcpu->stat.instruction_stfl++;
234
235 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
236 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
237
213 /* only pass the facility bits, which we can handle */ 238 /* only pass the facility bits, which we can handle */
214 facility_list = S390_lowcore.stfl_fac_list & 0xff00fff3; 239 facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
215 240
216 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 241 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
217 &facility_list, sizeof(facility_list)); 242 &facility_list, sizeof(facility_list));
@@ -255,8 +280,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
255 u64 addr; 280 u64 addr;
256 281
257 if (gpsw->mask & PSW_MASK_PSTATE) 282 if (gpsw->mask & PSW_MASK_PSTATE)
258 return kvm_s390_inject_program_int(vcpu, 283 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
259 PGM_PRIVILEGED_OPERATION); 284
260 addr = kvm_s390_get_base_disp_s(vcpu); 285 addr = kvm_s390_get_base_disp_s(vcpu);
261 if (addr & 7) 286 if (addr & 7)
262 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 287 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -278,6 +303,9 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
278 psw_t new_psw; 303 psw_t new_psw;
279 u64 addr; 304 u64 addr;
280 305
306 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
307 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
308
281 addr = kvm_s390_get_base_disp_s(vcpu); 309 addr = kvm_s390_get_base_disp_s(vcpu);
282 if (addr & 7) 310 if (addr & 7)
283 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 311 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -296,6 +324,9 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
296 324
297 vcpu->stat.instruction_stidp++; 325 vcpu->stat.instruction_stidp++;
298 326
327 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
328 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
329
299 operand2 = kvm_s390_get_base_disp_s(vcpu); 330 operand2 = kvm_s390_get_base_disp_s(vcpu);
300 331
301 if (operand2 & 7) 332 if (operand2 & 7)
@@ -351,16 +382,30 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
351 vcpu->stat.instruction_stsi++; 382 vcpu->stat.instruction_stsi++;
352 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); 383 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
353 384
354 operand2 = kvm_s390_get_base_disp_s(vcpu); 385 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
386 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
387
388 if (fc > 3) {
389 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */
390 return 0;
391 }
355 392
356 if (operand2 & 0xfff && fc > 0) 393 if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
394 || vcpu->run->s.regs.gprs[1] & 0xffff0000)
357 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 395 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
358 396
359 switch (fc) { 397 if (fc == 0) {
360 case 0:
361 vcpu->run->s.regs.gprs[0] = 3 << 28; 398 vcpu->run->s.regs.gprs[0] = 3 << 28;
362 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 399 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */
363 return 0; 400 return 0;
401 }
402
403 operand2 = kvm_s390_get_base_disp_s(vcpu);
404
405 if (operand2 & 0xfff)
406 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
407
408 switch (fc) {
364 case 1: /* same handling for 1 and 2 */ 409 case 1: /* same handling for 1 and 2 */
365 case 2: 410 case 2:
366 mem = get_zeroed_page(GFP_KERNEL); 411 mem = get_zeroed_page(GFP_KERNEL);
@@ -377,8 +422,6 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
377 goto out_no_data; 422 goto out_no_data;
378 handle_stsi_3_2_2(vcpu, (void *) mem); 423 handle_stsi_3_2_2(vcpu, (void *) mem);
379 break; 424 break;
380 default:
381 goto out_no_data;
382 } 425 }
383 426
384 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { 427 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
@@ -432,20 +475,14 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
432 intercept_handler_t handler; 475 intercept_handler_t handler;
433 476
434 /* 477 /*
 435 * a lot of B2 instructions are priviledged. We first check for 478 * A lot of B2 instructions are privileged. Here we check for
 436 * the privileged ones, that we can handle in the kernel. If the 479 * the privileged ones that we can handle in the kernel.
437 * kernel can handle this instruction, we check for the problem 480 * Anything else goes to userspace.
438 * state bit and (a) handle the instruction or (b) send a code 2 481 */
439 * program check.
440 * Anything else goes to userspace.*/
441 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 482 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
442 if (handler) { 483 if (handler)
443 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 484 return handler(vcpu);
444 return kvm_s390_inject_program_int(vcpu, 485
445 PGM_PRIVILEGED_OPERATION);
446 else
447 return handler(vcpu);
448 }
449 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
450} 487}
451 488
@@ -453,8 +490,7 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
453{ 490{
454 int reg1, reg2; 491 int reg1, reg2;
455 492
456 reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24; 493 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
457 reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
458 494
459 /* This basically extracts the mask half of the psw. */ 495 /* This basically extracts the mask half of the psw. */
460 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; 496 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
@@ -467,9 +503,88 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
467 return 0; 503 return 0;
468} 504}
469 505
506#define PFMF_RESERVED 0xfffc0101UL
507#define PFMF_SK 0x00020000UL
508#define PFMF_CF 0x00010000UL
509#define PFMF_UI 0x00008000UL
510#define PFMF_FSC 0x00007000UL
511#define PFMF_NQ 0x00000800UL
512#define PFMF_MR 0x00000400UL
513#define PFMF_MC 0x00000200UL
514#define PFMF_KEY 0x000000feUL
515
516static int handle_pfmf(struct kvm_vcpu *vcpu)
517{
518 int reg1, reg2;
519 unsigned long start, end;
520
521 vcpu->stat.instruction_pfmf++;
522
523 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
524
525 if (!MACHINE_HAS_PFMF)
526 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
527
528 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
529 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
530
531 if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED)
532 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
533
534 /* Only provide non-quiescing support if the host supports it */
535 if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ &&
536 S390_lowcore.stfl_fac_list & 0x00020000)
537 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
538
539 /* No support for conditional-SSKE */
540 if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
541 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
542
543 start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
544 switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
545 case 0x00000000:
546 end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
547 break;
548 case 0x00001000:
549 end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
550 break;
 551 /* We don't support EDAT2
552 case 0x00002000:
553 end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
554 break;*/
555 default:
556 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
557 }
558 while (start < end) {
559 unsigned long useraddr;
560
561 useraddr = gmap_translate(start, vcpu->arch.gmap);
562 if (IS_ERR((void *)useraddr))
563 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
564
565 if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
566 if (clear_user((void __user *)useraddr, PAGE_SIZE))
567 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
568 }
569
570 if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
571 if (set_guest_storage_key(current->mm, useraddr,
572 vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
573 vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
574 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
575 }
576
577 start += PAGE_SIZE;
578 }
579 if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
580 vcpu->run->s.regs.gprs[reg2] = end;
581 return 0;
582}
583
470static const intercept_handler_t b9_handlers[256] = { 584static const intercept_handler_t b9_handlers[256] = {
471 [0x8d] = handle_epsw, 585 [0x8d] = handle_epsw,
472 [0x9c] = handle_io_inst, 586 [0x9c] = handle_io_inst,
587 [0xaf] = handle_pfmf,
473}; 588};
474 589
475int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) 590int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
@@ -478,29 +593,96 @@ int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
478 593
479 /* This is handled just as for the B2 instructions. */ 594 /* This is handled just as for the B2 instructions. */
480 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 595 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
481 if (handler) { 596 if (handler)
482 if ((handler != handle_epsw) && 597 return handler(vcpu);
483 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)) 598
484 return kvm_s390_inject_program_int(vcpu,
485 PGM_PRIVILEGED_OPERATION);
486 else
487 return handler(vcpu);
488 }
489 return -EOPNOTSUPP; 599 return -EOPNOTSUPP;
490} 600}
491 601
602int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
603{
604 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
605 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
606 u64 useraddr;
607 u32 val = 0;
608 int reg, rc;
609
610 vcpu->stat.instruction_lctl++;
611
612 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
613 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
614
615 useraddr = kvm_s390_get_base_disp_rs(vcpu);
616
617 if (useraddr & 3)
618 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
619
620 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
621 useraddr);
622 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
623
624 reg = reg1;
625 do {
626 rc = get_guest(vcpu, val, (u32 __user *) useraddr);
627 if (rc)
628 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
629 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
630 vcpu->arch.sie_block->gcr[reg] |= val;
631 useraddr += 4;
632 if (reg == reg3)
633 break;
634 reg = (reg + 1) % 16;
635 } while (1);
636
637 return 0;
638}
639
640static int handle_lctlg(struct kvm_vcpu *vcpu)
641{
642 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
643 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
644 u64 useraddr;
645 int reg, rc;
646
647 vcpu->stat.instruction_lctlg++;
648
649 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
650 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
651
652 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
653
654 if (useraddr & 7)
655 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
656
657 reg = reg1;
658
659 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
660 useraddr);
661 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
662
663 do {
664 rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
665 (u64 __user *) useraddr);
666 if (rc)
667 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
668 useraddr += 8;
669 if (reg == reg3)
670 break;
671 reg = (reg + 1) % 16;
672 } while (1);
673
674 return 0;
675}
676
492static const intercept_handler_t eb_handlers[256] = { 677static const intercept_handler_t eb_handlers[256] = {
678 [0x2f] = handle_lctlg,
493 [0x8a] = handle_io_inst, 679 [0x8a] = handle_io_inst,
494}; 680};
495 681
496int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu) 682int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
497{ 683{
498 intercept_handler_t handler; 684 intercept_handler_t handler;
499 685
500 /* All eb instructions that end up here are privileged. */
501 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
502 return kvm_s390_inject_program_int(vcpu,
503 PGM_PRIVILEGED_OPERATION);
504 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; 686 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
505 if (handler) 687 if (handler)
506 return handler(vcpu); 688 return handler(vcpu);
@@ -515,6 +697,9 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
515 697
516 vcpu->stat.instruction_tprot++; 698 vcpu->stat.instruction_tprot++;
517 699
700 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
701 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
702
518 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2); 703 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
519 704
520 /* we only handle the Linux memory detection case: 705 /* we only handle the Linux memory detection case:
@@ -560,8 +745,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
560 u32 value; 745 u32 value;
561 746
562 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 747 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
563 return kvm_s390_inject_program_int(vcpu, 748 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
564 PGM_PRIVILEGED_OPERATION);
565 749
566 if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000) 750 if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000)
567 return kvm_s390_inject_program_int(vcpu, 751 return kvm_s390_inject_program_int(vcpu,
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 1c48ab2845e0..bec398c57acf 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -79,8 +79,8 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
79 list_add_tail(&inti->list, &li->list); 79 list_add_tail(&inti->list, &li->list);
80 atomic_set(&li->active, 1); 80 atomic_set(&li->active, 1);
81 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 81 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
82 if (waitqueue_active(&li->wq)) 82 if (waitqueue_active(li->wq))
83 wake_up_interruptible(&li->wq); 83 wake_up_interruptible(li->wq);
84 spin_unlock_bh(&li->lock); 84 spin_unlock_bh(&li->lock);
85 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 85 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
86 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 86 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
@@ -117,8 +117,8 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
117 list_add_tail(&inti->list, &li->list); 117 list_add_tail(&inti->list, &li->list);
118 atomic_set(&li->active, 1); 118 atomic_set(&li->active, 1);
119 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 119 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
120 if (waitqueue_active(&li->wq)) 120 if (waitqueue_active(li->wq))
121 wake_up_interruptible(&li->wq); 121 wake_up_interruptible(li->wq);
122 spin_unlock_bh(&li->lock); 122 spin_unlock_bh(&li->lock);
123 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 123 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
124 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 124 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
@@ -145,8 +145,8 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
145 atomic_set(&li->active, 1); 145 atomic_set(&li->active, 1);
146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
147 li->action_bits |= action; 147 li->action_bits |= action;
148 if (waitqueue_active(&li->wq)) 148 if (waitqueue_active(li->wq))
149 wake_up_interruptible(&li->wq); 149 wake_up_interruptible(li->wq);
150out: 150out:
151 spin_unlock_bh(&li->lock); 151 spin_unlock_bh(&li->lock);
152 152
@@ -250,8 +250,8 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
250 250
251 list_add_tail(&inti->list, &li->list); 251 list_add_tail(&inti->list, &li->list);
252 atomic_set(&li->active, 1); 252 atomic_set(&li->active, 1);
253 if (waitqueue_active(&li->wq)) 253 if (waitqueue_active(li->wq))
254 wake_up_interruptible(&li->wq); 254 wake_up_interruptible(li->wq);
255 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 255 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
256 256
257 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); 257 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
@@ -333,8 +333,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
333 333
334 /* sigp in userspace can exit */ 334 /* sigp in userspace can exit */
335 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 335 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
336 return kvm_s390_inject_program_int(vcpu, 336 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
337 PGM_PRIVILEGED_OPERATION);
338 337
339 order_code = kvm_s390_get_base_disp_rs(vcpu); 338 order_code = kvm_s390_get_base_disp_rs(vcpu);
340 339
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 74c29d922458..17bf4d3d303a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -689,7 +689,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
689 entry = *ptep; 689 entry = *ptep;
690 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) { 690 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
691 pgste = pgste_get_lock(ptep); 691 pgste = pgste_get_lock(ptep);
692 pgste_val(pgste) |= RCP_IN_BIT; 692 pgste_val(pgste) |= PGSTE_IN_BIT;
693 pgste_set_unlock(ptep, pgste); 693 pgste_set_unlock(ptep, pgste);
694 start += PAGE_SIZE; 694 start += PAGE_SIZE;
695 len -= PAGE_SIZE; 695 len -= PAGE_SIZE;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af9c5525434d..f87f7fcefa0a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,14 +222,22 @@ struct kvm_mmu_page {
222 int root_count; /* Currently serving as active root */ 222 int root_count; /* Currently serving as active root */
223 unsigned int unsync_children; 223 unsigned int unsync_children;
224 unsigned long parent_ptes; /* Reverse mapping for parent_pte */ 224 unsigned long parent_ptes; /* Reverse mapping for parent_pte */
225
226 /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
227 unsigned long mmu_valid_gen;
228
225 DECLARE_BITMAP(unsync_child_bitmap, 512); 229 DECLARE_BITMAP(unsync_child_bitmap, 512);
226 230
227#ifdef CONFIG_X86_32 231#ifdef CONFIG_X86_32
232 /*
 233	/*	 233	 * Used outside of the mmu-lock to avoid reading spte values while an
234 * update is in progress; see the comments in __get_spte_lockless().
235 */
228 int clear_spte_count; 236 int clear_spte_count;
229#endif 237#endif
230 238
239 /* Number of writes since the last time traversal visited this page. */
231 int write_flooding_count; 240 int write_flooding_count;
232 bool mmio_cached;
233}; 241};
234 242
235struct kvm_pio_request { 243struct kvm_pio_request {
@@ -529,11 +537,14 @@ struct kvm_arch {
529 unsigned int n_requested_mmu_pages; 537 unsigned int n_requested_mmu_pages;
530 unsigned int n_max_mmu_pages; 538 unsigned int n_max_mmu_pages;
531 unsigned int indirect_shadow_pages; 539 unsigned int indirect_shadow_pages;
540 unsigned long mmu_valid_gen;
532 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 541 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
533 /* 542 /*
534 * Hash table of struct kvm_mmu_page. 543 * Hash table of struct kvm_mmu_page.
535 */ 544 */
536 struct list_head active_mmu_pages; 545 struct list_head active_mmu_pages;
546 struct list_head zapped_obsolete_pages;
547
537 struct list_head assigned_dev_head; 548 struct list_head assigned_dev_head;
538 struct iommu_domain *iommu_domain; 549 struct iommu_domain *iommu_domain;
539 int iommu_flags; 550 int iommu_flags;
@@ -769,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
769 struct kvm_memory_slot *slot, 780 struct kvm_memory_slot *slot,
770 gfn_t gfn_offset, unsigned long mask); 781 gfn_t gfn_offset, unsigned long mask);
771void kvm_mmu_zap_all(struct kvm *kvm); 782void kvm_mmu_zap_all(struct kvm *kvm);
772void kvm_mmu_zap_mmio_sptes(struct kvm *kvm); 783void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
773unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 784unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
774void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 785void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
775 786
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d84048..bf4fb04d0112 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8KVM := ../../../virt/kvm
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9
10 irqchip.o) 10kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
11kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \ 11 $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
12 assigned-dev.o iommu.o) 12 $(KVM)/eventfd.o $(KVM)/irqchip.o
13kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 13kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
14kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
14 15
15kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 16kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
16 i8254.o cpuid.o pmu.o 17 i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752d..2bc1e81045b0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@
61#define OpMem8 26ull /* 8-bit zero extended memory operand */ 61#define OpMem8 26ull /* 8-bit zero extended memory operand */
62#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ 62#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
63#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */ 63#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
64#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
65#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
64 66
65#define OpBits 5 /* Width of operand field */ 67#define OpBits 5 /* Width of operand field */
66#define OpMask ((1ull << OpBits) - 1) 68#define OpMask ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
86#define DstMem64 (OpMem64 << DstShift) 88#define DstMem64 (OpMem64 << DstShift)
87#define DstImmUByte (OpImmUByte << DstShift) 89#define DstImmUByte (OpImmUByte << DstShift)
88#define DstDX (OpDX << DstShift) 90#define DstDX (OpDX << DstShift)
91#define DstAccLo (OpAccLo << DstShift)
89#define DstMask (OpMask << DstShift) 92#define DstMask (OpMask << DstShift)
90/* Source operand type. */ 93/* Source operand type. */
91#define SrcShift 6 94#define SrcShift 6
@@ -108,6 +111,7 @@
108#define SrcImm64 (OpImm64 << SrcShift) 111#define SrcImm64 (OpImm64 << SrcShift)
109#define SrcDX (OpDX << SrcShift) 112#define SrcDX (OpDX << SrcShift)
110#define SrcMem8 (OpMem8 << SrcShift) 113#define SrcMem8 (OpMem8 << SrcShift)
114#define SrcAccHi (OpAccHi << SrcShift)
111#define SrcMask (OpMask << SrcShift) 115#define SrcMask (OpMask << SrcShift)
112#define BitOp (1<<11) 116#define BitOp (1<<11)
113#define MemAbs (1<<12) /* Memory operand is absolute displacement */ 117#define MemAbs (1<<12) /* Memory operand is absolute displacement */
@@ -138,6 +142,7 @@
138/* Source 2 operand type */ 142/* Source 2 operand type */
139#define Src2Shift (31) 143#define Src2Shift (31)
140#define Src2None (OpNone << Src2Shift) 144#define Src2None (OpNone << Src2Shift)
145#define Src2Mem (OpMem << Src2Shift)
141#define Src2CL (OpCL << Src2Shift) 146#define Src2CL (OpCL << Src2Shift)
142#define Src2ImmByte (OpImmByte << Src2Shift) 147#define Src2ImmByte (OpImmByte << Src2Shift)
143#define Src2One (OpOne << Src2Shift) 148#define Src2One (OpOne << Src2Shift)
@@ -155,6 +160,9 @@
155#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ 160#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
156#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ 161#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
157#define NoWrite ((u64)1 << 45) /* No writeback */ 162#define NoWrite ((u64)1 << 45) /* No writeback */
163#define SrcWrite ((u64)1 << 46) /* Write back src operand */
164
165#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
158 166
159#define X2(x...) x, x 167#define X2(x...) x, x
160#define X3(x...) X2(x), x 168#define X3(x...) X2(x), x
@@ -171,10 +179,11 @@
171/* 179/*
172 * fastop functions have a special calling convention: 180 * fastop functions have a special calling convention:
173 * 181 *
174 * dst: [rdx]:rax (in/out) 182 * dst: rax (in/out)
175 * src: rbx (in/out) 183 * src: rdx (in/out)
176 * src2: rcx (in) 184 * src2: rcx (in)
177 * flags: rflags (in/out) 185 * flags: rflags (in/out)
186 * ex: rsi (in:fastop pointer, out:zero if exception)
178 * 187 *
179 * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for 188 * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
180 * different operand sizes can be reached by calculation, rather than a jump 189 * different operand sizes can be reached by calculation, rather than a jump
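Because every fastop body is padded to exactly FASTOP_SIZE bytes, the caller picks the operand-size variant by pointer arithmetic (fop += __ffs(bytes) * FASTOP_SIZE, further down in this file) instead of a jump table. A minimal sketch of that offset calculation, assuming FASTOP_SIZE is 8 as in this file and using the invented helper name fastop_offset():

#include <stdio.h>

#define FASTOP_SIZE 8	/* assumed to match the value used by the emulator */

/* __ffs(bytes) for the power-of-two operand sizes: 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3 */
static unsigned long fastop_offset(unsigned int bytes)
{
	return (unsigned long)__builtin_ctz(bytes) * FASTOP_SIZE;
}

int main(void)
{
	for (unsigned int bytes = 1; bytes <= 8; bytes <<= 1)
		printf("%u-byte handler at base + %lu\n", bytes, fastop_offset(bytes));
	return 0;
}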
@@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
276} 285}
277 286
278/* 287/*
279 * Instruction emulation:
280 * Most instructions are emulated directly via a fragment of inline assembly
281 * code. This allows us to save/restore EFLAGS and thus very easily pick up
282 * any modified flags.
283 */
284
285#if defined(CONFIG_X86_64)
286#define _LO32 "k" /* force 32-bit operand */
287#define _STK "%%rsp" /* stack pointer */
288#elif defined(__i386__)
289#define _LO32 "" /* force 32-bit operand */
290#define _STK "%%esp" /* stack pointer */
291#endif
292
293/*
294 * These EFLAGS bits are restored from saved value during emulation, and 288 * These EFLAGS bits are restored from saved value during emulation, and
295 * any changes are written back to the saved value after emulation. 289 * any changes are written back to the saved value after emulation.
296 */ 290 */
297#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) 291#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
298 292
299/* Before executing instruction: restore necessary bits in EFLAGS. */
300#define _PRE_EFLAGS(_sav, _msk, _tmp) \
301 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
302 "movl %"_sav",%"_LO32 _tmp"; " \
303 "push %"_tmp"; " \
304 "push %"_tmp"; " \
305 "movl %"_msk",%"_LO32 _tmp"; " \
306 "andl %"_LO32 _tmp",("_STK"); " \
307 "pushf; " \
308 "notl %"_LO32 _tmp"; " \
309 "andl %"_LO32 _tmp",("_STK"); " \
310 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
311 "pop %"_tmp"; " \
312 "orl %"_LO32 _tmp",("_STK"); " \
313 "popf; " \
314 "pop %"_sav"; "
315
316/* After executing instruction: write-back necessary bits in EFLAGS. */
317#define _POST_EFLAGS(_sav, _msk, _tmp) \
318 /* _sav |= EFLAGS & _msk; */ \
319 "pushf; " \
320 "pop %"_tmp"; " \
321 "andl %"_msk",%"_LO32 _tmp"; " \
322 "orl %"_LO32 _tmp",%"_sav"; "
323
324#ifdef CONFIG_X86_64 293#ifdef CONFIG_X86_64
325#define ON64(x) x 294#define ON64(x) x
326#else 295#else
327#define ON64(x) 296#define ON64(x)
328#endif 297#endif
329 298
330#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
331 do { \
332 __asm__ __volatile__ ( \
333 _PRE_EFLAGS("0", "4", "2") \
334 _op _suffix " %"_x"3,%1; " \
335 _POST_EFLAGS("0", "4", "2") \
336 : "=m" ((ctxt)->eflags), \
337 "+q" (*(_dsttype*)&(ctxt)->dst.val), \
338 "=&r" (_tmp) \
339 : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
340 } while (0)
341
342
343/* Raw emulation: instruction has two explicit operands. */
344#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
345 do { \
346 unsigned long _tmp; \
347 \
348 switch ((ctxt)->dst.bytes) { \
349 case 2: \
350 ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
351 break; \
352 case 4: \
353 ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
354 break; \
355 case 8: \
356 ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
357 break; \
358 } \
359 } while (0)
360
361#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
362 do { \
363 unsigned long _tmp; \
364 switch ((ctxt)->dst.bytes) { \
365 case 1: \
366 ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
367 break; \
368 default: \
369 __emulate_2op_nobyte(ctxt, _op, \
370 _wx, _wy, _lx, _ly, _qx, _qy); \
371 break; \
372 } \
373 } while (0)
374
375/* Source operand is byte-sized and may be restricted to just %cl. */
376#define emulate_2op_SrcB(ctxt, _op) \
377 __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
378
379/* Source operand is byte, word, long or quad sized. */
380#define emulate_2op_SrcV(ctxt, _op) \
381 __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
382
383/* Source operand is word, long or quad sized. */
384#define emulate_2op_SrcV_nobyte(ctxt, _op) \
385 __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
386
387/* Instruction has three operands and one operand is stored in ECX register */
388#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
389 do { \
390 unsigned long _tmp; \
391 _type _clv = (ctxt)->src2.val; \
392 _type _srcv = (ctxt)->src.val; \
393 _type _dstv = (ctxt)->dst.val; \
394 \
395 __asm__ __volatile__ ( \
396 _PRE_EFLAGS("0", "5", "2") \
397 _op _suffix " %4,%1 \n" \
398 _POST_EFLAGS("0", "5", "2") \
399 : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
400 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
401 ); \
402 \
403 (ctxt)->src2.val = (unsigned long) _clv; \
404 (ctxt)->src2.val = (unsigned long) _srcv; \
405 (ctxt)->dst.val = (unsigned long) _dstv; \
406 } while (0)
407
408#define emulate_2op_cl(ctxt, _op) \
409 do { \
410 switch ((ctxt)->dst.bytes) { \
411 case 2: \
412 __emulate_2op_cl(ctxt, _op, "w", u16); \
413 break; \
414 case 4: \
415 __emulate_2op_cl(ctxt, _op, "l", u32); \
416 break; \
417 case 8: \
418 ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
419 break; \
420 } \
421 } while (0)
422
423#define __emulate_1op(ctxt, _op, _suffix) \
424 do { \
425 unsigned long _tmp; \
426 \
427 __asm__ __volatile__ ( \
428 _PRE_EFLAGS("0", "3", "2") \
429 _op _suffix " %1; " \
430 _POST_EFLAGS("0", "3", "2") \
431 : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
432 "=&r" (_tmp) \
433 : "i" (EFLAGS_MASK)); \
434 } while (0)
435
436/* Instruction has only one explicit operand (no source operand). */
437#define emulate_1op(ctxt, _op) \
438 do { \
439 switch ((ctxt)->dst.bytes) { \
440 case 1: __emulate_1op(ctxt, _op, "b"); break; \
441 case 2: __emulate_1op(ctxt, _op, "w"); break; \
442 case 4: __emulate_1op(ctxt, _op, "l"); break; \
443 case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
444 } \
445 } while (0)
446
447static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); 299static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
448 300
449#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" 301#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
462#define FOPNOP() FOP_ALIGN FOP_RET 314#define FOPNOP() FOP_ALIGN FOP_RET
463 315
464#define FOP1E(op, dst) \ 316#define FOP1E(op, dst) \
465 FOP_ALIGN #op " %" #dst " \n\t" FOP_RET 317 FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
318
319#define FOP1EEX(op, dst) \
320 FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
466 321
467#define FASTOP1(op) \ 322#define FASTOP1(op) \
468 FOP_START(op) \ 323 FOP_START(op) \
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
472 ON64(FOP1E(op##q, rax)) \ 327 ON64(FOP1E(op##q, rax)) \
473 FOP_END 328 FOP_END
474 329
330/* 1-operand, using src2 (for MUL/DIV r/m) */
331#define FASTOP1SRC2(op, name) \
332 FOP_START(name) \
333 FOP1E(op, cl) \
334 FOP1E(op, cx) \
335 FOP1E(op, ecx) \
336 ON64(FOP1E(op, rcx)) \
337 FOP_END
338
339/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
340#define FASTOP1SRC2EX(op, name) \
341 FOP_START(name) \
342 FOP1EEX(op, cl) \
343 FOP1EEX(op, cx) \
344 FOP1EEX(op, ecx) \
345 ON64(FOP1EEX(op, rcx)) \
346 FOP_END
347
475#define FOP2E(op, dst, src) \ 348#define FOP2E(op, dst, src) \
476 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET 349 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
477 350
478#define FASTOP2(op) \ 351#define FASTOP2(op) \
479 FOP_START(op) \ 352 FOP_START(op) \
480 FOP2E(op##b, al, bl) \ 353 FOP2E(op##b, al, dl) \
481 FOP2E(op##w, ax, bx) \ 354 FOP2E(op##w, ax, dx) \
482 FOP2E(op##l, eax, ebx) \ 355 FOP2E(op##l, eax, edx) \
483 ON64(FOP2E(op##q, rax, rbx)) \ 356 ON64(FOP2E(op##q, rax, rdx)) \
484 FOP_END 357 FOP_END
485 358
486/* 2 operand, word only */ 359/* 2 operand, word only */
487#define FASTOP2W(op) \ 360#define FASTOP2W(op) \
488 FOP_START(op) \ 361 FOP_START(op) \
489 FOPNOP() \ 362 FOPNOP() \
490 FOP2E(op##w, ax, bx) \ 363 FOP2E(op##w, ax, dx) \
491 FOP2E(op##l, eax, ebx) \ 364 FOP2E(op##l, eax, edx) \
492 ON64(FOP2E(op##q, rax, rbx)) \ 365 ON64(FOP2E(op##q, rax, rdx)) \
493 FOP_END 366 FOP_END
494 367
495/* 2 operand, src is CL */ 368/* 2 operand, src is CL */
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
508#define FASTOP3WCL(op) \ 381#define FASTOP3WCL(op) \
509 FOP_START(op) \ 382 FOP_START(op) \
510 FOPNOP() \ 383 FOPNOP() \
511 FOP3E(op##w, ax, bx, cl) \ 384 FOP3E(op##w, ax, dx, cl) \
512 FOP3E(op##l, eax, ebx, cl) \ 385 FOP3E(op##l, eax, edx, cl) \
513 ON64(FOP3E(op##q, rax, rbx, cl)) \ 386 ON64(FOP3E(op##q, rax, rdx, cl)) \
514 FOP_END 387 FOP_END
515 388
516/* Special case for SETcc - 1 instruction per cc */ 389/* Special case for SETcc - 1 instruction per cc */
517#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" 390#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
518 391
392asm(".global kvm_fastop_exception \n"
393 "kvm_fastop_exception: xor %esi, %esi; ret");
394
519FOP_START(setcc) 395FOP_START(setcc)
520FOP_SETCC(seto) 396FOP_SETCC(seto)
521FOP_SETCC(setno) 397FOP_SETCC(setno)
@@ -538,47 +414,6 @@ FOP_END;
538FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET 414FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
539FOP_END; 415FOP_END;
540 416
541#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
542 do { \
543 unsigned long _tmp; \
544 ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
545 ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
546 \
547 __asm__ __volatile__ ( \
548 _PRE_EFLAGS("0", "5", "1") \
549 "1: \n\t" \
550 _op _suffix " %6; " \
551 "2: \n\t" \
552 _POST_EFLAGS("0", "5", "1") \
553 ".pushsection .fixup,\"ax\" \n\t" \
554 "3: movb $1, %4 \n\t" \
555 "jmp 2b \n\t" \
556 ".popsection \n\t" \
557 _ASM_EXTABLE(1b, 3b) \
558 : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
559 "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
560 : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \
561 } while (0)
562
563/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
564#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
565 do { \
566 switch((ctxt)->src.bytes) { \
567 case 1: \
568 __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
569 break; \
570 case 2: \
571 __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
572 break; \
573 case 4: \
574 __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
575 break; \
576 case 8: ON64( \
577 __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
578 break; \
579 } \
580 } while (0)
581
582static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, 417static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
583 enum x86_intercept intercept, 418 enum x86_intercept intercept,
584 enum x86_intercept_stage stage) 419 enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@ FASTOP2(xor);
988FASTOP2(cmp); 823FASTOP2(cmp);
989FASTOP2(test); 824FASTOP2(test);
990 825
826FASTOP1SRC2(mul, mul_ex);
827FASTOP1SRC2(imul, imul_ex);
828FASTOP1SRC2EX(div, div_ex);
829FASTOP1SRC2EX(idiv, idiv_ex);
830
991FASTOP3WCL(shld); 831FASTOP3WCL(shld);
992FASTOP3WCL(shrd); 832FASTOP3WCL(shrd);
993 833
@@ -1013,6 +853,8 @@ FASTOP2W(bts);
1013FASTOP2W(btr); 853FASTOP2W(btr);
1014FASTOP2W(btc); 854FASTOP2W(btc);
1015 855
856FASTOP2(xadd);
857
1016static u8 test_cc(unsigned int condition, unsigned long flags) 858static u8 test_cc(unsigned int condition, unsigned long flags)
1017{ 859{
1018 u8 rc; 860 u8 rc;
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)
1726 } 1568 }
1727} 1569}
1728 1570
1729static int writeback(struct x86_emulate_ctxt *ctxt) 1571static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
1730{ 1572{
1731 int rc; 1573 int rc;
1732 1574
1733 if (ctxt->d & NoWrite) 1575 switch (op->type) {
1734 return X86EMUL_CONTINUE;
1735
1736 switch (ctxt->dst.type) {
1737 case OP_REG: 1576 case OP_REG:
1738 write_register_operand(&ctxt->dst); 1577 write_register_operand(op);
1739 break; 1578 break;
1740 case OP_MEM: 1579 case OP_MEM:
1741 if (ctxt->lock_prefix) 1580 if (ctxt->lock_prefix)
1742 rc = segmented_cmpxchg(ctxt, 1581 rc = segmented_cmpxchg(ctxt,
1743 ctxt->dst.addr.mem, 1582 op->addr.mem,
1744 &ctxt->dst.orig_val, 1583 &op->orig_val,
1745 &ctxt->dst.val, 1584 &op->val,
1746 ctxt->dst.bytes); 1585 op->bytes);
1747 else 1586 else
1748 rc = segmented_write(ctxt, 1587 rc = segmented_write(ctxt,
1749 ctxt->dst.addr.mem, 1588 op->addr.mem,
1750 &ctxt->dst.val, 1589 &op->val,
1751 ctxt->dst.bytes); 1590 op->bytes);
1752 if (rc != X86EMUL_CONTINUE) 1591 if (rc != X86EMUL_CONTINUE)
1753 return rc; 1592 return rc;
1754 break; 1593 break;
1755 case OP_MEM_STR: 1594 case OP_MEM_STR:
1756 rc = segmented_write(ctxt, 1595 rc = segmented_write(ctxt,
1757 ctxt->dst.addr.mem, 1596 op->addr.mem,
1758 ctxt->dst.data, 1597 op->data,
1759 ctxt->dst.bytes * ctxt->dst.count); 1598 op->bytes * op->count);
1760 if (rc != X86EMUL_CONTINUE) 1599 if (rc != X86EMUL_CONTINUE)
1761 return rc; 1600 return rc;
1762 break; 1601 break;
1763 case OP_XMM: 1602 case OP_XMM:
1764 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1603 write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
1765 break; 1604 break;
1766 case OP_MM: 1605 case OP_MM:
1767 write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); 1606 write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
1768 break; 1607 break;
1769 case OP_NONE: 1608 case OP_NONE:
1770 /* no writeback */ 1609 /* no writeback */
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
2117 return X86EMUL_CONTINUE; 1956 return X86EMUL_CONTINUE;
2118} 1957}
2119 1958
2120static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
2121{
2122 u8 ex = 0;
2123
2124 emulate_1op_rax_rdx(ctxt, "mul", ex);
2125 return X86EMUL_CONTINUE;
2126}
2127
2128static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
2129{
2130 u8 ex = 0;
2131
2132 emulate_1op_rax_rdx(ctxt, "imul", ex);
2133 return X86EMUL_CONTINUE;
2134}
2135
2136static int em_div_ex(struct x86_emulate_ctxt *ctxt)
2137{
2138 u8 de = 0;
2139
2140 emulate_1op_rax_rdx(ctxt, "div", de);
2141 if (de)
2142 return emulate_de(ctxt);
2143 return X86EMUL_CONTINUE;
2144}
2145
2146static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
2147{
2148 u8 de = 0;
2149
2150 emulate_1op_rax_rdx(ctxt, "idiv", de);
2151 if (de)
2152 return emulate_de(ctxt);
2153 return X86EMUL_CONTINUE;
2154}
2155
2156static int em_grp45(struct x86_emulate_ctxt *ctxt) 1959static int em_grp45(struct x86_emulate_ctxt *ctxt)
2157{ 1960{
2158 int rc = X86EMUL_CONTINUE; 1961 int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {
3734 F(DstMem | SrcImm | NoWrite, em_test), 3537 F(DstMem | SrcImm | NoWrite, em_test),
3735 F(DstMem | SrcNone | Lock, em_not), 3538 F(DstMem | SrcNone | Lock, em_not),
3736 F(DstMem | SrcNone | Lock, em_neg), 3539 F(DstMem | SrcNone | Lock, em_neg),
3737 I(SrcMem, em_mul_ex), 3540 F(DstXacc | Src2Mem, em_mul_ex),
3738 I(SrcMem, em_imul_ex), 3541 F(DstXacc | Src2Mem, em_imul_ex),
3739 I(SrcMem, em_div_ex), 3542 F(DstXacc | Src2Mem, em_div_ex),
3740 I(SrcMem, em_idiv_ex), 3543 F(DstXacc | Src2Mem, em_idiv_ex),
3741}; 3544};
3742 3545
3743static const struct opcode group4[] = { 3546static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {
4064 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), 3867 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
4065 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3868 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
4066 /* 0xC0 - 0xC7 */ 3869 /* 0xC0 - 0xC7 */
4067 D2bv(DstMem | SrcReg | ModRM | Lock), 3870 F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
4068 N, D(DstMem | SrcReg | ModRM | Mov), 3871 N, D(DstMem | SrcReg | ModRM | Mov),
4069 N, N, N, GD(0, &group9), 3872 N, N, N, GD(0, &group9),
4070 /* 0xC8 - 0xCF */ 3873 /* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4172 fetch_register_operand(op); 3975 fetch_register_operand(op);
4173 op->orig_val = op->val; 3976 op->orig_val = op->val;
4174 break; 3977 break;
3978 case OpAccLo:
3979 op->type = OP_REG;
3980 op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
3981 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
3982 fetch_register_operand(op);
3983 op->orig_val = op->val;
3984 break;
3985 case OpAccHi:
3986 if (ctxt->d & ByteOp) {
3987 op->type = OP_NONE;
3988 break;
3989 }
3990 op->type = OP_REG;
3991 op->bytes = ctxt->op_bytes;
3992 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
3993 fetch_register_operand(op);
3994 op->orig_val = op->val;
3995 break;
4175 case OpDI: 3996 case OpDI:
4176 op->type = OP_MEM; 3997 op->type = OP_MEM;
4177 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 3998 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4553static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) 4374static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
4554{ 4375{
4555 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; 4376 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
4556 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; 4377 if (!(ctxt->d & ByteOp))
4378 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
4557 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" 4379 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
4558 : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) 4380 : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
4559 : "c"(ctxt->src2.val), [fastop]"S"(fop)); 4381 [fastop]"+S"(fop)
4382 : "c"(ctxt->src2.val));
4560 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); 4383 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
4384 if (!fop) /* exception is returned in fop variable */
4385 return emulate_de(ctxt);
4561 return X86EMUL_CONTINUE; 4386 return X86EMUL_CONTINUE;
4562} 4387}
4563 4388
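The hunk above makes the fastop pointer double as the exception flag: kvm_fastop_exception zeroes %esi, so a NULL fop after the call means a divide error must be injected via emulate_de(). A minimal user-space sketch of that contract, with invented toy_* names standing in for the real emulator pieces:

#include <stdio.h>

/* The callee hands back a non-NULL token on success and NULL on a fault. */
typedef struct { unsigned long dst, src, src2; } toy_ctxt;
typedef void *(*toy_fastop)(toy_ctxt *ctxt);

static void *toy_div(toy_ctxt *ctxt)
{
	static char ok;			/* any non-NULL token will do */

	if (ctxt->src2 == 0)
		return NULL;		/* analogue of kvm_fastop_exception zeroing %esi */
	ctxt->dst = ctxt->src / ctxt->src2;
	return &ok;
}

static int toy_emulate(toy_ctxt *ctxt, toy_fastop fop)
{
	if (!fop(ctxt))
		return -1;		/* the real code returns emulate_de(ctxt) here */
	return 0;			/* X86EMUL_CONTINUE */
}

int main(void)
{
	toy_ctxt ok  = { .dst = 0, .src = 10, .src2 = 2 };
	toy_ctxt bad = { .dst = 0, .src = 10, .src2 = 0 };
	int rc;

	rc = toy_emulate(&ok, toy_div);
	printf("ok:  rc=%d dst=%lu\n", rc, ok.dst);
	rc = toy_emulate(&bad, toy_div);
	printf("bad: rc=%d\n", rc);
	return 0;
}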
@@ -4773,9 +4598,17 @@ special_insn:
4773 goto done; 4598 goto done;
4774 4599
4775writeback: 4600writeback:
4776 rc = writeback(ctxt); 4601 if (!(ctxt->d & NoWrite)) {
4777 if (rc != X86EMUL_CONTINUE) 4602 rc = writeback(ctxt, &ctxt->dst);
4778 goto done; 4603 if (rc != X86EMUL_CONTINUE)
4604 goto done;
4605 }
4606 if (ctxt->d & SrcWrite) {
4607 BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
4608 rc = writeback(ctxt, &ctxt->src);
4609 if (rc != X86EMUL_CONTINUE)
4610 goto done;
4611 }
4779 4612
4780 /* 4613 /*
4781 * restore dst type in case the decoding will be reused 4614 * restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@ twobyte_insn:
4872 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : 4705 ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
4873 (s16) ctxt->src.val; 4706 (s16) ctxt->src.val;
4874 break; 4707 break;
4875 case 0xc0 ... 0xc1: /* xadd */
4876 fastop(ctxt, em_add);
4877 /* Write back the register source. */
4878 ctxt->src.val = ctxt->dst.orig_val;
4879 write_register_operand(&ctxt->src);
4880 break;
4881 case 0xc3: /* movnti */ 4708 case 0xc3: /* movnti */
4882 ctxt->dst.bytes = ctxt->op_bytes; 4709 ctxt->dst.bytes = ctxt->op_bytes;
4883 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : 4710 ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d1..afc11245827c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1608 return; 1608 return;
1609 1609
1610 if (atomic_read(&apic->lapic_timer.pending) > 0) { 1610 if (atomic_read(&apic->lapic_timer.pending) > 0) {
1611 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1611 kvm_apic_local_deliver(apic, APIC_LVTT);
1612 atomic_dec(&apic->lapic_timer.pending); 1612 atomic_set(&apic->lapic_timer.pending, 0);
1613 } 1613 }
1614} 1614}
1615 1615
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
197} 197}
198EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); 198EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
199 199
200static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) 200/*
 201	 * spte bits 3..11 are used as the low 9 bits of the generation
 202	 * number, and bits 52..61 are used as the high 10 bits of the
 203	 * generation number.
204 */
205#define MMIO_SPTE_GEN_LOW_SHIFT 3
206#define MMIO_SPTE_GEN_HIGH_SHIFT 52
207
208#define MMIO_GEN_SHIFT 19
209#define MMIO_GEN_LOW_SHIFT 9
210#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
211#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
212#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
213
214static u64 generation_mmio_spte_mask(unsigned int gen)
201{ 215{
202 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 216 u64 mask;
217
218 WARN_ON(gen > MMIO_MAX_GEN);
219
220 mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
221 mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
222 return mask;
223}
224
225static unsigned int get_mmio_spte_generation(u64 spte)
226{
227 unsigned int gen;
228
229 spte &= ~shadow_mmio_mask;
230
231 gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
232 gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
233 return gen;
234}
235
236static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
237{
238 /*
 239	 * Initialize the kvm generation close to MMIO_MAX_GEN so that the
 240	 * code handling generation-number wrap-around is easy to exercise.
241 */
242 return (kvm_memslots(kvm)->generation +
243 MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
244}
245
246static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
247 unsigned access)
248{
249 unsigned int gen = kvm_current_mmio_generation(kvm);
250 u64 mask = generation_mmio_spte_mask(gen);
203 251
204 access &= ACC_WRITE_MASK | ACC_USER_MASK; 252 access &= ACC_WRITE_MASK | ACC_USER_MASK;
253 mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
205 254
206 sp->mmio_cached = true; 255 trace_mark_mmio_spte(sptep, gfn, access, gen);
207 trace_mark_mmio_spte(sptep, gfn, access); 256 mmu_spte_set(sptep, mask);
208 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
209} 257}
210 258
211static bool is_mmio_spte(u64 spte) 259static bool is_mmio_spte(u64 spte)
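Given the bit layout described in the comment above, the packing and unpacking of the generation number can be sanity-checked in user space. This is only an illustrative sketch: gen_to_mask()/mask_to_gen() are invented names mirroring generation_mmio_spte_mask()/get_mmio_spte_generation(), and the shadow_mmio_mask, access and gfn bits are left out.

#include <assert.h>
#include <stdint.h>

#define MMIO_SPTE_GEN_LOW_SHIFT   3
#define MMIO_SPTE_GEN_HIGH_SHIFT  52
#define MMIO_GEN_SHIFT            19
#define MMIO_GEN_LOW_SHIFT        9
#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 1)
#define MMIO_MAX_GEN              ((1 << MMIO_GEN_SHIFT) - 1)

/* Pack a generation number into spte bits 3..11 (low 9) and 52..61 (high 10). */
static uint64_t gen_to_mask(unsigned int gen)
{
	uint64_t mask;

	mask  = (uint64_t)(gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
	mask |= ((uint64_t)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
	return mask;
}

/* Recover the generation number from those two bit ranges. */
static unsigned int mask_to_gen(uint64_t spte)
{
	unsigned int gen;

	gen  = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
	gen |= (unsigned int)(spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
	return gen;
}

int main(void)
{
	/* Round-trip a spread of generations, including the extremes. */
	for (unsigned int gen = 0; gen <= MMIO_MAX_GEN; gen += 4099)
		assert(mask_to_gen(gen_to_mask(gen)) == gen);
	assert(mask_to_gen(gen_to_mask(MMIO_MAX_GEN)) == MMIO_MAX_GEN);
	return 0;
}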
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
215 263
216static gfn_t get_mmio_spte_gfn(u64 spte) 264static gfn_t get_mmio_spte_gfn(u64 spte)
217{ 265{
218 return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; 266 u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
267 return (spte & ~mask) >> PAGE_SHIFT;
219} 268}
220 269
221static unsigned get_mmio_spte_access(u64 spte) 270static unsigned get_mmio_spte_access(u64 spte)
222{ 271{
223 return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; 272 u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
273 return (spte & ~mask) & ~PAGE_MASK;
224} 274}
225 275
226static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) 276static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
277 pfn_t pfn, unsigned access)
227{ 278{
228 if (unlikely(is_noslot_pfn(pfn))) { 279 if (unlikely(is_noslot_pfn(pfn))) {
229 mark_mmio_spte(sptep, gfn, access); 280 mark_mmio_spte(kvm, sptep, gfn, access);
230 return true; 281 return true;
231 } 282 }
232 283
233 return false; 284 return false;
234} 285}
235 286
287static bool check_mmio_spte(struct kvm *kvm, u64 spte)
288{
289 unsigned int kvm_gen, spte_gen;
290
291 kvm_gen = kvm_current_mmio_generation(kvm);
292 spte_gen = get_mmio_spte_generation(spte);
293
294 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
295 return likely(kvm_gen == spte_gen);
296}
297
236static inline u64 rsvd_bits(int s, int e) 298static inline u64 rsvd_bits(int s, int e)
237{ 299{
238 return ((1ULL << (e - s + 1)) - 1) << s; 300 return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
404/* 466/*
405 * The idea using the light way get the spte on x86_32 guest is from 467 * The idea using the light way get the spte on x86_32 guest is from
406 * gup_get_pte(arch/x86/mm/gup.c). 468 * gup_get_pte(arch/x86/mm/gup.c).
407 * The difference is we can not catch the spte tlb flush if we leave 469 *
408 * guest mode, so we emulate it by increase clear_spte_count when spte 470 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 409	 * is cleared.	 471	 * coalesces them and we are running outside of the MMU lock. Therefore
472 * we need to protect against in-progress updates of the spte.
473 *
474 * Reading the spte while an update is in progress may get the old value
475 * for the high part of the spte. The race is fine for a present->non-present
476 * change (because the high part of the spte is ignored for non-present spte),
477 * but for a present->present change we must reread the spte.
478 *
479 * All such changes are done in two steps (present->non-present and
480 * non-present->present), hence it is enough to count the number of
481 * present->non-present updates: if it changed while reading the spte,
482 * we might have hit the race. This is done using clear_spte_count.
410 */ 483 */
411static u64 __get_spte_lockless(u64 *sptep) 484static u64 __get_spte_lockless(u64 *sptep)
412{ 485{
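The reworked comment above describes why the 32-bit read of a 64-bit spte needs a retry loop keyed on clear_spte_count. A rough user-space sketch of that retry pattern, with invented names and the memory barriers (smp_rmb() in the real code) omitted for brevity:

#include <stdint.h>
#include <stdio.h>

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct split_spte {
	uint32_t low;
	uint32_t high;
};

struct toy_shadow_page {
	struct split_spte spte;
	int clear_spte_count;	/* bumped on every present->non-present update */
};

/*
 * Lockless 64-bit read on a 32-bit host: if a present->non-present update
 * raced with us (the count or the low half changed), retry so we never pair
 * the low half of one value with the high half of another.
 */
static uint64_t get_spte_lockless(struct toy_shadow_page *sp)
{
	uint32_t low, high;
	int count;

retry:
	count = READ_ONCE(sp->clear_spte_count);
	low   = READ_ONCE(sp->spte.low);
	high  = READ_ONCE(sp->spte.high);

	if (READ_ONCE(sp->spte.low) != low ||
	    READ_ONCE(sp->clear_spte_count) != count)
		goto retry;

	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	struct toy_shadow_page sp = { .spte = { .low = 0x107, .high = 0x8000 } };

	printf("spte = %#llx\n", (unsigned long long)get_spte_lockless(&sp));
	return 0;
}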
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1511 if (!direct) 1584 if (!direct)
1512 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1585 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1513 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1586 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1587
1588 /*
1589 * The active_mmu_pages list is the FIFO list, do not move the
1590 * page until it is zapped. kvm_zap_obsolete_pages depends on
1591 * this feature. See the comments in kvm_zap_obsolete_pages().
1592 */
1514 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1593 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1515 sp->parent_ptes = 0; 1594 sp->parent_ptes = 0;
1516 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1595 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1648static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1727static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1649 struct list_head *invalid_list); 1728 struct list_head *invalid_list);
1650 1729
1730/*
1731 * NOTE: pay close attention to zapped-obsolete pages
1732 * (is_obsolete_sp(sp) && sp->role.invalid) when walking the hash list,
1733 * since such a page has been deleted from active_mmu_pages but can
1734 * still be found on the hash list.
1735 *
1736 * for_each_gfn_indirect_valid_sp skips that kind of page, and
1737 * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips
1738 * all the obsolete pages.
1739 */
1651#define for_each_gfn_sp(_kvm, _sp, _gfn) \ 1740#define for_each_gfn_sp(_kvm, _sp, _gfn) \
1652 hlist_for_each_entry(_sp, \ 1741 hlist_for_each_entry(_sp, \
1653 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 1742 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
1838 __clear_sp_write_flooding_count(sp); 1927 __clear_sp_write_flooding_count(sp);
1839} 1928}
1840 1929
1930static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1931{
1932 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1933}
1934
1841static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1935static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1842 gfn_t gfn, 1936 gfn_t gfn,
1843 gva_t gaddr, 1937 gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1864 role.quadrant = quadrant; 1958 role.quadrant = quadrant;
1865 } 1959 }
1866 for_each_gfn_sp(vcpu->kvm, sp, gfn) { 1960 for_each_gfn_sp(vcpu->kvm, sp, gfn) {
1961 if (is_obsolete_sp(vcpu->kvm, sp))
1962 continue;
1963
1867 if (!need_sync && sp->unsync) 1964 if (!need_sync && sp->unsync)
1868 need_sync = true; 1965 need_sync = true;
1869 1966
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1900 1997
1901 account_shadowed(vcpu->kvm, gfn); 1998 account_shadowed(vcpu->kvm, gfn);
1902 } 1999 }
2000 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
1903 init_shadow_page_table(sp); 2001 init_shadow_page_table(sp);
1904 trace_kvm_mmu_get_page(sp, true); 2002 trace_kvm_mmu_get_page(sp, true);
1905 return sp; 2003 return sp;
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2070 ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 2168 ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
2071 kvm_mmu_page_unlink_children(kvm, sp); 2169 kvm_mmu_page_unlink_children(kvm, sp);
2072 kvm_mmu_unlink_parents(kvm, sp); 2170 kvm_mmu_unlink_parents(kvm, sp);
2171
2073 if (!sp->role.invalid && !sp->role.direct) 2172 if (!sp->role.invalid && !sp->role.direct)
2074 unaccount_shadowed(kvm, sp->gfn); 2173 unaccount_shadowed(kvm, sp->gfn);
2174
2075 if (sp->unsync) 2175 if (sp->unsync)
2076 kvm_unlink_unsync_page(kvm, sp); 2176 kvm_unlink_unsync_page(kvm, sp);
2077 if (!sp->root_count) { 2177 if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2081 kvm_mod_used_mmu_pages(kvm, -1); 2181 kvm_mod_used_mmu_pages(kvm, -1);
2082 } else { 2182 } else {
2083 list_move(&sp->link, &kvm->arch.active_mmu_pages); 2183 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2084 kvm_reload_remote_mmus(kvm); 2184
2185 /*
2186 * The obsolete pages can not be used on any vcpus.
2187 * See the comments in kvm_mmu_invalidate_zap_all_pages().
2188 */
2189 if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
2190 kvm_reload_remote_mmus(kvm);
2085 } 2191 }
2086 2192
2087 sp->role.invalid = 1; 2193 sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2331 u64 spte; 2437 u64 spte;
2332 int ret = 0; 2438 int ret = 0;
2333 2439
2334 if (set_mmio_spte(sptep, gfn, pfn, pte_access)) 2440 if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
2335 return 0; 2441 return 0;
2336 2442
2337 spte = PT_PRESENT_MASK; 2443 spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2869 2975
2870 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2976 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2871 return; 2977 return;
2872 spin_lock(&vcpu->kvm->mmu_lock); 2978
2873 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && 2979 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2874 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || 2980 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2875 vcpu->arch.mmu.direct_map)) { 2981 vcpu->arch.mmu.direct_map)) {
2876 hpa_t root = vcpu->arch.mmu.root_hpa; 2982 hpa_t root = vcpu->arch.mmu.root_hpa;
2877 2983
2984 spin_lock(&vcpu->kvm->mmu_lock);
2878 sp = page_header(root); 2985 sp = page_header(root);
2879 --sp->root_count; 2986 --sp->root_count;
2880 if (!sp->root_count && sp->role.invalid) { 2987 if (!sp->root_count && sp->role.invalid) {
2881 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 2988 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2882 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2989 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2883 } 2990 }
2884 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2885 spin_unlock(&vcpu->kvm->mmu_lock); 2991 spin_unlock(&vcpu->kvm->mmu_lock);
2992 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2886 return; 2993 return;
2887 } 2994 }
2995
2996 spin_lock(&vcpu->kvm->mmu_lock);
2888 for (i = 0; i < 4; ++i) { 2997 for (i = 0; i < 4; ++i) {
2889 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2998 hpa_t root = vcpu->arch.mmu.pae_root[i];
2890 2999
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
3148 return spte; 3257 return spte;
3149} 3258}
3150 3259
3151/*
3152 * If it is a real mmio page fault, return 1 and emulat the instruction
3153 * directly, return 0 to let CPU fault again on the address, -1 is
3154 * returned if bug is detected.
3155 */
3156int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3260int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3157{ 3261{
3158 u64 spte; 3262 u64 spte;
3159 3263
3160 if (quickly_check_mmio_pf(vcpu, addr, direct)) 3264 if (quickly_check_mmio_pf(vcpu, addr, direct))
3161 return 1; 3265 return RET_MMIO_PF_EMULATE;
3162 3266
3163 spte = walk_shadow_page_get_mmio_spte(vcpu, addr); 3267 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
3164 3268
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3166 gfn_t gfn = get_mmio_spte_gfn(spte); 3270 gfn_t gfn = get_mmio_spte_gfn(spte);
3167 unsigned access = get_mmio_spte_access(spte); 3271 unsigned access = get_mmio_spte_access(spte);
3168 3272
3273 if (!check_mmio_spte(vcpu->kvm, spte))
3274 return RET_MMIO_PF_INVALID;
3275
3169 if (direct) 3276 if (direct)
3170 addr = 0; 3277 addr = 0;
3171 3278
3172 trace_handle_mmio_page_fault(addr, gfn, access); 3279 trace_handle_mmio_page_fault(addr, gfn, access);
3173 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3280 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3174 return 1; 3281 return RET_MMIO_PF_EMULATE;
3175 } 3282 }
3176 3283
3177 /* 3284 /*
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3179 * it's a BUG if the gfn is not a mmio page. 3286 * it's a BUG if the gfn is not a mmio page.
3180 */ 3287 */
3181 if (direct && !check_direct_spte_mmio_pf(spte)) 3288 if (direct && !check_direct_spte_mmio_pf(spte))
3182 return -1; 3289 return RET_MMIO_PF_BUG;
3183 3290
3184 /* 3291 /*
3185 * If the page table is zapped by other cpus, let CPU fault again on 3292 * If the page table is zapped by other cpus, let CPU fault again on
3186 * the address. 3293 * the address.
3187 */ 3294 */
3188 return 0; 3295 return RET_MMIO_PF_RETRY;
3189} 3296}
3190EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); 3297EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
3191 3298
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
3195 int ret; 3302 int ret;
3196 3303
3197 ret = handle_mmio_page_fault_common(vcpu, addr, direct); 3304 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
3198 WARN_ON(ret < 0); 3305 WARN_ON(ret == RET_MMIO_PF_BUG);
3199 return ret; 3306 return ret;
3200} 3307}
3201 3308
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
3207 3314
3208 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 3315 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
3209 3316
3210 if (unlikely(error_code & PFERR_RSVD_MASK)) 3317 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3211 return handle_mmio_page_fault(vcpu, gva, error_code, true); 3318 r = handle_mmio_page_fault(vcpu, gva, error_code, true);
3319
3320 if (likely(r != RET_MMIO_PF_INVALID))
3321 return r;
3322 }
3212 3323
3213 r = mmu_topup_memory_caches(vcpu); 3324 r = mmu_topup_memory_caches(vcpu);
3214 if (r) 3325 if (r)
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3284 ASSERT(vcpu); 3395 ASSERT(vcpu);
3285 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3396 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3286 3397
3287 if (unlikely(error_code & PFERR_RSVD_MASK)) 3398 if (unlikely(error_code & PFERR_RSVD_MASK)) {
3288 return handle_mmio_page_fault(vcpu, gpa, error_code, true); 3399 r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
3400
3401 if (likely(r != RET_MMIO_PF_INVALID))
3402 return r;
3403 }
3289 3404
3290 r = mmu_topup_memory_caches(vcpu); 3405 r = mmu_topup_memory_caches(vcpu);
3291 if (r) 3406 if (r)
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3391 *access &= mask; 3506 *access &= mask;
3392} 3507}
3393 3508
3394static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, 3509static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3395 int *nr_present) 3510 unsigned access, int *nr_present)
3396{ 3511{
3397 if (unlikely(is_mmio_spte(*sptep))) { 3512 if (unlikely(is_mmio_spte(*sptep))) {
3398 if (gfn != get_mmio_spte_gfn(*sptep)) { 3513 if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3401 } 3516 }
3402 3517
3403 (*nr_present)++; 3518 (*nr_present)++;
3404 mark_mmio_spte(sptep, gfn, access); 3519 mark_mmio_spte(kvm, sptep, gfn, access);
3405 return true; 3520 return true;
3406 } 3521 }
3407 3522
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
3764 if (r) 3879 if (r)
3765 goto out; 3880 goto out;
3766 r = mmu_alloc_roots(vcpu); 3881 r = mmu_alloc_roots(vcpu);
3767 spin_lock(&vcpu->kvm->mmu_lock); 3882 kvm_mmu_sync_roots(vcpu);
3768 mmu_sync_roots(vcpu);
3769 spin_unlock(&vcpu->kvm->mmu_lock);
3770 if (r) 3883 if (r)
3771 goto out; 3884 goto out;
3772 /* set_cr3() should ensure TLB has been flushed */ 3885 /* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4179 spin_unlock(&kvm->mmu_lock); 4292 spin_unlock(&kvm->mmu_lock);
4180} 4293}
4181 4294
4182void kvm_mmu_zap_all(struct kvm *kvm) 4295#define BATCH_ZAP_PAGES 10
4296static void kvm_zap_obsolete_pages(struct kvm *kvm)
4183{ 4297{
4184 struct kvm_mmu_page *sp, *node; 4298 struct kvm_mmu_page *sp, *node;
4185 LIST_HEAD(invalid_list); 4299 int batch = 0;
4186 4300
4187 spin_lock(&kvm->mmu_lock);
4188restart: 4301restart:
4189 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 4302 list_for_each_entry_safe_reverse(sp, node,
4190 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) 4303 &kvm->arch.active_mmu_pages, link) {
4304 int ret;
4305
4306 /*
4307 * No obsolete page exists before new created page since
4308 * active_mmu_pages is the FIFO list.
4309 */
4310 if (!is_obsolete_sp(kvm, sp))
4311 break;
4312
4313 /*
4314 * Since we are reversely walking the list and the invalid
4315 * list will be moved to the head, skip the invalid page
4316 * can help us to avoid the infinity list walking.
4317 */
4318 if (sp->role.invalid)
4319 continue;
4320
4321 /*
4322		 * There is no need to flush the TLB, since we only zap shadow
4323		 * pages with an invalid generation number.
4324 */
4325 if (batch >= BATCH_ZAP_PAGES &&
4326 cond_resched_lock(&kvm->mmu_lock)) {
4327 batch = 0;
4328 goto restart;
4329 }
4330
4331 ret = kvm_mmu_prepare_zap_page(kvm, sp,
4332 &kvm->arch.zapped_obsolete_pages);
4333 batch += ret;
4334
4335 if (ret)
4191 goto restart; 4336 goto restart;
4337 }
4192 4338
4193 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4339 /*
4194	spin_unlock(&kvm->mmu_lock);	4340	 * We should flush the TLB before freeing the page tables, since
4341	 * lockless walkers may still be using the pages.
4342 */
4343 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
4195} 4344}
4196 4345
4197void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) 4346/*
4347 * Fast invalidate all shadow pages and use lock-break technique
4348 * to zap obsolete pages.
4349 *
4350 * It is required when a memslot is being deleted or the VM is being
4351 * destroyed; in those cases we must ensure that the KVM MMU does
4352 * not use any resource of the slot being deleted (or of any slot)
4353 * after this function has been called.
4354 */
4355void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
4198{ 4356{
4199 struct kvm_mmu_page *sp, *node;
4200 LIST_HEAD(invalid_list);
4201
4202 spin_lock(&kvm->mmu_lock); 4357 spin_lock(&kvm->mmu_lock);
4203restart: 4358 trace_kvm_mmu_invalidate_zap_all_pages(kvm);
4204 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { 4359 kvm->arch.mmu_valid_gen++;
4205 if (!sp->mmio_cached)
4206 continue;
4207 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
4208 goto restart;
4209 }
4210 4360
4211 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4361 /*
4362	 * Notify all vcpus to reload their shadow page tables
4363	 * and flush the TLB. Then all vcpus will switch to the new
4364	 * shadow page tables with the new mmu_valid_gen.
4365	 *
4366	 * Note: we should do this under the protection of the
4367	 * mmu-lock; otherwise, a vcpu could purge shadow pages
4368	 * but miss the TLB flush.
4369 */
4370 kvm_reload_remote_mmus(kvm);
4371
4372 kvm_zap_obsolete_pages(kvm);
4212 spin_unlock(&kvm->mmu_lock); 4373 spin_unlock(&kvm->mmu_lock);
4213} 4374}
4214 4375
4376static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
4377{
4378 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
4379}
4380
4381void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
4382{
4383 /*
4384	 * The very rare case: if the generation number wraps around,
4385	 * zap all shadow pages.
4386	 *
4387	 * The max value is MMIO_MAX_GEN - 1, since this function is not
4388	 * called while a memslot is being marked invalid.
4389 */
4390 if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
4391 printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
4392 kvm_mmu_invalidate_zap_all_pages(kvm);
4393 }
4394}
4395
4215static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4396static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4216{ 4397{
4217 struct kvm *kvm; 4398 struct kvm *kvm;
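The kvm_zap_obsolete_pages()/kvm_mmu_invalidate_zap_all_pages() hunks above implement "zap all" as a generation bump plus lazy, batched zapping with a lock break. A toy model of that idea, with all names invented and the real locking replaced by a printf:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NPAGES     8
#define BATCH_ZAP  3	/* stands in for BATCH_ZAP_PAGES */

struct toy_sp {
	unsigned long mmu_valid_gen;
	bool zapped;
};

static struct toy_sp pages[NPAGES];
static unsigned long mmu_valid_gen;

static bool is_obsolete(const struct toy_sp *sp)
{
	return sp->mmu_valid_gen != mmu_valid_gen;
}

/* O(1) invalidation: bump the generation; every existing page becomes obsolete. */
static void invalidate_all(void)
{
	mmu_valid_gen++;
}

/* Lazy cleanup in batches; the print marks where cond_resched_lock() would drop the lock. */
static void zap_obsolete(void)
{
	int batch = 0;

	for (size_t i = 0; i < NPAGES; i++) {
		if (!is_obsolete(&pages[i]) || pages[i].zapped)
			continue;
		pages[i].zapped = true;
		if (++batch >= BATCH_ZAP) {
			printf("lock break after %d pages\n", batch);
			batch = 0;
		}
	}
}

int main(void)
{
	for (size_t i = 0; i < NPAGES; i++)
		pages[i].mmu_valid_gen = mmu_valid_gen;

	invalidate_all();
	zap_obsolete();

	for (size_t i = 0; i < NPAGES; i++)
		printf("page %zu zapped=%d\n", i, pages[i].zapped);
	return 0;
}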
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4240 * want to shrink a VM that only started to populate its MMU 4421 * want to shrink a VM that only started to populate its MMU
4241 * anyway. 4422 * anyway.
4242 */ 4423 */
4243 if (!kvm->arch.n_used_mmu_pages) 4424 if (!kvm->arch.n_used_mmu_pages &&
4425 !kvm_has_zapped_obsolete_pages(kvm))
4244 continue; 4426 continue;
4245 4427
4246 idx = srcu_read_lock(&kvm->srcu); 4428 idx = srcu_read_lock(&kvm->srcu);
4247 spin_lock(&kvm->mmu_lock); 4429 spin_lock(&kvm->mmu_lock);
4248 4430
4431 if (kvm_has_zapped_obsolete_pages(kvm)) {
4432 kvm_mmu_commit_zap_page(kvm,
4433 &kvm->arch.zapped_obsolete_pages);
4434 goto unlock;
4435 }
4436
4249 prepare_zap_oldest_mmu_page(kvm, &invalid_list); 4437 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
4250 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4438 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4251 4439
4440unlock:
4252 spin_unlock(&kvm->mmu_lock); 4441 spin_unlock(&kvm->mmu_lock);
4253 srcu_read_unlock(&kvm->srcu, idx); 4442 srcu_read_unlock(&kvm->srcu, idx);
4254 4443
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2cac6d..5b59c573aba7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,23 @@
52 52
53int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 53int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
54void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); 54void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
55
56/*
57 * Return values of handle_mmio_page_fault_common:
58 * RET_MMIO_PF_EMULATE: it is a real mmio page fault; emulate the
59 * instruction directly.
60 * RET_MMIO_PF_INVALID: an invalid spte was detected; let the real page
61 * fault path update the mmio spte.
62 * RET_MMIO_PF_RETRY: let the CPU fault again on the address.
63 * RET_MMIO_PF_BUG: a bug was detected.
64 */
65enum {
66 RET_MMIO_PF_EMULATE = 1,
67 RET_MMIO_PF_INVALID = 2,
68 RET_MMIO_PF_RETRY = 0,
69 RET_MMIO_PF_BUG = -1
70};
71
55int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
56int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
57 74
@@ -97,4 +114,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
97 return (mmu->permissions[pfec >> 1] >> pte_access) & 1; 114 return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
98} 115}
99 116
117void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
100#endif 118#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172f4174..9d2e0ffcb190 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9 9
10#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
11 __field(__u64, gfn) \ 11 __field(unsigned long, mmu_valid_gen) \
12 __field(__u32, role) \ 12 __field(__u64, gfn) \
13 __field(__u32, root_count) \ 13 __field(__u32, role) \
14 __field(__u32, root_count) \
14 __field(bool, unsync) 15 __field(bool, unsync)
15 16
16#define KVM_MMU_PAGE_ASSIGN(sp) \ 17#define KVM_MMU_PAGE_ASSIGN(sp) \
17 __entry->gfn = sp->gfn; \ 18 __entry->mmu_valid_gen = sp->mmu_valid_gen; \
18 __entry->role = sp->role.word; \ 19 __entry->gfn = sp->gfn; \
19 __entry->root_count = sp->root_count; \ 20 __entry->role = sp->role.word; \
21 __entry->root_count = sp->root_count; \
20 __entry->unsync = sp->unsync; 22 __entry->unsync = sp->unsync;
21 23
22#define KVM_MMU_PAGE_PRINTK() ({ \ 24#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -28,8 +30,8 @@
28 \ 30 \
29 role.word = __entry->role; \ 31 role.word = __entry->role; \
30 \ 32 \
31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ 33 trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
32 " %snxe root %u %s%c", \ 34 " %snxe root %u %s%c", __entry->mmu_valid_gen, \
33 __entry->gfn, role.level, \ 35 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \ 36 role.cr4_pae ? " pae" : "", \
35 role.quadrant, \ 37 role.quadrant, \
@@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
197 199
198TRACE_EVENT( 200TRACE_EVENT(
199 mark_mmio_spte, 201 mark_mmio_spte,
200 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), 202 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
201 TP_ARGS(sptep, gfn, access), 203 TP_ARGS(sptep, gfn, access, gen),
202 204
203 TP_STRUCT__entry( 205 TP_STRUCT__entry(
204 __field(void *, sptep) 206 __field(void *, sptep)
205 __field(gfn_t, gfn) 207 __field(gfn_t, gfn)
206 __field(unsigned, access) 208 __field(unsigned, access)
209 __field(unsigned int, gen)
207 ), 210 ),
208 211
209 TP_fast_assign( 212 TP_fast_assign(
210 __entry->sptep = sptep; 213 __entry->sptep = sptep;
211 __entry->gfn = gfn; 214 __entry->gfn = gfn;
212 __entry->access = access; 215 __entry->access = access;
216 __entry->gen = gen;
213 ), 217 ),
214 218
215 TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, 219 TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
216 __entry->access) 220 __entry->gfn, __entry->access, __entry->gen)
217); 221);
218 222
219TRACE_EVENT( 223TRACE_EVENT(
@@ -274,6 +278,50 @@ TRACE_EVENT(
274 __spte_satisfied(old_spte), __spte_satisfied(new_spte) 278 __spte_satisfied(old_spte), __spte_satisfied(new_spte)
275 ) 279 )
276); 280);
281
282TRACE_EVENT(
283 kvm_mmu_invalidate_zap_all_pages,
284 TP_PROTO(struct kvm *kvm),
285 TP_ARGS(kvm),
286
287 TP_STRUCT__entry(
288 __field(unsigned long, mmu_valid_gen)
289 __field(unsigned int, mmu_used_pages)
290 ),
291
292 TP_fast_assign(
293 __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
294 __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
295 ),
296
297 TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
298 __entry->mmu_valid_gen, __entry->mmu_used_pages
299 )
300);
301
302
303TRACE_EVENT(
304 check_mmio_spte,
305 TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
306 TP_ARGS(spte, kvm_gen, spte_gen),
307
308 TP_STRUCT__entry(
309 __field(unsigned int, kvm_gen)
310 __field(unsigned int, spte_gen)
311 __field(u64, spte)
312 ),
313
314 TP_fast_assign(
315 __entry->kvm_gen = kvm_gen;
316 __entry->spte_gen = spte_gen;
317 __entry->spte = spte;
318 ),
319
320 TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
321 __entry->kvm_gen, __entry->spte_gen,
322 __entry->kvm_gen == __entry->spte_gen
323 )
324);
277#endif /* _TRACE_KVMMMU_H */ 325#endif /* _TRACE_KVMMMU_H */
278 326
279#undef TRACE_INCLUDE_PATH 327#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860b457a..7769699d48a8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
552 552
553 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 553 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
554 554
555 if (unlikely(error_code & PFERR_RSVD_MASK)) 555 if (unlikely(error_code & PFERR_RSVD_MASK)) {
556 return handle_mmio_page_fault(vcpu, addr, error_code, 556 r = handle_mmio_page_fault(vcpu, addr, error_code,
557 mmu_is_nested(vcpu)); 557 mmu_is_nested(vcpu));
558 if (likely(r != RET_MMIO_PF_INVALID))
559 return r;
560 };
558 561
559 r = mmu_topup_memory_caches(vcpu); 562 r = mmu_topup_memory_caches(vcpu);
560 if (r) 563 if (r)
@@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
792 pte_access &= gpte_access(vcpu, gpte); 795 pte_access &= gpte_access(vcpu, gpte);
793 protect_clean_gpte(&pte_access, gpte); 796 protect_clean_gpte(&pte_access, gpte);
794 797
795 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) 798 if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
799 &nr_present))
796 continue; 800 continue;
797 801
798 if (gfn != sp->gfns[i]) { 802 if (gfn != sp->gfns[i]) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6eaf871d..c0bc80391e40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1026 g_tsc_offset = svm->vmcb->control.tsc_offset - 1026 g_tsc_offset = svm->vmcb->control.tsc_offset -
1027 svm->nested.hsave->control.tsc_offset; 1027 svm->nested.hsave->control.tsc_offset;
1028 svm->nested.hsave->control.tsc_offset = offset; 1028 svm->nested.hsave->control.tsc_offset = offset;
1029 } 1029 } else
1030 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1031 svm->vmcb->control.tsc_offset,
1032 offset);
1030 1033
1031 svm->vmcb->control.tsc_offset = offset + g_tsc_offset; 1034 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1032 1035
@@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
1044 svm->vmcb->control.tsc_offset += adjustment; 1047 svm->vmcb->control.tsc_offset += adjustment;
1045 if (is_guest_mode(vcpu)) 1048 if (is_guest_mode(vcpu))
1046 svm->nested.hsave->control.tsc_offset += adjustment; 1049 svm->nested.hsave->control.tsc_offset += adjustment;
1050 else
1051 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1052 svm->vmcb->control.tsc_offset - adjustment,
1053 svm->vmcb->control.tsc_offset);
1054
1047 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1055 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1048} 1056}
1049 1057
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fe5e00ed7036..545245d7cc63 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -756,6 +756,27 @@ TRACE_EVENT(
756 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
757); 757);
758 758
759TRACE_EVENT(kvm_write_tsc_offset,
760 TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
761 __u64 next_tsc_offset),
762 TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
763
764 TP_STRUCT__entry(
765 __field( unsigned int, vcpu_id )
766 __field( __u64, previous_tsc_offset )
767 __field( __u64, next_tsc_offset )
768 ),
769
770 TP_fast_assign(
771 __entry->vcpu_id = vcpu_id;
772 __entry->previous_tsc_offset = previous_tsc_offset;
773 __entry->next_tsc_offset = next_tsc_offset;
774 ),
775
776 TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
777 __entry->previous_tsc_offset, __entry->next_tsc_offset)
778);
779
759#ifdef CONFIG_X86_64 780#ifdef CONFIG_X86_64
760 781
761#define host_clocks \ 782#define host_clocks \
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b30f5a54a2ab..a7e18551c968 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2096 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 2096 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2097 vmcs12->tsc_offset : 0)); 2097 vmcs12->tsc_offset : 0));
2098 } else { 2098 } else {
2099 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2100 vmcs_read64(TSC_OFFSET), offset);
2099 vmcs_write64(TSC_OFFSET, offset); 2101 vmcs_write64(TSC_OFFSET, offset);
2100 } 2102 }
2101} 2103}
@@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2103static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2105static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2104{ 2106{
2105 u64 offset = vmcs_read64(TSC_OFFSET); 2107 u64 offset = vmcs_read64(TSC_OFFSET);
2108
2106 vmcs_write64(TSC_OFFSET, offset + adjustment); 2109 vmcs_write64(TSC_OFFSET, offset + adjustment);
2107 if (is_guest_mode(vcpu)) { 2110 if (is_guest_mode(vcpu)) {
2108 /* Even when running L2, the adjustment needs to apply to L1 */ 2111 /* Even when running L2, the adjustment needs to apply to L1 */
2109 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2112 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2110 } 2113 } else
2114 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2115 offset + adjustment);
2111} 2116}
2112 2117
2113static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2118static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -4176,10 +4181,10 @@ static void ept_set_mmio_spte_mask(void)
4176 /* 4181 /*
4177 * EPT Misconfigurations can be generated if the value of bits 2:0 4182 * EPT Misconfigurations can be generated if the value of bits 2:0
4178 * of an EPT paging-structure entry is 110b (write/execute). 4183 * of an EPT paging-structure entry is 110b (write/execute).
4179 * Also, magic bits (0xffull << 49) is set to quickly identify mmio 4184 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
4180 * spte. 4185 * spte.
4181 */ 4186 */
4182 kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); 4187 kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
4183} 4188}
4184 4189
4185/* 4190/*
@@ -5366,10 +5371,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5366 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5371 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5367 5372
5368 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5373 ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5369 if (likely(ret == 1)) 5374 if (likely(ret == RET_MMIO_PF_EMULATE))
5370 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5375 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5371 EMULATE_DONE; 5376 EMULATE_DONE;
5372 if (unlikely(!ret)) 5377
5378 if (unlikely(ret == RET_MMIO_PF_INVALID))
5379 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
5380
5381 if (unlikely(ret == RET_MMIO_PF_RETRY))
5373 return 1; 5382 return 1;
5374 5383
5375 /* It is the real ept misconfig */ 5384 /* It is the real ept misconfig */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 292e6ca89f42..d21bce505315 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1193,20 +1193,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1193 elapsed = ns - kvm->arch.last_tsc_nsec; 1193 elapsed = ns - kvm->arch.last_tsc_nsec;
1194 1194
1195 if (vcpu->arch.virtual_tsc_khz) { 1195 if (vcpu->arch.virtual_tsc_khz) {
1196 int faulted = 0;
1197
1196 /* n.b - signed multiplication and division required */ 1198 /* n.b - signed multiplication and division required */
1197 usdiff = data - kvm->arch.last_tsc_write; 1199 usdiff = data - kvm->arch.last_tsc_write;
1198#ifdef CONFIG_X86_64 1200#ifdef CONFIG_X86_64
1199 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1201 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1200#else 1202#else
1201 /* do_div() only does unsigned */ 1203 /* do_div() only does unsigned */
1202 asm("idivl %2; xor %%edx, %%edx" 1204 asm("1: idivl %[divisor]\n"
1203 : "=A"(usdiff) 1205 "2: xor %%edx, %%edx\n"
1204 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1206 " movl $0, %[faulted]\n"
1207 "3:\n"
1208 ".section .fixup,\"ax\"\n"
1209 "4: movl $1, %[faulted]\n"
1210 " jmp 3b\n"
1211 ".previous\n"
1212
1213 _ASM_EXTABLE(1b, 4b)
1214
1215 : "=A"(usdiff), [faulted] "=r" (faulted)
1216 : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
1217
1205#endif 1218#endif
1206 do_div(elapsed, 1000); 1219 do_div(elapsed, 1000);
1207 usdiff -= elapsed; 1220 usdiff -= elapsed;
1208 if (usdiff < 0) 1221 if (usdiff < 0)
1209 usdiff = -usdiff; 1222 usdiff = -usdiff;
1223
1224 /* idivl overflow => difference is larger than USEC_PER_SEC */
1225 if (faulted)
1226 usdiff = USEC_PER_SEC;
1210 } else 1227 } else
1211 usdiff = USEC_PER_SEC; /* disable TSC match window below */ 1228 usdiff = USEC_PER_SEC; /* disable TSC match window below */
1212 1229
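
The 32-bit branch above replaces a bare idivl with an exception-table fixup: dividing the 64-bit scaled delta by a 32-bit divisor raises #DE when the quotient does not fit, and the fixup turns that into "treat the difference as at least one second". A portable sketch of the same semantics in plain C, with illustrative names:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000LL

static int64_t tsc_delta_to_usec(int64_t delta_ticks, uint32_t tsc_khz)
{
        int64_t scaled = delta_ticks * 1000;    /* ticks * 1000 / kHz = usec */
        int64_t usec = scaled / tsc_khz;

        /* A 32-bit idivl would fault here instead of returning this value. */
        if (usec > INT32_MAX || usec < INT32_MIN)
                return USEC_PER_SEC;            /* "more than one second apart" */

        return usec < 0 ? -usec : usec;
}

int main(void)
{
        /* 2.6M ticks at 2.6 GHz is about 1000 microseconds. */
        printf("%" PRId64 " us\n", tsc_delta_to_usec(2600000, 2600000));

        /* A huge delta is clamped rather than faulting. */
        printf("%" PRId64 " us\n", tsc_delta_to_usec(1LL << 50, 2600000));
        return 0;
}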
@@ -1587,6 +1604,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1587 return 0; 1604 return 0;
1588} 1605}
1589 1606
1607/*
1608 * kvmclock updates that are isolated to a given vcpu, such as a
1609 * vcpu->cpu migration, should not let the system_timestamp of
1610 * the remaining vcpus go stale. Otherwise NTP frequency
1611 * correction would apply to one vcpu's system_timestamp but
1612 * not to the others.
1613 *
1614 * So in those cases, request a kvmclock update for all vcpus.
1615 * The worst-case time for a remote vcpu to update its kvmclock
1616 * is then bounded by the maximum nohz sleep latency.
1617 */
1618
1619static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1620{
1621 int i;
1622 struct kvm *kvm = v->kvm;
1623 struct kvm_vcpu *vcpu;
1624
1625 kvm_for_each_vcpu(i, vcpu, kvm) {
1626 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
1627 kvm_vcpu_kick(vcpu);
1628 }
1629}
1630
1590static bool msr_mtrr_valid(unsigned msr) 1631static bool msr_mtrr_valid(unsigned msr)
1591{ 1632{
1592 switch (msr) { 1633 switch (msr) {
@@ -1984,7 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1984 kvmclock_reset(vcpu); 2025 kvmclock_reset(vcpu);
1985 2026
1986 vcpu->arch.time = data; 2027 vcpu->arch.time = data;
1987 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2028 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
1988 2029
1989 /* we verify if the enable bit is set... */ 2030 /* we verify if the enable bit is set... */
1990 if (!(data & 1)) 2031 if (!(data & 1))
@@ -2701,7 +2742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2701 * kvmclock on vcpu->cpu migration 2742 * kvmclock on vcpu->cpu migration
2702 */ 2743 */
2703 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 2744 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2704 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2745 kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2705 if (vcpu->cpu != cpu) 2746 if (vcpu->cpu != cpu)
2706 kvm_migrate_timers(vcpu); 2747 kvm_migrate_timers(vcpu);
2707 vcpu->cpu = cpu; 2748 vcpu->cpu = cpu;
@@ -5238,7 +5279,13 @@ static void kvm_set_mmio_spte_mask(void)
5238 * Set the reserved bits and the present bit of an paging-structure 5279 * Set the reserved bits and the present bit of an paging-structure
5239 * entry to generate page fault with PFER.RSV = 1. 5280 * entry to generate page fault with PFER.RSV = 1.
5240 */ 5281 */
5241 mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; 5282 /* Mask the reserved physical address bits. */
5283 mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
5284
5285 /* Bit 62 is always reserved for 32bit host. */
5286 mask |= 0x3ull << 62;
5287
5288 /* Set the present bit. */
5242 mask |= 1ull; 5289 mask |= 1ull;
5243 5290
5244#ifdef CONFIG_X86_64 5291#ifdef CONFIG_X86_64
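
The new mask above is assembled from three parts: the reserved physical-address bits between maxphyaddr and bit 51, bits 62-63 (bit 62 is always reserved for a 32-bit host), and the present bit, so that an access through such an spte faults with the reserved-bit error code. A standalone sketch of the same computation; the CONFIG_X86_64 special case that follows in the real function is omitted here.

#include <stdint.h>
#include <stdio.h>

static uint64_t mmio_spte_mask(unsigned int maxphyaddr)
{
        uint64_t mask;

        /* Reserved physical address bits: [maxphyaddr, 51]. */
        mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;

        /* Bits 62 and 63; bit 62 is always reserved on a 32-bit host. */
        mask |= 0x3ull << 62;

        /* The present bit, so the access faults with RSV set. */
        mask |= 1ull;

        return mask;
}

int main(void)
{
        printf("maxphyaddr 40 -> mask %#llx\n",
               (unsigned long long)mmio_spte_mask(40));
        printf("maxphyaddr 46 -> mask %#llx\n",
               (unsigned long long)mmio_spte_mask(46));
        return 0;
}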
@@ -5498,13 +5545,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5498 char instruction[3]; 5545 char instruction[3];
5499 unsigned long rip = kvm_rip_read(vcpu); 5546 unsigned long rip = kvm_rip_read(vcpu);
5500 5547
5501 /*
5502 * Blow out the MMU to ensure that no other VCPU has an active mapping
5503 * to ensure that the updated hypercall appears atomically across all
5504 * VCPUs.
5505 */
5506 kvm_mmu_zap_all(vcpu->kvm);
5507
5508 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5548 kvm_x86_ops->patch_hypercall(vcpu, instruction);
5509 5549
5510 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); 5550 return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
@@ -5702,6 +5742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5702 __kvm_migrate_timers(vcpu); 5742 __kvm_migrate_timers(vcpu);
5703 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 5743 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5704 kvm_gen_update_masterclock(vcpu->kvm); 5744 kvm_gen_update_masterclock(vcpu->kvm);
5745 if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
5746 kvm_gen_kvmclock_update(vcpu);
5705 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5747 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5706 r = kvm_guest_time_update(vcpu); 5748 r = kvm_guest_time_update(vcpu);
5707 if (unlikely(r)) 5749 if (unlikely(r))
@@ -6812,6 +6854,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6812 return -EINVAL; 6854 return -EINVAL;
6813 6855
6814 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6856 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6857 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
6815 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6858 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
6816 6859
6817 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6860 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -7040,22 +7083,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
7040 * If memory slot is created, or moved, we need to clear all 7083 * If memory slot is created, or moved, we need to clear all
7041 * mmio sptes. 7084 * mmio sptes.
7042 */ 7085 */
7043 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 7086 kvm_mmu_invalidate_mmio_sptes(kvm);
7044 kvm_mmu_zap_mmio_sptes(kvm);
7045 kvm_reload_remote_mmus(kvm);
7046 }
7047} 7087}
7048 7088
7049void kvm_arch_flush_shadow_all(struct kvm *kvm) 7089void kvm_arch_flush_shadow_all(struct kvm *kvm)
7050{ 7090{
7051 kvm_mmu_zap_all(kvm); 7091 kvm_mmu_invalidate_zap_all_pages(kvm);
7052 kvm_reload_remote_mmus(kvm);
7053} 7092}
7054 7093
7055void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 7094void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7056 struct kvm_memory_slot *slot) 7095 struct kvm_memory_slot *slot)
7057{ 7096{
7058 kvm_arch_flush_shadow_all(kvm); 7097 kvm_mmu_invalidate_zap_all_pages(kvm);
7059} 7098}
7060 7099
7061int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 7100int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
@@ -7263,3 +7302,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7263EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 7302EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7264EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 7303EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7265EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); 7304EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7305EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
diff --git a/arch/arm/include/asm/kvm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 68cb9e1dfb81..6d9aeddc09bf 100644
--- a/arch/arm/include/asm/kvm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -61,6 +61,8 @@ struct arch_timer_cpu {
61#ifdef CONFIG_KVM_ARM_TIMER 61#ifdef CONFIG_KVM_ARM_TIMER
62int kvm_timer_hyp_init(void); 62int kvm_timer_hyp_init(void);
63int kvm_timer_init(struct kvm *kvm); 63int kvm_timer_init(struct kvm *kvm);
64void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
65 const struct kvm_irq_level *irq);
64void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); 66void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
65void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); 67void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
66void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); 68void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
@@ -76,6 +78,8 @@ static inline int kvm_timer_init(struct kvm *kvm)
76 return 0; 78 return 0;
77} 79}
78 80
81static inline void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
82 const struct kvm_irq_level *irq) {}
79static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {} 83static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {}
80static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {} 84static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {}
81static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {} 85static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm/include/asm/kvm_vgic.h b/include/kvm/arm_vgic.h
index 343744e4809c..343744e4809c 100644
--- a/arch/arm/include/asm/kvm_vgic.h
+++ b/include/kvm/arm_vgic.h
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8db53cfaccdb..a63d83ebd151 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@ static inline bool is_error_page(struct page *page)
125#define KVM_REQ_MCLOCK_INPROGRESS 19 125#define KVM_REQ_MCLOCK_INPROGRESS 19
126#define KVM_REQ_EPR_EXIT 20 126#define KVM_REQ_EPR_EXIT 20
127#define KVM_REQ_SCAN_IOAPIC 21 127#define KVM_REQ_SCAN_IOAPIC 21
128#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
128 129
129#define KVM_USERSPACE_IRQ_SOURCE_ID 0 130#define KVM_USERSPACE_IRQ_SOURCE_ID 0
130#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 131#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
@@ -145,7 +146,8 @@ struct kvm_io_range {
145#define NR_IOBUS_DEVS 1000 146#define NR_IOBUS_DEVS 1000
146 147
147struct kvm_io_bus { 148struct kvm_io_bus {
148 int dev_count; 149 int dev_count;
150 int ioeventfd_count;
149 struct kvm_io_range range[]; 151 struct kvm_io_range range[];
150}; 152};
151 153
diff --git a/arch/arm/kvm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 49a7516d81c7..c2e1ef4604e8 100644
--- a/arch/arm/kvm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -25,14 +25,12 @@
25#include <clocksource/arm_arch_timer.h> 25#include <clocksource/arm_arch_timer.h>
26#include <asm/arch_timer.h> 26#include <asm/arch_timer.h>
27 27
28#include <asm/kvm_vgic.h> 28#include <kvm/arm_vgic.h>
29#include <asm/kvm_arch_timer.h> 29#include <kvm/arm_arch_timer.h>
30 30
31static struct timecounter *timecounter; 31static struct timecounter *timecounter;
32static struct workqueue_struct *wqueue; 32static struct workqueue_struct *wqueue;
33static struct kvm_irq_level timer_irq = { 33static unsigned int host_vtimer_irq;
34 .level = 1,
35};
36 34
37static cycle_t kvm_phys_timer_read(void) 35static cycle_t kvm_phys_timer_read(void)
38{ 36{
@@ -67,8 +65,8 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
67 65
68 timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK; 66 timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
69 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 67 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
70 vcpu->arch.timer_cpu.irq->irq, 68 timer->irq->irq,
71 vcpu->arch.timer_cpu.irq->level); 69 timer->irq->level);
72} 70}
73 71
74static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) 72static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@@ -156,6 +154,20 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
156 timer_arm(timer, ns); 154 timer_arm(timer, ns);
157} 155}
158 156
157void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
158 const struct kvm_irq_level *irq)
159{
160 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
161
162 /*
163 * The vcpu timer irq number cannot be determined in
164 * kvm_timer_vcpu_init() because it is called well before
165 * kvm_vcpu_set_target(). To handle this, we determine the
166 * vcpu timer irq number when the vcpu is reset.
167 */
168 timer->irq = irq;
169}
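
kvm_timer_vcpu_reset() replaces the old global timer_irq with an irq that is handed in per vcpu at reset time. The caller lives in the ARM reset code rather than in this hunk; a hedged sketch of what such a caller could look like, where the irq number and the identifiers are illustrative only:

/* Illustrative values, not the ones used by the real reset path. */
static const struct kvm_irq_level example_vtimer_irq = {
        .irq    = 27,           /* virtual timer PPI, for illustration */
        .level  = 1,
};

static void example_vcpu_reset(struct kvm_vcpu *vcpu)
{
        /* The vcpu target is known by now, so hand the timer its irq. */
        kvm_timer_vcpu_reset(vcpu, &example_vtimer_irq);
}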
170
159void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) 171void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
160{ 172{
161 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 173 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -163,12 +175,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
163 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); 175 INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
164 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 176 hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
165 timer->timer.function = kvm_timer_expire; 177 timer->timer.function = kvm_timer_expire;
166 timer->irq = &timer_irq;
167} 178}
168 179
169static void kvm_timer_init_interrupt(void *info) 180static void kvm_timer_init_interrupt(void *info)
170{ 181{
171 enable_percpu_irq(timer_irq.irq, 0); 182 enable_percpu_irq(host_vtimer_irq, 0);
172} 183}
173 184
174 185
@@ -182,7 +193,7 @@ static int kvm_timer_cpu_notify(struct notifier_block *self,
182 break; 193 break;
183 case CPU_DYING: 194 case CPU_DYING:
184 case CPU_DYING_FROZEN: 195 case CPU_DYING_FROZEN:
185 disable_percpu_irq(timer_irq.irq); 196 disable_percpu_irq(host_vtimer_irq);
186 break; 197 break;
187 } 198 }
188 199
@@ -230,7 +241,7 @@ int kvm_timer_hyp_init(void)
230 goto out; 241 goto out;
231 } 242 }
232 243
233 timer_irq.irq = ppi; 244 host_vtimer_irq = ppi;
234 245
235 err = register_cpu_notifier(&kvm_timer_cpu_nb); 246 err = register_cpu_notifier(&kvm_timer_cpu_nb);
236 if (err) { 247 if (err) {
diff --git a/arch/arm/kvm/vgic.c b/virt/kvm/arm/vgic.c
index 17c5ac7d10ed..17c5ac7d10ed 100644
--- a/arch/arm/kvm/vgic.c
+++ b/virt/kvm/arm/vgic.c
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 64ee720b75c7..1550637d1b10 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -753,6 +753,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
753 if (ret < 0) 753 if (ret < 0)
754 goto unlock_fail; 754 goto unlock_fail;
755 755
756 kvm->buses[bus_idx]->ioeventfd_count++;
756 list_add_tail(&p->list, &kvm->ioeventfds); 757 list_add_tail(&p->list, &kvm->ioeventfds);
757 758
758 mutex_unlock(&kvm->slots_lock); 759 mutex_unlock(&kvm->slots_lock);
@@ -798,6 +799,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
798 continue; 799 continue;
799 800
800 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); 801 kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
802 kvm->buses[bus_idx]->ioeventfd_count--;
801 ioeventfd_release(p); 803 ioeventfd_release(p);
802 ret = 0; 804 ret = 0;
803 break; 805 break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 302681c4aa44..1580dd4ace4e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2926,7 +2926,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2926 struct kvm_io_bus *new_bus, *bus; 2926 struct kvm_io_bus *new_bus, *bus;
2927 2927
2928 bus = kvm->buses[bus_idx]; 2928 bus = kvm->buses[bus_idx];
2929 if (bus->dev_count > NR_IOBUS_DEVS - 1) 2929 /* exclude ioeventfd which is limited by maximum fd */
2930 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
2930 return -ENOSPC; 2931 return -ENOSPC;
2931 2932
2932 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 2933 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
@@ -3181,6 +3182,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
3181 3182
3182out_undebugfs: 3183out_undebugfs:
3183 unregister_syscore_ops(&kvm_syscore_ops); 3184 unregister_syscore_ops(&kvm_syscore_ops);
3185 misc_deregister(&kvm_dev);
3184out_unreg: 3186out_unreg:
3185 kvm_async_pf_deinit(); 3187 kvm_async_pf_deinit();
3186out_free: 3188out_free: