author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-03 16:21:40 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-03 16:21:40 -0400
commit	fe489bf4505ae26d3c6d6a1f1d3064c2a9c5cd85 (patch)
tree	46596fd7edf7c4da1dafdb2c62011841e71cf32d /arch
parent	3e34131a65127e73fbae68c82748f32c8af7e4a4 (diff)
parent	a3ff5fbc94a829680d4aa005cd17add1c1a1fb5b (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini:
 "On the x86 side, there are some optimizations and documentation
  updates. The big ARM/KVM change for 3.11, support for AArch64, will
  come through Catalin Marinas's tree. s390 and PPC have misc cleanups
  and bugfixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (87 commits)
  KVM: PPC: Ignore PIR writes
  KVM: PPC: Book3S PR: Invalidate SLB entries properly
  KVM: PPC: Book3S PR: Allow guest to use 1TB segments
  KVM: PPC: Book3S PR: Don't keep scanning HPTEG after we find a match
  KVM: PPC: Book3S PR: Fix invalidation of SLB entry 0 on guest entry
  KVM: PPC: Book3S PR: Fix proto-VSID calculations
  KVM: PPC: Guard doorbell exception with CONFIG_PPC_DOORBELL
  KVM: Fix RTC interrupt coalescing tracking
  kvm: Add a tracepoint write_tsc_offset
  KVM: MMU: Inform users of mmio generation wraparound
  KVM: MMU: document fast invalidate all mmio sptes
  KVM: MMU: document fast invalidate all pages
  KVM: MMU: document fast page fault
  KVM: MMU: document mmio page fault
  KVM: MMU: document write_flooding_count
  KVM: MMU: document clear_spte_count
  KVM: MMU: drop kvm_mmu_zap_mmio_sptes
  KVM: MMU: init kvm generation close to mmio wrap-around value
  KVM: MMU: add tracepoint for check_mmio_spte
  KVM: MMU: fast invalidate all mmio sptes
  ...
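Several of the MMU entries above ("fast invalidate all mmio sptes", "init kvm generation close to mmio wrap-around value") revolve around stamping cached entries with a generation number so that all of them can be retired in O(1) by bumping a global counter. Below is a minimal, self-contained C sketch of that general technique; the names and data layout are illustrative only and do not match KVM's actual spte encoding.

```c
/*
 * Generation-tagged cache invalidation, sketched. Each entry records
 * the generation that was current when it was created; a stale
 * generation makes the entry read as a miss, so one increment of the
 * global counter invalidates everything lazily.
 */
#include <stdbool.h>
#include <stdint.h>

struct mmio_cache_entry {
	uint64_t gpa;		/* guest physical address (illustrative) */
	uint64_t generation;	/* generation at creation time */
};

static uint64_t current_generation;

static void cache_insert(struct mmio_cache_entry *e, uint64_t gpa)
{
	e->gpa = gpa;
	e->generation = current_generation;
}

static bool cache_entry_valid(const struct mmio_cache_entry *e)
{
	/* Stale generation => treat as a miss; no per-entry walk needed. */
	return e->generation == current_generation;
}

static void cache_invalidate_all(void)
{
	/* O(1): every existing entry now fails the generation check. */
	current_generation++;
}
```

Storing the generation in each entry trades one compare on every lookup for a constant-time global flush, which is the trade-off the series above documents (including what happens when the counter wraps around).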
Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/include/asm/kvm_arch_timer.h | 85
-rw-r--r--  arch/arm/include/asm/kvm_arm.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_asm.h | 24
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h | 5
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 13
-rw-r--r--  arch/arm/include/asm/kvm_vgic.h | 220
-rw-r--r--  arch/arm/kvm/Kconfig | 8
-rw-r--r--  arch/arm/kvm/Makefile | 7
-rw-r--r--  arch/arm/kvm/arch_timer.c | 273
-rw-r--r--  arch/arm/kvm/arm.c | 8
-rw-r--r--  arch/arm/kvm/coproc.c | 4
-rw-r--r--  arch/arm/kvm/handle_exit.c | 3
-rw-r--r--  arch/arm/kvm/interrupts.S | 16
-rw-r--r--  arch/arm/kvm/interrupts_head.S | 10
-rw-r--r--  arch/arm/kvm/mmio.c | 6
-rw-r--r--  arch/arm/kvm/mmu.c | 3
-rw-r--r--  arch/arm/kvm/psci.c | 2
-rw-r--r--  arch/arm/kvm/reset.c | 12
-rw-r--r--  arch/arm/kvm/vgic.c | 1499
-rw-r--r--  arch/ia64/kvm/Makefile | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 6
-rw-r--r--  arch/powerpc/kvm/Makefile | 13
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 81
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_host.c | 21
-rw-r--r--  arch/powerpc/kvm/book3s_64_slb.S | 13
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 3
-rw-r--r--  arch/powerpc/kvm/booke.c | 2
-rw-r--r--  arch/powerpc/kvm/emulate.c | 3
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 18
-rw-r--r--  arch/s390/include/asm/perf_event.h | 10
-rw-r--r--  arch/s390/include/asm/pgtable.h | 83
-rw-r--r--  arch/s390/kernel/asm-offsets.c | 3
-rw-r--r--  arch/s390/kernel/entry64.S | 81
-rw-r--r--  arch/s390/kernel/perf_event.c | 52
-rw-r--r--  arch/s390/kernel/s390_ksyms.c | 1
-rw-r--r--  arch/s390/kvm/Makefile | 3
-rw-r--r--  arch/s390/kvm/diag.c | 3
-rw-r--r--  arch/s390/kvm/intercept.c | 124
-rw-r--r--  arch/s390/kvm/interrupt.c | 18
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 105
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 14
-rw-r--r--  arch/s390/kvm/priv.c | 274
-rw-r--r--  arch/s390/kvm/sigp.c | 19
-rw-r--r--  arch/s390/mm/pgtable.c | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 15
-rw-r--r--  arch/x86/kvm/Makefile | 13
-rw-r--r--  arch/x86/kvm/emulate.c | 391
-rw-r--r--  arch/x86/kvm/lapic.c | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 301
-rw-r--r--  arch/x86/kvm/mmu.h | 18
-rw-r--r--  arch/x86/kvm/mmutrace.h | 76
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 10
-rw-r--r--  arch/x86/kvm/svm.c | 10
-rw-r--r--  arch/x86/kvm/trace.h | 21
-rw-r--r--  arch/x86/kvm/vmx.c | 19
-rw-r--r--  arch/x86/kvm/x86.c | 80
56 files changed, 1259 insertions, 2857 deletions
diff --git a/arch/arm/include/asm/kvm_arch_timer.h b/arch/arm/include/asm/kvm_arch_timer.h
deleted file mode 100644
index 68cb9e1dfb81..000000000000
--- a/arch/arm/include/asm/kvm_arch_timer.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef __ASM_ARM_KVM_ARCH_TIMER_H
-#define __ASM_ARM_KVM_ARCH_TIMER_H
-
-#include <linux/clocksource.h>
-#include <linux/hrtimer.h>
-#include <linux/workqueue.h>
-
-struct arch_timer_kvm {
-#ifdef CONFIG_KVM_ARM_TIMER
-	/* Is the timer enabled */
-	bool			enabled;
-
-	/* Virtual offset */
-	cycle_t			cntvoff;
-#endif
-};
-
-struct arch_timer_cpu {
-#ifdef CONFIG_KVM_ARM_TIMER
-	/* Registers: control register, timer value */
-	u32				cntv_ctl;	/* Saved/restored */
-	cycle_t				cntv_cval;	/* Saved/restored */
-
-	/*
-	 * Anything that is not used directly from assembly code goes
-	 * here.
-	 */
-
-	/* Background timer used when the guest is not running */
-	struct hrtimer			timer;
-
-	/* Work queued with the above timer expires */
-	struct work_struct		expired;
-
-	/* Background timer active */
-	bool				armed;
-
-	/* Timer IRQ */
-	const struct kvm_irq_level	*irq;
-#endif
-};
-
-#ifdef CONFIG_KVM_ARM_TIMER
-int kvm_timer_hyp_init(void);
-int kvm_timer_init(struct kvm *kvm);
-void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
-#else
-static inline int kvm_timer_hyp_init(void)
-{
-	return 0;
-};
-
-static inline int kvm_timer_init(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) {}
-#endif
-
-#endif
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 124623e5ef14..64e96960de29 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -135,7 +135,6 @@
 #define KVM_PHYS_MASK	(KVM_PHYS_SIZE - 1ULL)
 #define PTRS_PER_S2_PGD	(1ULL << (KVM_PHYS_SHIFT - 30))
 #define S2_PGD_ORDER	get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
-#define S2_PGD_SIZE	(1 << S2_PGD_ORDER)
 
 /* Virtualization Translation Control Register (VTCR) bits */
 #define VTCR_SH0	(3 << 12)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 18d50322a9e2..a2f43ddcc300 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -37,16 +37,18 @@
 #define c5_AIFSR	15	/* Auxilary Instrunction Fault Status R */
 #define c6_DFAR		16	/* Data Fault Address Register */
 #define c6_IFAR		17	/* Instruction Fault Address Register */
-#define c9_L2CTLR	18	/* Cortex A15 L2 Control Register */
-#define c10_PRRR	19	/* Primary Region Remap Register */
-#define c10_NMRR	20	/* Normal Memory Remap Register */
-#define c12_VBAR	21	/* Vector Base Address Register */
-#define c13_CID		22	/* Context ID Register */
-#define c13_TID_URW	23	/* Thread ID, User R/W */
-#define c13_TID_URO	24	/* Thread ID, User R/O */
-#define c13_TID_PRIV	25	/* Thread ID, Privileged */
-#define c14_CNTKCTL	26	/* Timer Control Register (PL1) */
-#define NR_CP15_REGS	27	/* Number of regs (incl. invalid) */
+#define c7_PAR		18	/* Physical Address Register */
+#define c7_PAR_high	19	/* PAR top 32 bits */
+#define c9_L2CTLR	20	/* Cortex A15 L2 Control Register */
+#define c10_PRRR	21	/* Primary Region Remap Register */
+#define c10_NMRR	22	/* Normal Memory Remap Register */
+#define c12_VBAR	23	/* Vector Base Address Register */
+#define c13_CID		24	/* Context ID Register */
+#define c13_TID_URW	25	/* Thread ID, User R/W */
+#define c13_TID_URO	26	/* Thread ID, User R/O */
+#define c13_TID_PRIV	27	/* Thread ID, Privileged */
+#define c14_CNTKCTL	28	/* Timer Control Register (PL1) */
+#define NR_CP15_REGS	29	/* Number of regs (incl. invalid) */
 
 #define ARM_EXCEPTION_RESET	0
 #define ARM_EXCEPTION_UNDEFINED	1
@@ -72,8 +74,6 @@ extern char __kvm_hyp_vector[];
 extern char __kvm_hyp_code_start[];
 extern char __kvm_hyp_code_end[];
 
-extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
-
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 82b4babead2c..a464e8d7b6c5 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -65,11 +65,6 @@ static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
 	return cpsr_mode > USR_MODE;;
 }
 
-static inline bool kvm_vcpu_reg_is_pc(struct kvm_vcpu *vcpu, int reg)
-{
-	return reg == 15;
-}
-
 static inline u32 kvm_vcpu_get_hsr(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.fault.hsr;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 57cb786a6203..7d22517d8071 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -23,9 +23,14 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_mmio.h>
 #include <asm/fpstate.h>
-#include <asm/kvm_arch_timer.h>
+#include <kvm/arm_arch_timer.h>
 
+#if defined(CONFIG_KVM_ARM_MAX_VCPUS)
 #define KVM_MAX_VCPUS CONFIG_KVM_ARM_MAX_VCPUS
+#else
+#define KVM_MAX_VCPUS 0
+#endif
+
 #define KVM_USER_MEM_SLOTS 32
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -38,7 +43,7 @@
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
-#include <asm/kvm_vgic.h>
+#include <kvm/arm_vgic.h>
 
 struct kvm_vcpu;
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
@@ -190,8 +195,8 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
 
-static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr,
-				       unsigned long long pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
+				       phys_addr_t pgd_ptr,
 				       unsigned long hyp_stack_ptr,
 				       unsigned long vector_ptr)
 {
diff --git a/arch/arm/include/asm/kvm_vgic.h b/arch/arm/include/asm/kvm_vgic.h
deleted file mode 100644
index 343744e4809c..000000000000
--- a/arch/arm/include/asm/kvm_vgic.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/irqchip/arm-gic.h>
-
-#define VGIC_NR_IRQS		128
-#define VGIC_NR_SGIS		16
-#define VGIC_NR_PPIS		16
-#define VGIC_NR_PRIVATE_IRQS	(VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_NR_SHARED_IRQS	(VGIC_NR_IRQS - VGIC_NR_PRIVATE_IRQS)
-#define VGIC_MAX_CPUS		KVM_MAX_VCPUS
-#define VGIC_MAX_LRS		(1 << 6)
-
-/* Sanity checks... */
-#if (VGIC_MAX_CPUS > 8)
-#error	Invalid number of CPU interfaces
-#endif
-
-#if (VGIC_NR_IRQS & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
-
-#if (VGIC_NR_IRQS > 1024)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
-
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
-	union {
-		u32 reg[VGIC_NR_PRIVATE_IRQS / 32];
-		DECLARE_BITMAP(reg_ul, VGIC_NR_PRIVATE_IRQS);
-	} percpu[VGIC_MAX_CPUS];
-	union {
-		u32 reg[VGIC_NR_SHARED_IRQS / 32];
-		DECLARE_BITMAP(reg_ul, VGIC_NR_SHARED_IRQS);
-	} shared;
-};
-
-struct vgic_bytemap {
-	u32 percpu[VGIC_MAX_CPUS][VGIC_NR_PRIVATE_IRQS / 4];
-	u32 shared[VGIC_NR_SHARED_IRQS / 4];
-};
-
-struct vgic_dist {
-#ifdef CONFIG_KVM_ARM_VGIC
-	spinlock_t		lock;
-	bool			ready;
-
-	/* Virtual control interface mapping */
-	void __iomem		*vctrl_base;
-
-	/* Distributor and vcpu interface mapping in the guest */
-	phys_addr_t		vgic_dist_base;
-	phys_addr_t		vgic_cpu_base;
-
-	/* Distributor enabled */
-	u32			enabled;
-
-	/* Interrupt enabled (one bit per IRQ) */
-	struct vgic_bitmap	irq_enabled;
-
-	/* Interrupt 'pin' level */
-	struct vgic_bitmap	irq_state;
-
-	/* Level-triggered interrupt in progress */
-	struct vgic_bitmap	irq_active;
-
-	/* Interrupt priority. Not used yet. */
-	struct vgic_bytemap	irq_priority;
-
-	/* Level/edge triggered */
-	struct vgic_bitmap	irq_cfg;
-
-	/* Source CPU per SGI and target CPU */
-	u8			irq_sgi_sources[VGIC_MAX_CPUS][VGIC_NR_SGIS];
-
-	/* Target CPU for each IRQ */
-	u8			irq_spi_cpu[VGIC_NR_SHARED_IRQS];
-	struct vgic_bitmap	irq_spi_target[VGIC_MAX_CPUS];
-
-	/* Bitmap indicating which CPU has something pending */
-	unsigned long		irq_pending_on_cpu;
-#endif
-};
-
-struct vgic_cpu {
-#ifdef CONFIG_KVM_ARM_VGIC
-	/* per IRQ to LR mapping */
-	u8		vgic_irq_lr_map[VGIC_NR_IRQS];
-
-	/* Pending interrupts on this VCPU */
-	DECLARE_BITMAP(	pending_percpu, VGIC_NR_PRIVATE_IRQS);
-	DECLARE_BITMAP(	pending_shared, VGIC_NR_SHARED_IRQS);
-
-	/* Bitmap of used/free list registers */
-	DECLARE_BITMAP(	lr_used, VGIC_MAX_LRS);
-
-	/* Number of list registers on this CPU */
-	int		nr_lr;
-
-	/* CPU vif control registers for world switch */
-	u32		vgic_hcr;
-	u32		vgic_vmcr;
-	u32		vgic_misr;	/* Saved only */
-	u32		vgic_eisr[2];	/* Saved only */
-	u32		vgic_elrsr[2];	/* Saved only */
-	u32		vgic_apr;
-	u32		vgic_lr[VGIC_MAX_LRS];
-#endif
-};
-
-#define LR_EMPTY	0xff
-
-struct kvm;
-struct kvm_vcpu;
-struct kvm_run;
-struct kvm_exit_mmio;
-
-#ifdef CONFIG_KVM_ARM_VGIC
-int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm);
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
-			bool level);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		      struct kvm_exit_mmio *mmio);
-
-#define irqchip_in_kernel(k)	(!!((k)->arch.vgic.vctrl_base))
-#define vgic_initialized(k)	((k)->arch.vgic.ready)
-
-#else
-static inline int kvm_vgic_hyp_init(void)
-{
-	return 0;
-}
-
-static inline int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr)
-{
-	return 0;
-}
-
-static inline int kvm_vgic_init(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline int kvm_vgic_create(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-static inline void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) {}
-static inline void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) {}
-
-static inline int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid,
-				      unsigned int irq_num, bool level)
-{
-	return 0;
-}
-
-static inline int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-static inline bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
-				    struct kvm_exit_mmio *mmio)
-{
-	return false;
-}
-
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-	return 0;
-}
-
-static inline bool vgic_initialized(struct kvm *kvm)
-{
-	return true;
-}
-#endif
-
-#endif
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 370e1a8af6ac..ebf5015508b5 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -41,9 +41,9 @@ config KVM_ARM_HOST
 	  Provides host support for ARM processors.
 
 config KVM_ARM_MAX_VCPUS
-	int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
-	default 4 if KVM_ARM_HOST
-	default 0
+	int "Number maximum supported virtual CPUs per VM"
+	depends on KVM_ARM_HOST
+	default 4
 	help
 	  Static number of max supported virtual CPUs per VM.
 
@@ -67,6 +67,4 @@ config KVM_ARM_TIMER
 	---help---
 	  Adds support for the Architected Timers in virtual machines
 
-source drivers/virtio/Kconfig
-
 endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 53c5ed83d16f..d99bee4950e5 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -14,10 +14,11 @@ CFLAGS_mmu.o := -I.
 AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 
-kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+KVM := ../../../virt/kvm
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
-obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
-obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
+obj-$(CONFIG_KVM_ARM_VGIC) += $(KVM)/arm/vgic.o
+obj-$(CONFIG_KVM_ARM_TIMER) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arch_timer.c b/arch/arm/kvm/arch_timer.c
deleted file mode 100644
index 49a7516d81c7..000000000000
--- a/arch/arm/kvm/arch_timer.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/of_irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <clocksource/arm_arch_timer.h>
-#include <asm/arch_timer.h>
-
-#include <asm/kvm_vgic.h>
-#include <asm/kvm_arch_timer.h>
-
-static struct timecounter *timecounter;
-static struct workqueue_struct *wqueue;
-static struct kvm_irq_level timer_irq = {
-	.level = 1,
-};
-
-static cycle_t kvm_phys_timer_read(void)
-{
-	return timecounter->cc->read(timecounter->cc);
-}
-
-static bool timer_is_armed(struct arch_timer_cpu *timer)
-{
-	return timer->armed;
-}
-
-/* timer_arm: as in "arm the timer", not as in ARM the company */
-static void timer_arm(struct arch_timer_cpu *timer, u64 ns)
-{
-	timer->armed = true;
-	hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns),
-		      HRTIMER_MODE_ABS);
-}
-
-static void timer_disarm(struct arch_timer_cpu *timer)
-{
-	if (timer_is_armed(timer)) {
-		hrtimer_cancel(&timer->timer);
-		cancel_work_sync(&timer->expired);
-		timer->armed = false;
-	}
-}
-
-static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-	kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-			    vcpu->arch.timer_cpu.irq->irq,
-			    vcpu->arch.timer_cpu.irq->level);
-}
-
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
-{
-	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
-
-	/*
-	 * We disable the timer in the world switch and let it be
-	 * handled by kvm_timer_sync_hwstate(). Getting a timer
-	 * interrupt at this point is a sure sign of some major
-	 * breakage.
-	 */
-	pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu);
-	return IRQ_HANDLED;
-}
-
-static void kvm_timer_inject_irq_work(struct work_struct *work)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
-	vcpu->arch.timer_cpu.armed = false;
-	kvm_timer_inject_irq(vcpu);
-}
-
-static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
-{
-	struct arch_timer_cpu *timer;
-	timer = container_of(hrt, struct arch_timer_cpu, timer);
-	queue_work(wqueue, &timer->expired);
-	return HRTIMER_NORESTART;
-}
-
-/**
- * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
- * @vcpu: The vcpu pointer
- *
- * Disarm any pending soft timers, since the world-switch code will write the
- * virtual timer state back to the physical CPU.
- */
-void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	/*
-	 * We're about to run this vcpu again, so there is no need to
-	 * keep the background timer running, as we're about to
-	 * populate the CPU timer again.
-	 */
-	timer_disarm(timer);
-}
-
-/**
- * kvm_timer_sync_hwstate - sync timer state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the virtual timer was armed and either schedule a corresponding
- * soft timer or inject directly if already expired.
- */
-void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	cycle_t cval, now;
-	u64 ns;
-
-	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-	    !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
-		return;
-
-	cval = timer->cntv_cval;
-	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
-
-	BUG_ON(timer_is_armed(timer));
-
-	if (cval <= now) {
-		/*
-		 * Timer has already expired while we were not
-		 * looking. Inject the interrupt and carry on.
-		 */
-		kvm_timer_inject_irq(vcpu);
-		return;
-	}
-
-	ns = cyclecounter_cyc2ns(timecounter->cc, cval - now);
-	timer_arm(timer, ns);
-}
-
-void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
-	hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	timer->timer.function = kvm_timer_expire;
-	timer->irq = &timer_irq;
-}
-
-static void kvm_timer_init_interrupt(void *info)
-{
-	enable_percpu_irq(timer_irq.irq, 0);
-}
-
-
-static int kvm_timer_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *cpu)
-{
-	switch (action) {
-	case CPU_STARTING:
-	case CPU_STARTING_FROZEN:
-		kvm_timer_init_interrupt(NULL);
-		break;
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		disable_percpu_irq(timer_irq.irq);
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_timer_cpu_nb = {
-	.notifier_call = kvm_timer_cpu_notify,
-};
-
-static const struct of_device_id arch_timer_of_match[] = {
-	{ .compatible	= "arm,armv7-timer",	},
-	{ .compatible	= "arm,armv8-timer",	},
-	{},
-};
-
-int kvm_timer_hyp_init(void)
-{
-	struct device_node *np;
-	unsigned int ppi;
-	int err;
-
-	timecounter = arch_timer_get_timecounter();
-	if (!timecounter)
-		return -ENODEV;
-
-	np = of_find_matching_node(NULL, arch_timer_of_match);
-	if (!np) {
-		kvm_err("kvm_arch_timer: can't find DT node\n");
-		return -ENODEV;
-	}
-
-	ppi = irq_of_parse_and_map(np, 2);
-	if (!ppi) {
-		kvm_err("kvm_arch_timer: no virtual timer interrupt\n");
-		err = -EINVAL;
-		goto out;
-	}
-
-	err = request_percpu_irq(ppi, kvm_arch_timer_handler,
-				 "kvm guest timer", kvm_get_running_vcpus());
-	if (err) {
-		kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
-			ppi, err);
-		goto out;
-	}
-
-	timer_irq.irq = ppi;
-
-	err = register_cpu_notifier(&kvm_timer_cpu_nb);
-	if (err) {
-		kvm_err("Cannot register timer CPU notifier\n");
-		goto out_free;
-	}
-
-	wqueue = create_singlethread_workqueue("kvm_arch_timer");
-	if (!wqueue) {
-		err = -ENOMEM;
-		goto out_free;
-	}
-
-	kvm_info("%s IRQ%d\n", np->name, ppi);
-	on_each_cpu(kvm_timer_init_interrupt, NULL, 1);
-
-	goto out;
-out_free:
-	free_percpu_irq(ppi, kvm_get_running_vcpus());
-out:
-	of_node_put(np);
-	return err;
-}
-
-void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	timer_disarm(timer);
-}
-
-int kvm_timer_init(struct kvm *kvm)
-{
-	if (timecounter && wqueue) {
-		kvm->arch.timer.cntvoff = kvm_phys_timer_read();
-		kvm->arch.timer.enabled = 1;
-	}
-
-	return 0;
-}
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index ef1703b9587b..741f66a2edbd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -800,8 +800,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 static void cpu_init_hyp_mode(void *dummy)
 {
-	unsigned long long boot_pgd_ptr;
-	unsigned long long pgd_ptr;
+	phys_addr_t boot_pgd_ptr;
+	phys_addr_t pgd_ptr;
 	unsigned long hyp_stack_ptr;
 	unsigned long stack_page;
 	unsigned long vector_ptr;
@@ -809,8 +809,8 @@ static void cpu_init_hyp_mode(void *dummy)
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
 
-	boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
-	pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
+	boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+	pgd_ptr = kvm_mmu_get_httbr();
 	stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 8eea97be1ed5..4a5199070430 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -180,6 +180,10 @@ static const struct coproc_reg cp15_regs[] = {
 			NULL, reset_unknown, c6_DFAR },
 	{ CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
 			NULL, reset_unknown, c6_IFAR },
+
+	/* PAR swapped by interrupt.S */
+	{ CRn( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
+
 	/*
 	 * DC{C,I,CI}SW operations:
 	 */
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 3d74a0be47db..df4c82d47ad7 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -52,9 +52,6 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
-	if (kvm_psci_call(vcpu))
-		return 1;
-
 	kvm_inject_undefined(vcpu);
 	return 1;
 }
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index f7793df62f58..16cd4ba5d7fd 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -49,6 +49,7 @@ __kvm_hyp_code_start:
 ENTRY(__kvm_tlb_flush_vmid_ipa)
 	push	{r2, r3}
 
+	dsb	ishst
 	add	r0, r0, #KVM_VTTBR
 	ldrd	r2, r3, [r0]
 	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
@@ -291,6 +292,7 @@ THUMB( orr r2, r2, #PSR_T_BIT )
 	ldr	r2, =BSYM(panic)
 	msr	ELR_hyp, r2
 	ldr	r0, =\panic_str
+	clrex				@ Clear exclusive monitor
 	eret
 .endm
 
@@ -414,6 +416,10 @@ guest_trap:
 	mrcne	p15, 4, r2, c6, c0, 4	@ HPFAR
 	bne	3f
 
+	/* Preserve PAR */
+	mrrc	p15, 0, r0, r1, c7	@ PAR
+	push	{r0, r1}
+
 	/* Resolve IPA using the xFAR */
 	mcr	p15, 0, r2, c7, c8, 0	@ ATS1CPR
 	isb
@@ -424,13 +430,20 @@ guest_trap:
 	lsl	r2, r2, #4
 	orr	r2, r2, r1, lsl #24
 
+	/* Restore PAR */
+	pop	{r0, r1}
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+
 3:	load_vcpu			@ Load VCPU pointer to r0
 	str	r2, [r0, #VCPU_HPFAR]
 
 1:	mov	r1, #ARM_EXCEPTION_HVC
 	b	__kvm_vcpu_return
 
-4:	pop	{r0, r1, r2}		@ Failed translation, return to guest
+4:	pop	{r0, r1}		@ Failed translation, return to guest
+	mcrr	p15, 0, r0, r1, c7	@ PAR
+	clrex
+	pop	{r0, r1, r2}
 	eret
 
 /*
@@ -456,6 +469,7 @@ switch_to_guest_vfp:
 
 	pop	{r3-r7}
 	pop	{r0-r2}
+	clrex
 	eret
 #endif
 
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index d43cfb5b37c4..6f18695a09cb 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -302,11 +302,14 @@ vcpu .req r0 @ vcpu pointer always in r0
 	.endif
 
 	mrc	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mrrc	p15, 0, r4, r5, c7	@ PAR
 
 	.if \store_to_vcpu == 0
-	push	{r2}
+	push	{r2,r4-r5}
 	.else
 	str	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	strd	r4, r5, [r12]
 	.endif
 .endm
 
@@ -319,12 +322,15 @@ vcpu .req r0 @ vcpu pointer always in r0
  */
 .macro write_cp15_state read_from_vcpu
 	.if \read_from_vcpu == 0
-	pop	{r2}
+	pop	{r2,r4-r5}
 	.else
 	ldr	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
+	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
+	ldrd	r4, r5, [r12]
 	.endif
 
 	mcr	p15, 0, r2, c14, c1, 0	@ CNTKCTL
+	mcrr	p15, 0, r4, r5, c7	@ PAR
 
 	.if \read_from_vcpu == 0
 	pop	{r2-r12}
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 72a12f2171b2..b8e06b7a2833 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -86,12 +86,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	sign_extend = kvm_vcpu_dabt_issext(vcpu);
 	rt = kvm_vcpu_dabt_get_rd(vcpu);
 
-	if (kvm_vcpu_reg_is_pc(vcpu, rt)) {
-		/* IO memory trying to read/write pc */
-		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
 	mmio->is_write = is_write;
 	mmio->phys_addr = fault_ipa;
 	mmio->len = len;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 84ba67b982c0..ca6bea4859b4 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -382,9 +382,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 	if (!pgd)
 		return -ENOMEM;
 
-	/* stage-2 pgd must be aligned to its size */
-	VM_BUG_ON((unsigned long)pgd & (S2_PGD_SIZE - 1));
-
 	memset(pgd, 0, PTRS_PER_S2_PGD * sizeof(pgd_t));
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 7ee5bb7a3667..86a693a02ba3 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -75,7 +75,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
  * kvm_psci_call - handle PSCI call if r0 value is in range
  * @vcpu: Pointer to the VCPU struct
  *
- * Handle PSCI calls from guests through traps from HVC or SMC instructions.
+ * Handle PSCI calls from guests through traps from HVC instructions.
  * The calling convention is similar to SMC calls to the secure world where
  * the function number is placed in r0 and this function returns true if the
  * function number specified in r0 is withing the PSCI range, and false
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index b80256b554cd..b7840e7aa452 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -27,6 +27,8 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_coproc.h>
 
+#include <kvm/arm_arch_timer.h>
+
 /******************************************************************************
  * Cortex-A15 Reset Values
  */
@@ -37,6 +39,11 @@ static struct kvm_regs a15_regs_reset = {
 	.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
 };
 
+static const struct kvm_irq_level a15_vtimer_irq = {
+	.irq = 27,
+	.level = 1,
+};
+
 
 /*******************************************************************************
  * Exported reset function
@@ -52,6 +59,7 @@ static struct kvm_regs a15_regs_reset = {
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_regs *cpu_reset;
+	const struct kvm_irq_level *cpu_vtimer_irq;
 
 	switch (vcpu->arch.target) {
 	case KVM_ARM_TARGET_CORTEX_A15:
@@ -59,6 +67,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 			return -EINVAL;
 		cpu_reset = &a15_regs_reset;
 		vcpu->arch.midr = read_cpuid_id();
+		cpu_vtimer_irq = &a15_vtimer_irq;
 		break;
 	default:
 		return -ENODEV;
@@ -70,5 +79,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset CP15 registers */
 	kvm_reset_coprocs(vcpu);
 
+	/* Reset arch_timer context */
+	kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
+
 	return 0;
 }
diff --git a/arch/arm/kvm/vgic.c b/arch/arm/kvm/vgic.c
deleted file mode 100644
index 17c5ac7d10ed..000000000000
--- a/arch/arm/kvm/vgic.c
+++ /dev/null
@@ -1,1499 +0,0 @@
1/*
2 * Copyright (C) 2012 ARM Ltd.
3 * Author: Marc Zyngier <marc.zyngier@arm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/cpu.h>
20#include <linux/kvm.h>
21#include <linux/kvm_host.h>
22#include <linux/interrupt.h>
23#include <linux/io.h>
24#include <linux/of.h>
25#include <linux/of_address.h>
26#include <linux/of_irq.h>
27
28#include <linux/irqchip/arm-gic.h>
29
30#include <asm/kvm_emulate.h>
31#include <asm/kvm_arm.h>
32#include <asm/kvm_mmu.h>
33
34/*
35 * How the whole thing works (courtesy of Christoffer Dall):
36 *
37 * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
38 * something is pending
39 * - VGIC pending interrupts are stored on the vgic.irq_state vgic
40 * bitmap (this bitmap is updated by both user land ioctls and guest
41 * mmio ops, and other in-kernel peripherals such as the
42 * arch. timers) and indicate the 'wire' state.
43 * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
44 * recalculated
45 * - To calculate the oracle, we need info for each cpu from
46 * compute_pending_for_cpu, which considers:
47 * - PPI: dist->irq_state & dist->irq_enable
48 * - SPI: dist->irq_state & dist->irq_enable & dist->irq_spi_target
49 * - irq_spi_target is a 'formatted' version of the GICD_ICFGR
50 * registers, stored on each vcpu. We only keep one bit of
51 * information per interrupt, making sure that only one vcpu can
52 * accept the interrupt.
53 * - The same is true when injecting an interrupt, except that we only
54 * consider a single interrupt at a time. The irq_spi_cpu array
55 * contains the target CPU for each SPI.
56 *
57 * The handling of level interrupts adds some extra complexity. We
58 * need to track when the interrupt has been EOIed, so we can sample
59 * the 'line' again. This is achieved as such:
60 *
61 * - When a level interrupt is moved onto a vcpu, the corresponding
62 * bit in irq_active is set. As long as this bit is set, the line
63 * will be ignored for further interrupts. The interrupt is injected
64 * into the vcpu with the GICH_LR_EOI bit set (generate a
65 * maintenance interrupt on EOI).
66 * - When the interrupt is EOIed, the maintenance interrupt fires,
67 * and clears the corresponding bit in irq_active. This allow the
68 * interrupt line to be sampled again.
69 */
70
71#define VGIC_ADDR_UNDEF (-1)
72#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
73
74/* Physical address of vgic virtual cpu interface */
75static phys_addr_t vgic_vcpu_base;
76
77/* Virtual control interface base address */
78static void __iomem *vgic_vctrl_base;
79
80static struct device_node *vgic_node;
81
82#define ACCESS_READ_VALUE (1 << 0)
83#define ACCESS_READ_RAZ (0 << 0)
84#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
85#define ACCESS_WRITE_IGNORED (0 << 1)
86#define ACCESS_WRITE_SETBIT (1 << 1)
87#define ACCESS_WRITE_CLEARBIT (2 << 1)
88#define ACCESS_WRITE_VALUE (3 << 1)
89#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
90
91static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
92static void vgic_update_state(struct kvm *kvm);
93static void vgic_kick_vcpus(struct kvm *kvm);
94static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
95static u32 vgic_nr_lr;
96
97static unsigned int vgic_maint_irq;
98
99static u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x,
100 int cpuid, u32 offset)
101{
102 offset >>= 2;
103 if (!offset)
104 return x->percpu[cpuid].reg;
105 else
106 return x->shared.reg + offset - 1;
107}
108
109static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
110 int cpuid, int irq)
111{
112 if (irq < VGIC_NR_PRIVATE_IRQS)
113 return test_bit(irq, x->percpu[cpuid].reg_ul);
114
115 return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared.reg_ul);
116}
117
118static void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
119 int irq, int val)
120{
121 unsigned long *reg;
122
123 if (irq < VGIC_NR_PRIVATE_IRQS) {
124 reg = x->percpu[cpuid].reg_ul;
125 } else {
126 reg = x->shared.reg_ul;
127 irq -= VGIC_NR_PRIVATE_IRQS;
128 }
129
130 if (val)
131 set_bit(irq, reg);
132 else
133 clear_bit(irq, reg);
134}
135
136static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
137{
138 if (unlikely(cpuid >= VGIC_MAX_CPUS))
139 return NULL;
140 return x->percpu[cpuid].reg_ul;
141}
142
143static unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
144{
145 return x->shared.reg_ul;
146}
147
148static u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
149{
150 offset >>= 2;
151 BUG_ON(offset > (VGIC_NR_IRQS / 4));
152 if (offset < 4)
153 return x->percpu[cpuid] + offset;
154 else
155 return x->shared + offset - 8;
156}
157
158#define VGIC_CFG_LEVEL 0
159#define VGIC_CFG_EDGE 1
160
161static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
162{
163 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
164 int irq_val;
165
166 irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
167 return irq_val == VGIC_CFG_EDGE;
168}
169
170static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
171{
172 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
173
174 return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
175}
176
177static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
178{
179 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
180
181 return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
182}
183
184static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
185{
186 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
187
188 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
189}
190
191static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
192{
193 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
194
195 vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
196}
197
198static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
199{
200 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
201
202 return vgic_bitmap_get_irq_val(&dist->irq_state, vcpu->vcpu_id, irq);
203}
204
205static void vgic_dist_irq_set(struct kvm_vcpu *vcpu, int irq)
206{
207 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
208
209 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 1);
210}
211
212static void vgic_dist_irq_clear(struct kvm_vcpu *vcpu, int irq)
213{
214 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
215
216 vgic_bitmap_set_irq_val(&dist->irq_state, vcpu->vcpu_id, irq, 0);
217}
218
219static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
220{
221 if (irq < VGIC_NR_PRIVATE_IRQS)
222 set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
223 else
224 set_bit(irq - VGIC_NR_PRIVATE_IRQS,
225 vcpu->arch.vgic_cpu.pending_shared);
226}
227
228static void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
229{
230 if (irq < VGIC_NR_PRIVATE_IRQS)
231 clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
232 else
233 clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
234 vcpu->arch.vgic_cpu.pending_shared);
235}
236
237static u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
238{
239 return *((u32 *)mmio->data) & mask;
240}
241
242static void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
243{
244 *((u32 *)mmio->data) = value & mask;
245}
246
247/**
248 * vgic_reg_access - access vgic register
249 * @mmio: pointer to the data describing the mmio access
250 * @reg: pointer to the virtual backing of vgic distributor data
251 * @offset: least significant 2 bits used for word offset
252 * @mode: ACCESS_ mode (see defines above)
253 *
254 * Helper to make vgic register access easier using one of the access
255 * modes defined for vgic register access
256 * (read,raz,write-ignored,setbit,clearbit,write)
257 */
258static void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
259 phys_addr_t offset, int mode)
260{
261 int word_offset = (offset & 3) * 8;
262 u32 mask = (1UL << (mmio->len * 8)) - 1;
263 u32 regval;
264
265 /*
266 * Any alignment fault should have been delivered to the guest
267 * directly (ARM ARM B3.12.7 "Prioritization of aborts").
268 */
269
270 if (reg) {
271 regval = *reg;
272 } else {
273 BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
274 regval = 0;
275 }
276
277 if (mmio->is_write) {
278 u32 data = mmio_data_read(mmio, mask) << word_offset;
279 switch (ACCESS_WRITE_MASK(mode)) {
280 case ACCESS_WRITE_IGNORED:
281 return;
282
283 case ACCESS_WRITE_SETBIT:
284 regval |= data;
285 break;
286
287 case ACCESS_WRITE_CLEARBIT:
288 regval &= ~data;
289 break;
290
291 case ACCESS_WRITE_VALUE:
292 regval = (regval & ~(mask << word_offset)) | data;
293 break;
294 }
295 *reg = regval;
296 } else {
297 switch (ACCESS_READ_MASK(mode)) {
298 case ACCESS_READ_RAZ:
299 regval = 0;
300 /* fall through */
301
302 case ACCESS_READ_VALUE:
303 mmio_data_write(mmio, mask, regval >> word_offset);
304 }
305 }
306}
307
308static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
309 struct kvm_exit_mmio *mmio, phys_addr_t offset)
310{
311 u32 reg;
312 u32 word_offset = offset & 3;
313
314 switch (offset & ~3) {
315 case 0: /* CTLR */
316 reg = vcpu->kvm->arch.vgic.enabled;
317 vgic_reg_access(mmio, &reg, word_offset,
318 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
319 if (mmio->is_write) {
320 vcpu->kvm->arch.vgic.enabled = reg & 1;
321 vgic_update_state(vcpu->kvm);
322 return true;
323 }
324 break;
325
326 case 4: /* TYPER */
327 reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
328 reg |= (VGIC_NR_IRQS >> 5) - 1;
329 vgic_reg_access(mmio, &reg, word_offset,
330 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
331 break;
332
333 case 8: /* IIDR */
334 reg = 0x4B00043B;
335 vgic_reg_access(mmio, &reg, word_offset,
336 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
337 break;
338 }
339
340 return false;
341}
342
343static bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu,
344 struct kvm_exit_mmio *mmio, phys_addr_t offset)
345{
346 vgic_reg_access(mmio, NULL, offset,
347 ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
348 return false;
349}
350
351static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
352 struct kvm_exit_mmio *mmio,
353 phys_addr_t offset)
354{
355 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
356 vcpu->vcpu_id, offset);
357 vgic_reg_access(mmio, reg, offset,
358 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
359 if (mmio->is_write) {
360 vgic_update_state(vcpu->kvm);
361 return true;
362 }
363
364 return false;
365}
366
367static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
368 struct kvm_exit_mmio *mmio,
369 phys_addr_t offset)
370{
371 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_enabled,
372 vcpu->vcpu_id, offset);
373 vgic_reg_access(mmio, reg, offset,
374 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
375 if (mmio->is_write) {
376 if (offset < 4) /* Force SGI enabled */
377 *reg |= 0xffff;
378 vgic_retire_disabled_irqs(vcpu);
379 vgic_update_state(vcpu->kvm);
380 return true;
381 }
382
383 return false;
384}
385
386static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
387 struct kvm_exit_mmio *mmio,
388 phys_addr_t offset)
389{
390 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state,
391 vcpu->vcpu_id, offset);
392 vgic_reg_access(mmio, reg, offset,
393 ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
394 if (mmio->is_write) {
395 vgic_update_state(vcpu->kvm);
396 return true;
397 }
398
399 return false;
400}
401
402static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
403 struct kvm_exit_mmio *mmio,
404 phys_addr_t offset)
405{
406 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_state,
407 vcpu->vcpu_id, offset);
408 vgic_reg_access(mmio, reg, offset,
409 ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
410 if (mmio->is_write) {
411 vgic_update_state(vcpu->kvm);
412 return true;
413 }
414
415 return false;
416}
417
418static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
419 struct kvm_exit_mmio *mmio,
420 phys_addr_t offset)
421{
422 u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
423 vcpu->vcpu_id, offset);
424 vgic_reg_access(mmio, reg, offset,
425 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
426 return false;
427}
428
429#define GICD_ITARGETSR_SIZE 32
430#define GICD_CPUTARGETS_BITS 8
431#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
432static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
433{
434 struct vgic_dist *dist = &kvm->arch.vgic;
435 struct kvm_vcpu *vcpu;
436 int i, c;
437 unsigned long *bmap;
438 u32 val = 0;
439
440 irq -= VGIC_NR_PRIVATE_IRQS;
441
442 kvm_for_each_vcpu(c, vcpu, kvm) {
443 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
444 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
445 if (test_bit(irq + i, bmap))
446 val |= 1 << (c + i * 8);
447 }
448
449 return val;
450}
451
452static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
453{
454 struct vgic_dist *dist = &kvm->arch.vgic;
455 struct kvm_vcpu *vcpu;
456 int i, c;
457 unsigned long *bmap;
458 u32 target;
459
460 irq -= VGIC_NR_PRIVATE_IRQS;
461
462 /*
463 * Pick the LSB in each byte. This ensures we target exactly
464 * one vcpu per IRQ. If the byte is null, assume we target
465 * CPU0.
466 */
467 for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
468 int shift = i * GICD_CPUTARGETS_BITS;
469 target = ffs((val >> shift) & 0xffU);
470 target = target ? (target - 1) : 0;
471 dist->irq_spi_cpu[irq + i] = target;
472 kvm_for_each_vcpu(c, vcpu, kvm) {
473 bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
474 if (c == target)
475 set_bit(irq + i, bmap);
476 else
477 clear_bit(irq + i, bmap);
478 }
479 }
480}
481
482static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
483 struct kvm_exit_mmio *mmio,
484 phys_addr_t offset)
485{
486 u32 reg;
487
488 /* We treat the banked interrupts targets as read-only */
489 if (offset < 32) {
490 u32 roreg = 1 << vcpu->vcpu_id;
491 roreg |= roreg << 8;
492 roreg |= roreg << 16;
493
494 vgic_reg_access(mmio, &roreg, offset,
495 ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
496 return false;
497 }
498
499 reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
500 vgic_reg_access(mmio, &reg, offset,
501 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
502 if (mmio->is_write) {
503 vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
504 vgic_update_state(vcpu->kvm);
505 return true;
506 }
507
508 return false;
509}
510
511static u32 vgic_cfg_expand(u16 val)
512{
513 u32 res = 0;
514 int i;
515
516 /*
517 * Turn a 16bit value like abcd...mnop into a 32bit word
518 * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
519 */
520 for (i = 0; i < 16; i++)
521 res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
522
523 return res;
524}
525
526static u16 vgic_cfg_compress(u32 val)
527{
528 u16 res = 0;
529 int i;
530
531 /*
532 * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like
533 * abcd...mnop which is what we really care about.
534 */
535 for (i = 0; i < 16; i++)
536 res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
537
538 return res;
539}
540
541/*
542 * The distributor uses 2 bits per IRQ for the CFG register, but the
543 * LSB is always 0. As such, we only keep the upper bit, and use the
544 * two above functions to compress/expand the bits
545 */
546static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
547 struct kvm_exit_mmio *mmio, phys_addr_t offset)
548{
549 u32 val;
550 u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
551 vcpu->vcpu_id, offset >> 1);
552 if (offset & 2)
553 val = *reg >> 16;
554 else
555 val = *reg & 0xffff;
556
557 val = vgic_cfg_expand(val);
558 vgic_reg_access(mmio, &val, offset,
559 ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
560 if (mmio->is_write) {
561 if (offset < 4) {
562 *reg = ~0U; /* Force PPIs/SGIs to 1 */
563 return false;
564 }
565
566 val = vgic_cfg_compress(val);
567 if (offset & 2) {
568 *reg &= 0xffff;
569 *reg |= val << 16;
570 } else {
571 *reg &= 0xffff << 16;
572 *reg |= val;
573 }
574 }
575
576 return false;
577}
578
579static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
580 struct kvm_exit_mmio *mmio, phys_addr_t offset)
581{
582 u32 reg;
583 vgic_reg_access(mmio, &reg, offset,
584 ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
585 if (mmio->is_write) {
586 vgic_dispatch_sgi(vcpu, reg);
587 vgic_update_state(vcpu->kvm);
588 return true;
589 }
590
591 return false;
592}
593
594/*
595 * I would have liked to use the kvm_bus_io_*() API instead, but it
596 * cannot cope with banked registers (only the VM pointer is passed
597 * around, and we need the vcpu). One of these days, someone please
598 * fix it!
599 */
600struct mmio_range {
601 phys_addr_t base;
602 unsigned long len;
603 bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
604 phys_addr_t offset);
605};
606
607static const struct mmio_range vgic_ranges[] = {
608 {
609 .base = GIC_DIST_CTRL,
610 .len = 12,
611 .handle_mmio = handle_mmio_misc,
612 },
613 {
614 .base = GIC_DIST_IGROUP,
615 .len = VGIC_NR_IRQS / 8,
616 .handle_mmio = handle_mmio_raz_wi,
617 },
618 {
619 .base = GIC_DIST_ENABLE_SET,
620 .len = VGIC_NR_IRQS / 8,
621 .handle_mmio = handle_mmio_set_enable_reg,
622 },
623 {
624 .base = GIC_DIST_ENABLE_CLEAR,
625 .len = VGIC_NR_IRQS / 8,
626 .handle_mmio = handle_mmio_clear_enable_reg,
627 },
628 {
629 .base = GIC_DIST_PENDING_SET,
630 .len = VGIC_NR_IRQS / 8,
631 .handle_mmio = handle_mmio_set_pending_reg,
632 },
633 {
634 .base = GIC_DIST_PENDING_CLEAR,
635 .len = VGIC_NR_IRQS / 8,
636 .handle_mmio = handle_mmio_clear_pending_reg,
637 },
638 {
639 .base = GIC_DIST_ACTIVE_SET,
640 .len = VGIC_NR_IRQS / 8,
641 .handle_mmio = handle_mmio_raz_wi,
642 },
643 {
644 .base = GIC_DIST_ACTIVE_CLEAR,
645 .len = VGIC_NR_IRQS / 8,
646 .handle_mmio = handle_mmio_raz_wi,
647 },
648 {
649 .base = GIC_DIST_PRI,
650 .len = VGIC_NR_IRQS,
651 .handle_mmio = handle_mmio_priority_reg,
652 },
653 {
654 .base = GIC_DIST_TARGET,
655 .len = VGIC_NR_IRQS,
656 .handle_mmio = handle_mmio_target_reg,
657 },
658 {
659 .base = GIC_DIST_CONFIG,
660 .len = VGIC_NR_IRQS / 4,
661 .handle_mmio = handle_mmio_cfg_reg,
662 },
663 {
664 .base = GIC_DIST_SOFTINT,
665 .len = 4,
666 .handle_mmio = handle_mmio_sgi_reg,
667 },
668 {}
669};
670
671static const
672struct mmio_range *find_matching_range(const struct mmio_range *ranges,
673 struct kvm_exit_mmio *mmio,
674 phys_addr_t base)
675{
676 const struct mmio_range *r = ranges;
677 phys_addr_t addr = mmio->phys_addr - base;
678
679 while (r->len) {
680 if (addr >= r->base &&
681 (addr + mmio->len) <= (r->base + r->len))
682 return r;
683 r++;
684 }
685
686 return NULL;
687}
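/*
 * Worked example (illustrative, assuming the usual GICv2 distributor
 * map where GIC_DIST_PRI == 0x400): a 32-bit access at distributor
 * base + 0x400 yields addr == 0x400, which falls inside the
 * GIC_DIST_PRI entry above, so find_matching_range() returns the
 * handle_mmio_priority_reg slot.
 */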
688
689/**
690 * vgic_handle_mmio - handle an in-kernel MMIO access
691 * @vcpu: pointer to the vcpu performing the access
692 * @run: pointer to the kvm_run structure
693 * @mmio: pointer to the data describing the access
694 *
695 * returns true if the MMIO access has been performed in kernel space,
696 * and false if it needs to be emulated in user space.
697 */
698bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
699 struct kvm_exit_mmio *mmio)
700{
701 const struct mmio_range *range;
702 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
703 unsigned long base = dist->vgic_dist_base;
704 bool updated_state;
705 unsigned long offset;
706
707 if (!irqchip_in_kernel(vcpu->kvm) ||
708 mmio->phys_addr < base ||
709 (mmio->phys_addr + mmio->len) > (base + KVM_VGIC_V2_DIST_SIZE))
710 return false;
711
712 /* We don't support ldrd / strd or ldm / stm to the emulated vgic */
713 if (mmio->len > 4) {
714 kvm_inject_dabt(vcpu, mmio->phys_addr);
715 return true;
716 }
717
718 range = find_matching_range(vgic_ranges, mmio, base);
719 if (unlikely(!range || !range->handle_mmio)) {
720 pr_warn("Unhandled access %d %08llx %d\n",
721 mmio->is_write, mmio->phys_addr, mmio->len);
722 return false;
723 }
724
725 spin_lock(&vcpu->kvm->arch.vgic.lock);
726 offset = mmio->phys_addr - range->base - base;
727 updated_state = range->handle_mmio(vcpu, mmio, offset);
728 spin_unlock(&vcpu->kvm->arch.vgic.lock);
729 kvm_prepare_mmio(run, mmio);
730 kvm_handle_mmio_return(vcpu, run);
731
732 if (updated_state)
733 vgic_kick_vcpus(vcpu->kvm);
734
735 return true;
736}
737
738static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
739{
740 struct kvm *kvm = vcpu->kvm;
741 struct vgic_dist *dist = &kvm->arch.vgic;
742 int nrcpus = atomic_read(&kvm->online_vcpus);
743 u8 target_cpus;
744 int sgi, mode, c, vcpu_id;
745
746 vcpu_id = vcpu->vcpu_id;
747
748 sgi = reg & 0xf;
749 target_cpus = (reg >> 16) & 0xff;
750 mode = (reg >> 24) & 3;
751
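	/*
	 * Illustrative example: a guest write of 0x00020003 to GICD_SGIR
	 * decodes as sgi == 3, target_cpus == 0x02 (CPU1 only) and
	 * mode == 0, i.e. "use the target list as-is".
	 */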
752 switch (mode) {
753 case 0:
754 if (!target_cpus)
755 return;
756		break;

757 case 1:
758 target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
759 break;
760
761 case 2:
762 target_cpus = 1 << vcpu_id;
763 break;
764 }
765
766 kvm_for_each_vcpu(c, vcpu, kvm) {
767 if (target_cpus & 1) {
768 /* Flag the SGI as pending */
769 vgic_dist_irq_set(vcpu, sgi);
770 dist->irq_sgi_sources[c][sgi] |= 1 << vcpu_id;
771 kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
772 }
773
774 target_cpus >>= 1;
775 }
776}
777
778static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
779{
780 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
781 unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
782 unsigned long pending_private, pending_shared;
783 int vcpu_id;
784
785 vcpu_id = vcpu->vcpu_id;
786 pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
787 pend_shared = vcpu->arch.vgic_cpu.pending_shared;
788
789 pending = vgic_bitmap_get_cpu_map(&dist->irq_state, vcpu_id);
790 enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
791 bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
792
793 pending = vgic_bitmap_get_shared_map(&dist->irq_state);
794 enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
795 bitmap_and(pend_shared, pending, enabled, VGIC_NR_SHARED_IRQS);
796 bitmap_and(pend_shared, pend_shared,
797 vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
798 VGIC_NR_SHARED_IRQS);
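	/*
	 * Illustrative example: if SPI 40 is pending and enabled but its
	 * target bitmap names only CPU1, the matching pend_shared bit
	 * (40 - VGIC_NR_PRIVATE_IRQS) ends up set only for vcpu_id == 1.
	 */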
799
800 pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
801 pending_shared = find_first_bit(pend_shared, VGIC_NR_SHARED_IRQS);
802 return (pending_private < VGIC_NR_PRIVATE_IRQS ||
803 pending_shared < VGIC_NR_SHARED_IRQS);
804}
805
806/*
807 * Update the interrupt state and determine which CPUs have pending
808 * interrupts. Must be called with distributor lock held.
809 */
810static void vgic_update_state(struct kvm *kvm)
811{
812 struct vgic_dist *dist = &kvm->arch.vgic;
813 struct kvm_vcpu *vcpu;
814 int c;
815
816 if (!dist->enabled) {
817 set_bit(0, &dist->irq_pending_on_cpu);
818 return;
819 }
820
821 kvm_for_each_vcpu(c, vcpu, kvm) {
822 if (compute_pending_for_cpu(vcpu)) {
823 pr_debug("CPU%d has pending interrupts\n", c);
824 set_bit(c, &dist->irq_pending_on_cpu);
825 }
826 }
827}
828
829#define LR_CPUID(lr) \
830 (((lr) & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT)
831#define MK_LR_PEND(src, irq) \
832 (GICH_LR_PENDING_BIT | ((src) << GICH_LR_PHYSID_CPUID_SHIFT) | (irq))
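/*
 * Illustrative example, assuming the GICv2 list register layout
 * (virtual ID in bits [9:0], source CPUID in bits [12:10], pending
 * state in bit 28): MK_LR_PEND(2, 3) builds 0x10000803, i.e. SGI 3
 * pending with VCPU2 as the requesting source.
 */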
833
834/*
835 * An interrupt may have been disabled after being made pending on the
836 * CPU interface (the classic case is a timer running while we're
837 * rebooting the guest - the interrupt would kick as soon as the CPU
838 * interface gets enabled, with deadly consequences).
839 *
840	 * The solution is to examine the already active LRs, and check that the
841 * interrupt is still enabled. If not, just retire it.
842 */
843static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
844{
845 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
846 int lr;
847
848 for_each_set_bit(lr, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
849 int irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID;
850
851 if (!vgic_irq_is_enabled(vcpu, irq)) {
852 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
853 clear_bit(lr, vgic_cpu->lr_used);
854 vgic_cpu->vgic_lr[lr] &= ~GICH_LR_STATE;
855 if (vgic_irq_is_active(vcpu, irq))
856 vgic_irq_clear_active(vcpu, irq);
857 }
858 }
859}
860
861/*
862 * Queue an interrupt to a CPU virtual interface. Return true on success,
863 * or false if it wasn't possible to queue it.
864 */
865static bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
866{
867 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
868 int lr;
869
870 /* Sanitize the input... */
871 BUG_ON(sgi_source_id & ~7);
872 BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
873 BUG_ON(irq >= VGIC_NR_IRQS);
874
875 kvm_debug("Queue IRQ%d\n", irq);
876
877 lr = vgic_cpu->vgic_irq_lr_map[irq];
878
879 /* Do we have an active interrupt for the same CPUID? */
880 if (lr != LR_EMPTY &&
881 (LR_CPUID(vgic_cpu->vgic_lr[lr]) == sgi_source_id)) {
882 kvm_debug("LR%d piggyback for IRQ%d %x\n",
883 lr, irq, vgic_cpu->vgic_lr[lr]);
884 BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
885 vgic_cpu->vgic_lr[lr] |= GICH_LR_PENDING_BIT;
886 return true;
887 }
888
889 /* Try to use another LR for this interrupt */
890 lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used,
891 vgic_cpu->nr_lr);
892 if (lr >= vgic_cpu->nr_lr)
893 return false;
894
895 kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
896 vgic_cpu->vgic_lr[lr] = MK_LR_PEND(sgi_source_id, irq);
897 vgic_cpu->vgic_irq_lr_map[irq] = lr;
898 set_bit(lr, vgic_cpu->lr_used);
899
900 if (!vgic_irq_is_edge(vcpu, irq))
901 vgic_cpu->vgic_lr[lr] |= GICH_LR_EOI;
902
903 return true;
904}
905
906static bool vgic_queue_sgi(struct kvm_vcpu *vcpu, int irq)
907{
908 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
909 unsigned long sources;
910 int vcpu_id = vcpu->vcpu_id;
911 int c;
912
913 sources = dist->irq_sgi_sources[vcpu_id][irq];
914
915 for_each_set_bit(c, &sources, VGIC_MAX_CPUS) {
916 if (vgic_queue_irq(vcpu, c, irq))
917 clear_bit(c, &sources);
918 }
919
920 dist->irq_sgi_sources[vcpu_id][irq] = sources;
921
922 /*
923	 * If the sources bitmap has been cleared, it means that we
924	 * could queue all the SGIs onto link registers (see the
925	 * clear_bit above), and therefore we are done with them in
926	 * our emulated GIC and can get rid of them.
927 */
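	/*
	 * Illustrative example: sources == 0b101 means both VCPU0 and
	 * VCPU2 sent this SGI; it needs two list register slots (or two
	 * passes through here) before it can be cleared.
	 */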
928 if (!sources) {
929 vgic_dist_irq_clear(vcpu, irq);
930 vgic_cpu_irq_clear(vcpu, irq);
931 return true;
932 }
933
934 return false;
935}
936
937static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
938{
939 if (vgic_irq_is_active(vcpu, irq))
940 return true; /* level interrupt, already queued */
941
942 if (vgic_queue_irq(vcpu, 0, irq)) {
943 if (vgic_irq_is_edge(vcpu, irq)) {
944 vgic_dist_irq_clear(vcpu, irq);
945 vgic_cpu_irq_clear(vcpu, irq);
946 } else {
947 vgic_irq_set_active(vcpu, irq);
948 }
949
950 return true;
951 }
952
953 return false;
954}
955
956/*
957 * Fill the list registers with pending interrupts before running the
958 * guest.
959 */
960static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
961{
962 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
963 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
964 int i, vcpu_id;
965 int overflow = 0;
966
967 vcpu_id = vcpu->vcpu_id;
968
969 /*
970 * We may not have any pending interrupt, or the interrupts
971 * may have been serviced from another vcpu. In all cases,
972	 * may have been serviced from another vcpu. In either case,
973 */
974 if (!kvm_vgic_vcpu_pending_irq(vcpu)) {
975 pr_debug("CPU%d has no pending interrupt\n", vcpu_id);
976 goto epilog;
977 }
978
979 /* SGIs */
980 for_each_set_bit(i, vgic_cpu->pending_percpu, VGIC_NR_SGIS) {
981 if (!vgic_queue_sgi(vcpu, i))
982 overflow = 1;
983 }
984
985 /* PPIs */
986 for_each_set_bit_from(i, vgic_cpu->pending_percpu, VGIC_NR_PRIVATE_IRQS) {
987 if (!vgic_queue_hwirq(vcpu, i))
988 overflow = 1;
989 }
990
991 /* SPIs */
992 for_each_set_bit(i, vgic_cpu->pending_shared, VGIC_NR_SHARED_IRQS) {
993 if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
994 overflow = 1;
995 }
996
997epilog:
998 if (overflow) {
999 vgic_cpu->vgic_hcr |= GICH_HCR_UIE;
1000 } else {
1001 vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE;
1002 /*
1003 * We're about to run this VCPU, and we've consumed
1004 * everything the distributor had in store for
1005 * us. Claim we don't have anything pending. We'll
1006 * adjust that if needed while exiting.
1007 */
1008 clear_bit(vcpu_id, &dist->irq_pending_on_cpu);
1009 }
1010}
1011
1012static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
1013{
1014 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1015 bool level_pending = false;
1016
1017 kvm_debug("MISR = %08x\n", vgic_cpu->vgic_misr);
1018
1019 if (vgic_cpu->vgic_misr & GICH_MISR_EOI) {
1020 /*
1021 * Some level interrupts have been EOIed. Clear their
1022 * active bit.
1023 */
1024 int lr, irq;
1025
1026 for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_eisr,
1027 vgic_cpu->nr_lr) {
1028 irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID;
1029
1030 vgic_irq_clear_active(vcpu, irq);
1031 vgic_cpu->vgic_lr[lr] &= ~GICH_LR_EOI;
1032
1033 /* Any additional pending interrupt? */
1034 if (vgic_dist_irq_is_pending(vcpu, irq)) {
1035 vgic_cpu_irq_set(vcpu, irq);
1036 level_pending = true;
1037 } else {
1038 vgic_cpu_irq_clear(vcpu, irq);
1039 }
1040
1041 /*
1042 * Despite being EOIed, the LR may not have
1043 * been marked as empty.
1044 */
1045 set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr);
1046 vgic_cpu->vgic_lr[lr] &= ~GICH_LR_ACTIVE_BIT;
1047 }
1048 }
1049
1050 if (vgic_cpu->vgic_misr & GICH_MISR_U)
1051 vgic_cpu->vgic_hcr &= ~GICH_HCR_UIE;
1052
1053 return level_pending;
1054}
1055
1056/*
1057 * Sync back the VGIC state after a guest run. The distributor lock is
1058 * needed so we don't get preempted in the middle of the state processing.
1059 */
1060static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1061{
1062 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1063 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1064 int lr, pending;
1065 bool level_pending;
1066
1067 level_pending = vgic_process_maintenance(vcpu);
1068
1069 /* Clear mappings for empty LRs */
1070 for_each_set_bit(lr, (unsigned long *)vgic_cpu->vgic_elrsr,
1071 vgic_cpu->nr_lr) {
1072 int irq;
1073
1074 if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
1075 continue;
1076
1077 irq = vgic_cpu->vgic_lr[lr] & GICH_LR_VIRTUALID;
1078
1079 BUG_ON(irq >= VGIC_NR_IRQS);
1080 vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
1081 }
1082
1083 /* Check if we still have something up our sleeve... */
1084 pending = find_first_zero_bit((unsigned long *)vgic_cpu->vgic_elrsr,
1085 vgic_cpu->nr_lr);
1086 if (level_pending || pending < vgic_cpu->nr_lr)
1087 set_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu);
1088}
1089
1090void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
1091{
1092 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1093
1094 if (!irqchip_in_kernel(vcpu->kvm))
1095 return;
1096
1097 spin_lock(&dist->lock);
1098 __kvm_vgic_flush_hwstate(vcpu);
1099 spin_unlock(&dist->lock);
1100}
1101
1102void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
1103{
1104 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1105
1106 if (!irqchip_in_kernel(vcpu->kvm))
1107 return;
1108
1109 spin_lock(&dist->lock);
1110 __kvm_vgic_sync_hwstate(vcpu);
1111 spin_unlock(&dist->lock);
1112}
1113
1114int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
1115{
1116 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1117
1118 if (!irqchip_in_kernel(vcpu->kvm))
1119 return 0;
1120
1121 return test_bit(vcpu->vcpu_id, &dist->irq_pending_on_cpu);
1122}
1123
1124static void vgic_kick_vcpus(struct kvm *kvm)
1125{
1126 struct kvm_vcpu *vcpu;
1127 int c;
1128
1129 /*
1130	 * We've injected an interrupt; time to find out who deserves
1131 * a good kick...
1132 */
1133 kvm_for_each_vcpu(c, vcpu, kvm) {
1134 if (kvm_vgic_vcpu_pending_irq(vcpu))
1135 kvm_vcpu_kick(vcpu);
1136 }
1137}
1138
1139static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
1140{
1141 int is_edge = vgic_irq_is_edge(vcpu, irq);
1142 int state = vgic_dist_irq_is_pending(vcpu, irq);
1143
1144 /*
1145 * Only inject an interrupt if:
1146 * - edge triggered and we have a rising edge
1147 * - level triggered and we change level
1148 */
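	/*
	 * Illustrative example: re-asserting an already pending level
	 * interrupt (level == 1, state == 1) is swallowed, while the
	 * same injection on an edge interrupt with state == 0 is a
	 * valid rising edge.
	 */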
1149 if (is_edge)
1150 return level > state;
1151 else
1152 return level != state;
1153}
1154
1155static bool vgic_update_irq_state(struct kvm *kvm, int cpuid,
1156 unsigned int irq_num, bool level)
1157{
1158 struct vgic_dist *dist = &kvm->arch.vgic;
1159 struct kvm_vcpu *vcpu;
1160 int is_edge, is_level;
1161 int enabled;
1162 bool ret = true;
1163
1164 spin_lock(&dist->lock);
1165
1166 vcpu = kvm_get_vcpu(kvm, cpuid);
1167 is_edge = vgic_irq_is_edge(vcpu, irq_num);
1168 is_level = !is_edge;
1169
1170 if (!vgic_validate_injection(vcpu, irq_num, level)) {
1171 ret = false;
1172 goto out;
1173 }
1174
1175 if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
1176 cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
1177 vcpu = kvm_get_vcpu(kvm, cpuid);
1178 }
1179
1180 kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
1181
1182 if (level)
1183 vgic_dist_irq_set(vcpu, irq_num);
1184 else
1185 vgic_dist_irq_clear(vcpu, irq_num);
1186
1187 enabled = vgic_irq_is_enabled(vcpu, irq_num);
1188
1189 if (!enabled) {
1190 ret = false;
1191 goto out;
1192 }
1193
1194 if (is_level && vgic_irq_is_active(vcpu, irq_num)) {
1195 /*
1196 * Level interrupt in progress, will be picked up
1197 * when EOId.
1198 */
1199 ret = false;
1200 goto out;
1201 }
1202
1203 if (level) {
1204 vgic_cpu_irq_set(vcpu, irq_num);
1205 set_bit(cpuid, &dist->irq_pending_on_cpu);
1206 }
1207
1208out:
1209 spin_unlock(&dist->lock);
1210
1211 return ret;
1212}
1213
1214/**
1215 * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
1216 * @kvm: The VM structure pointer
1217 * @cpuid: The CPU for PPIs
1218 * @irq_num: The IRQ number that is assigned to the device
1219 * @level: Edge-triggered: true: to trigger the interrupt
1220 * false: to ignore the call
1221 *	     Level-sensitive:  true:  activates an interrupt
1222 * false: deactivates an interrupt
1223 *
1224 * The GIC is not concerned with devices being active-LOW or active-HIGH for
1225 * level-sensitive interrupts. You can think of the level parameter as 1
1226 * being HIGH and 0 being LOW and all devices being active-HIGH.
1227 */
1228int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
1229 bool level)
1230{
1231 if (vgic_update_irq_state(kvm, cpuid, irq_num, level))
1232 vgic_kick_vcpus(kvm);
1233
1234 return 0;
1235}
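/*
 * Illustrative usage: a device model raising a level-sensitive SPI,
 * say IRQ 40, calls kvm_vgic_inject_irq(kvm, 0, 40, true) when its
 * line asserts and again with level == false when it deasserts. For
 * SPIs the cpuid argument is overridden by the configured target, so
 * it only matters for private interrupts.
 */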
1236
1237static irqreturn_t vgic_maintenance_handler(int irq, void *data)
1238{
1239 /*
1240 * We cannot rely on the vgic maintenance interrupt to be
1241 * delivered synchronously. This means we can only use it to
1242 * exit the VM, and we perform the handling of EOIed
1243 * interrupts on the exit path (see vgic_process_maintenance).
1244 */
1245 return IRQ_HANDLED;
1246}
1247
1248int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
1249{
1250 struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
1251 struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
1252 int i;
1253
1254 if (!irqchip_in_kernel(vcpu->kvm))
1255 return 0;
1256
1257 if (vcpu->vcpu_id >= VGIC_MAX_CPUS)
1258 return -EBUSY;
1259
1260 for (i = 0; i < VGIC_NR_IRQS; i++) {
1261 if (i < VGIC_NR_PPIS)
1262 vgic_bitmap_set_irq_val(&dist->irq_enabled,
1263 vcpu->vcpu_id, i, 1);
1264 if (i < VGIC_NR_PRIVATE_IRQS)
1265 vgic_bitmap_set_irq_val(&dist->irq_cfg,
1266 vcpu->vcpu_id, i, VGIC_CFG_EDGE);
1267
1268 vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY;
1269 }
1270
1271 /*
1272	 * Forcing VMCR to zero makes the GIC restore the binary
1273 * points to their reset values. Anything else resets to zero
1274 * anyway.
1275 */
1276 vgic_cpu->vgic_vmcr = 0;
1277
1278 vgic_cpu->nr_lr = vgic_nr_lr;
1279 vgic_cpu->vgic_hcr = GICH_HCR_EN; /* Get the show on the road... */
1280
1281 return 0;
1282}
1283
1284static void vgic_init_maintenance_interrupt(void *info)
1285{
1286 enable_percpu_irq(vgic_maint_irq, 0);
1287}
1288
1289static int vgic_cpu_notify(struct notifier_block *self,
1290 unsigned long action, void *cpu)
1291{
1292 switch (action) {
1293 case CPU_STARTING:
1294 case CPU_STARTING_FROZEN:
1295 vgic_init_maintenance_interrupt(NULL);
1296 break;
1297 case CPU_DYING:
1298 case CPU_DYING_FROZEN:
1299 disable_percpu_irq(vgic_maint_irq);
1300 break;
1301 }
1302
1303 return NOTIFY_OK;
1304}
1305
1306static struct notifier_block vgic_cpu_nb = {
1307 .notifier_call = vgic_cpu_notify,
1308};
1309
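/*
 * A matching DT node might look like this (illustrative addresses,
 * loosely modelled on Versatile Express; only the indices matter:
 * reg entry 2 is VCTRL, reg entry 3 is VCPU, and interrupt 0 is the
 * maintenance IRQ):
 *
 *	interrupt-controller@2c001000 {
 *		compatible = "arm,cortex-a15-gic";
 *		reg = <0x2c001000 0x1000>,
 *		      <0x2c002000 0x1000>,
 *		      <0x2c004000 0x2000>,
 *		      <0x2c006000 0x2000>;
 *		interrupts = <1 9 0xf04>;
 *	};
 */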
1310int kvm_vgic_hyp_init(void)
1311{
1312 int ret;
1313 struct resource vctrl_res;
1314 struct resource vcpu_res;
1315
1316 vgic_node = of_find_compatible_node(NULL, NULL, "arm,cortex-a15-gic");
1317 if (!vgic_node) {
1318 kvm_err("error: no compatible vgic node in DT\n");
1319 return -ENODEV;
1320 }
1321
1322 vgic_maint_irq = irq_of_parse_and_map(vgic_node, 0);
1323 if (!vgic_maint_irq) {
1324 kvm_err("error getting vgic maintenance irq from DT\n");
1325 ret = -ENXIO;
1326 goto out;
1327 }
1328
1329 ret = request_percpu_irq(vgic_maint_irq, vgic_maintenance_handler,
1330 "vgic", kvm_get_running_vcpus());
1331 if (ret) {
1332 kvm_err("Cannot register interrupt %d\n", vgic_maint_irq);
1333 goto out;
1334 }
1335
1336 ret = register_cpu_notifier(&vgic_cpu_nb);
1337 if (ret) {
1338 kvm_err("Cannot register vgic CPU notifier\n");
1339 goto out_free_irq;
1340 }
1341
1342 ret = of_address_to_resource(vgic_node, 2, &vctrl_res);
1343 if (ret) {
1344 kvm_err("Cannot obtain VCTRL resource\n");
1345 goto out_free_irq;
1346 }
1347
1348 vgic_vctrl_base = of_iomap(vgic_node, 2);
1349 if (!vgic_vctrl_base) {
1350 kvm_err("Cannot ioremap VCTRL\n");
1351 ret = -ENOMEM;
1352 goto out_free_irq;
1353 }
1354
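	/*
	 * GICH_VTR.ListRegs encodes the number of list registers minus
	 * one, so a raw read of e.g. 0x3 means the HW implements 4 LRs.
	 */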
1355 vgic_nr_lr = readl_relaxed(vgic_vctrl_base + GICH_VTR);
1356 vgic_nr_lr = (vgic_nr_lr & 0x3f) + 1;
1357
1358 ret = create_hyp_io_mappings(vgic_vctrl_base,
1359 vgic_vctrl_base + resource_size(&vctrl_res),
1360 vctrl_res.start);
1361 if (ret) {
1362 kvm_err("Cannot map VCTRL into hyp\n");
1363 goto out_unmap;
1364 }
1365
1366 kvm_info("%s@%llx IRQ%d\n", vgic_node->name,
1367 vctrl_res.start, vgic_maint_irq);
1368 on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1);
1369
1370 if (of_address_to_resource(vgic_node, 3, &vcpu_res)) {
1371 kvm_err("Cannot obtain VCPU resource\n");
1372 ret = -ENXIO;
1373 goto out_unmap;
1374 }
1375 vgic_vcpu_base = vcpu_res.start;
1376
1377 goto out;
1378
1379out_unmap:
1380 iounmap(vgic_vctrl_base);
1381out_free_irq:
1382 free_percpu_irq(vgic_maint_irq, kvm_get_running_vcpus());
1383out:
1384 of_node_put(vgic_node);
1385 return ret;
1386}
1387
1388int kvm_vgic_init(struct kvm *kvm)
1389{
1390 int ret = 0, i;
1391
1392 mutex_lock(&kvm->lock);
1393
1394 if (vgic_initialized(kvm))
1395 goto out;
1396
1397 if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) ||
1398 IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_cpu_base)) {
1399 kvm_err("Need to set vgic cpu and dist addresses first\n");
1400 ret = -ENXIO;
1401 goto out;
1402 }
1403
1404 ret = kvm_phys_addr_ioremap(kvm, kvm->arch.vgic.vgic_cpu_base,
1405 vgic_vcpu_base, KVM_VGIC_V2_CPU_SIZE);
1406 if (ret) {
1407 kvm_err("Unable to remap VGIC CPU to VCPU\n");
1408 goto out;
1409 }
1410
1411 for (i = VGIC_NR_PRIVATE_IRQS; i < VGIC_NR_IRQS; i += 4)
1412 vgic_set_target_reg(kvm, 0, i);
1413
1414 kvm_timer_init(kvm);
1415 kvm->arch.vgic.ready = true;
1416out:
1417 mutex_unlock(&kvm->lock);
1418 return ret;
1419}
1420
1421int kvm_vgic_create(struct kvm *kvm)
1422{
1423 int ret = 0;
1424
1425 mutex_lock(&kvm->lock);
1426
1427 if (atomic_read(&kvm->online_vcpus) || kvm->arch.vgic.vctrl_base) {
1428 ret = -EEXIST;
1429 goto out;
1430 }
1431
1432 spin_lock_init(&kvm->arch.vgic.lock);
1433 kvm->arch.vgic.vctrl_base = vgic_vctrl_base;
1434 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
1435 kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
1436
1437out:
1438 mutex_unlock(&kvm->lock);
1439 return ret;
1440}
1441
1442static int vgic_ioaddr_overlap(struct kvm *kvm)
1443{
1444 phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
1445 phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
1446
1447 if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
1448 return 0;
1449 if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
1450 (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
1451 return -EBUSY;
1452 return 0;
1453}
1454
1455static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
1456 phys_addr_t addr, phys_addr_t size)
1457{
1458 int ret;
1459
1460 if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
1461 return -EEXIST;
1462 if (addr + size < addr)
1463 return -EINVAL;
1464
1465 ret = vgic_ioaddr_overlap(kvm);
1466 if (ret)
1467 return ret;
1468 *ioaddr = addr;
1469 return ret;
1470}
1471
1472int kvm_vgic_set_addr(struct kvm *kvm, unsigned long type, u64 addr)
1473{
1474 int r = 0;
1475 struct vgic_dist *vgic = &kvm->arch.vgic;
1476
1477 if (addr & ~KVM_PHYS_MASK)
1478 return -E2BIG;
1479
1480 if (addr & (SZ_4K - 1))
1481 return -EINVAL;
1482
1483 mutex_lock(&kvm->lock);
1484 switch (type) {
1485 case KVM_VGIC_V2_ADDR_TYPE_DIST:
1486 r = vgic_ioaddr_assign(kvm, &vgic->vgic_dist_base,
1487 addr, KVM_VGIC_V2_DIST_SIZE);
1488 break;
1489 case KVM_VGIC_V2_ADDR_TYPE_CPU:
1490 r = vgic_ioaddr_assign(kvm, &vgic->vgic_cpu_base,
1491 addr, KVM_VGIC_V2_CPU_SIZE);
1492 break;
1493 default:
1494 r = -ENODEV;
1495 }
1496
1497 mutex_unlock(&kvm->lock);
1498 return r;
1499}
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 1a4053789d01..18e45ec49bbf 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -47,12 +47,13 @@ FORCE : $(obj)/$(offsets-file)
47 47
48ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ 48ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
49asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ 49asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
50KVM := ../../../virt/kvm
50 51
51common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 52common-objs = $(KVM)/kvm_main.o $(KVM)/ioapic.o \
52 coalesced_mmio.o irq_comm.o) 53 $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o
53 54
54ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y) 55ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
55common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o) 56common-objs += $(KVM)/assigned-dev.o $(KVM)/iommu.o
56endif 57endif
57 58
58kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o 59kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 349ed85c7d61..08891d07aeb6 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -107,8 +107,9 @@ struct kvmppc_vcpu_book3s {
107#define CONTEXT_GUEST 1 107#define CONTEXT_GUEST 1
108#define CONTEXT_GUEST_END 2 108#define CONTEXT_GUEST_END 2
109 109
110#define VSID_REAL 0x1fffffffffc00000ULL 110#define VSID_REAL 0x0fffffffffc00000ULL
111#define VSID_BAT 0x1fffffffffb00000ULL 111#define VSID_BAT 0x0fffffffffb00000ULL
112#define VSID_1T 0x1000000000000000ULL
112#define VSID_REAL_DR 0x2000000000000000ULL 113#define VSID_REAL_DR 0x2000000000000000ULL
113#define VSID_REAL_IR 0x4000000000000000ULL 114#define VSID_REAL_IR 0x4000000000000000ULL
114#define VSID_PR 0x8000000000000000ULL 115#define VSID_PR 0x8000000000000000ULL
@@ -123,6 +124,7 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
123extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); 124extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
124extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); 125extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
125extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); 126extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
127extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong seg_size);
126extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); 128extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
127extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 129extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
128 struct kvm_vcpu *vcpu, unsigned long addr, 130 struct kvm_vcpu *vcpu, unsigned long addr,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 422de3f4d46c..008cd856c5b5 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -5,9 +5,10 @@
5subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror 5subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
6 6
7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm 7ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
8KVM := ../../../virt/kvm
8 9
9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ 10common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
10 eventfd.o) 11 $(KVM)/eventfd.o
11 12
12CFLAGS_44x_tlb.o := -I. 13CFLAGS_44x_tlb.o := -I.
13CFLAGS_e500_mmu.o := -I. 14CFLAGS_e500_mmu.o := -I.
@@ -53,7 +54,7 @@ kvm-e500mc-objs := \
53kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) 54kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
54 55
55kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ 56kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
56 ../../../virt/kvm/coalesced_mmio.o \ 57 $(KVM)/coalesced_mmio.o \
57 fpu.o \ 58 fpu.o \
58 book3s_paired_singles.o \ 59 book3s_paired_singles.o \
59 book3s_pr.o \ 60 book3s_pr.o \
@@ -86,8 +87,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
86 book3s_xics.o 87 book3s_xics.o
87 88
88kvm-book3s_64-module-objs := \ 89kvm-book3s_64-module-objs := \
89 ../../../virt/kvm/kvm_main.o \ 90 $(KVM)/kvm_main.o \
90 ../../../virt/kvm/eventfd.o \ 91 $(KVM)/eventfd.o \
91 powerpc.o \ 92 powerpc.o \
92 emulate.o \ 93 emulate.o \
93 book3s.o \ 94 book3s.o \
@@ -111,7 +112,7 @@ kvm-book3s_32-objs := \
111kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) 112kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
112 113
113kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o 114kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
114kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o) 115kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
115 116
116kvm-objs := $(kvm-objs-m) $(kvm-objs-y) 117kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
117 118
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b871721c0050..739bfbadb85e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -26,6 +26,7 @@
26#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
27#include <asm/kvm_ppc.h> 27#include <asm/kvm_ppc.h>
28#include <asm/kvm_book3s.h> 28#include <asm/kvm_book3s.h>
29#include <asm/mmu-hash64.h>
29 30
30/* #define DEBUG_MMU */ 31/* #define DEBUG_MMU */
31 32
@@ -76,6 +77,24 @@ static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
76 return NULL; 77 return NULL;
77} 78}
78 79
80static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe)
81{
82 return slbe->tb ? SID_SHIFT_1T : SID_SHIFT;
83}
84
85static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe)
86{
87 return (1ul << kvmppc_slb_sid_shift(slbe)) - 1;
88}
89
90static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr)
91{
92 eaddr &= kvmppc_slb_offset_mask(slb);
93
94 return (eaddr >> VPN_SHIFT) |
95 ((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT));
96}
97
79static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, 98static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
80 bool data) 99 bool data)
81{ 100{
@@ -85,11 +104,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
85 if (!slb) 104 if (!slb)
86 return 0; 105 return 0;
87 106
88 if (slb->tb) 107 return kvmppc_slb_calc_vpn(slb, eaddr);
89 return (((u64)eaddr >> 12) & 0xfffffff) |
90 (((u64)slb->vsid) << 28);
91
92 return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16);
93} 108}
94 109
95static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) 110static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
@@ -100,7 +115,8 @@ static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
100static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) 115static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
101{ 116{
102 int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); 117 int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
103 return ((eaddr & 0xfffffff) >> p); 118
119 return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
104} 120}
105 121
106static hva_t kvmppc_mmu_book3s_64_get_pteg( 122static hva_t kvmppc_mmu_book3s_64_get_pteg(
@@ -109,13 +125,15 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
109 bool second) 125 bool second)
110{ 126{
111 u64 hash, pteg, htabsize; 127 u64 hash, pteg, htabsize;
112 u32 page; 128 u32 ssize;
113 hva_t r; 129 hva_t r;
130 u64 vpn;
114 131
115 page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
116 htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); 132 htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
117 133
118 hash = slbe->vsid ^ page; 134 vpn = kvmppc_slb_calc_vpn(slbe, eaddr);
135 ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M;
136 hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);
119 if (second) 137 if (second)
120 hash = ~hash; 138 hash = ~hash;
121 hash &= ((1ULL << 39ULL) - 1ULL); 139 hash &= ((1ULL << 39ULL) - 1ULL);
@@ -146,7 +164,7 @@ static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
146 u64 avpn; 164 u64 avpn;
147 165
148 avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); 166 avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
149 avpn |= slbe->vsid << (28 - p); 167 avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
150 168
151 if (p < 24) 169 if (p < 24)
152 avpn >>= ((80 - p) - 56) - 8; 170 avpn >>= ((80 - p) - 56) - 8;
@@ -167,7 +185,6 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
167 int i; 185 int i;
168 u8 key = 0; 186 u8 key = 0;
169 bool found = false; 187 bool found = false;
170 bool perm_err = false;
171 int second = 0; 188 int second = 0;
172 ulong mp_ea = vcpu->arch.magic_page_ea; 189 ulong mp_ea = vcpu->arch.magic_page_ea;
173 190
@@ -190,13 +207,15 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
190 if (!slbe) 207 if (!slbe)
191 goto no_seg_found; 208 goto no_seg_found;
192 209
210 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
211 if (slbe->tb)
212 avpn |= SLB_VSID_B_1T;
213
193do_second: 214do_second:
194 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); 215 ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
195 if (kvm_is_error_hva(ptegp)) 216 if (kvm_is_error_hva(ptegp))
196 goto no_page_found; 217 goto no_page_found;
197 218
198 avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
199
200 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { 219 if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
201 printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); 220 printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
202 goto no_page_found; 221 goto no_page_found;
@@ -219,7 +238,7 @@ do_second:
219 continue; 238 continue;
220 239
221 /* AVPN compare */ 240 /* AVPN compare */
222 if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { 241 if (HPTE_V_COMPARE(avpn, v)) {
223 u8 pp = (r & HPTE_R_PP) | key; 242 u8 pp = (r & HPTE_R_PP) | key;
224 int eaddr_mask = 0xFFF; 243 int eaddr_mask = 0xFFF;
225 244
@@ -248,11 +267,6 @@ do_second:
248 break; 267 break;
249 } 268 }
250 269
251 if (!gpte->may_read) {
252 perm_err = true;
253 continue;
254 }
255
256 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " 270 dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
257 "-> 0x%lx\n", 271 "-> 0x%lx\n",
258 eaddr, avpn, gpte->vpage, gpte->raddr); 272 eaddr, avpn, gpte->vpage, gpte->raddr);
@@ -281,6 +295,8 @@ do_second:
281 if (pteg[i+1] != oldr) 295 if (pteg[i+1] != oldr)
282 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); 296 copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
283 297
298 if (!gpte->may_read)
299 return -EPERM;
284 return 0; 300 return 0;
285 } else { 301 } else {
286 dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " 302 dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
@@ -296,13 +312,7 @@ do_second:
296 } 312 }
297 } 313 }
298 314
299
300no_page_found: 315no_page_found:
301
302
303 if (perm_err)
304 return -EPERM;
305
306 return -ENOENT; 316 return -ENOENT;
307 317
308no_seg_found: 318no_seg_found:
@@ -334,7 +344,7 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
334 slbe->large = (rs & SLB_VSID_L) ? 1 : 0; 344 slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
335 slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; 345 slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0;
336 slbe->esid = slbe->tb ? esid_1t : esid; 346 slbe->esid = slbe->tb ? esid_1t : esid;
337 slbe->vsid = rs >> 12; 347 slbe->vsid = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);
338 slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; 348 slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
339 slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; 349 slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0;
340 slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0; 350 slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0;
@@ -375,6 +385,7 @@ static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
375static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) 385static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
376{ 386{
377 struct kvmppc_slb *slbe; 387 struct kvmppc_slb *slbe;
388 u64 seg_size;
378 389
379 dprintk("KVM MMU: slbie(0x%llx)\n", ea); 390 dprintk("KVM MMU: slbie(0x%llx)\n", ea);
380 391
@@ -386,8 +397,11 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
386 dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); 397 dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
387 398
388 slbe->valid = false; 399 slbe->valid = false;
400 slbe->orige = 0;
401 slbe->origv = 0;
389 402
390 kvmppc_mmu_map_segment(vcpu, ea); 403 seg_size = 1ull << kvmppc_slb_sid_shift(slbe);
404 kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);
391} 405}
392 406
393static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) 407static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
@@ -396,8 +410,11 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
396 410
397 dprintk("KVM MMU: slbia()\n"); 411 dprintk("KVM MMU: slbia()\n");
398 412
399 for (i = 1; i < vcpu->arch.slb_nr; i++) 413 for (i = 1; i < vcpu->arch.slb_nr; i++) {
400 vcpu->arch.slb[i].valid = false; 414 vcpu->arch.slb[i].valid = false;
415 vcpu->arch.slb[i].orige = 0;
416 vcpu->arch.slb[i].origv = 0;
417 }
401 418
402 if (vcpu->arch.shared->msr & MSR_IR) { 419 if (vcpu->arch.shared->msr & MSR_IR) {
403 kvmppc_mmu_flush_segments(vcpu); 420 kvmppc_mmu_flush_segments(vcpu);
@@ -467,8 +484,14 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
467 484
468 if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 485 if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
469 slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); 486 slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
470 if (slb) 487 if (slb) {
471 gvsid = slb->vsid; 488 gvsid = slb->vsid;
489 if (slb->tb) {
490 gvsid <<= SID_SHIFT_1T - SID_SHIFT;
491 gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
492 gvsid |= VSID_1T;
493 }
494 }
472 } 495 }
473 496
474 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) { 497 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 3a9a1aceb14f..b350d9494b26 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -301,6 +301,23 @@ out:
301 return r; 301 return r;
302} 302}
303 303
304void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
305{
306 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
307 ulong seg_mask = -seg_size;
308 int i;
309
310 for (i = 1; i < svcpu->slb_max; i++) {
311 if ((svcpu->slb[i].esid & SLB_ESID_V) &&
312 (svcpu->slb[i].esid & seg_mask) == ea) {
313 /* Invalidate this entry */
314 svcpu->slb[i].esid = 0;
315 }
316 }
317
318 svcpu_put(svcpu);
319}
320
304void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) 321void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
305{ 322{
306 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); 323 struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -325,9 +342,9 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
325 return -1; 342 return -1;
326 vcpu3s->context_id[0] = err; 343 vcpu3s->context_id[0] = err;
327 344
328 vcpu3s->proto_vsid_max = ((vcpu3s->context_id[0] + 1) 345 vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1)
329 << ESID_BITS) - 1; 346 << ESID_BITS) - 1;
330 vcpu3s->proto_vsid_first = vcpu3s->context_id[0] << ESID_BITS; 347 vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS;
331 vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; 348 vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;
332 349
333 kvmppc_mmu_hpte_init(vcpu); 350 kvmppc_mmu_hpte_init(vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 56b983e7b738..4f0caecc0f9d 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -66,10 +66,6 @@ slb_exit_skip_ ## num:
66 66
67 ld r12, PACA_SLBSHADOWPTR(r13) 67 ld r12, PACA_SLBSHADOWPTR(r13)
68 68
69 /* Save off the first entry so we can slbie it later */
70 ld r10, SHADOW_SLB_ESID(0)(r12)
71 ld r11, SHADOW_SLB_VSID(0)(r12)
72
73 /* Remove bolted entries */ 69 /* Remove bolted entries */
74 UNBOLT_SLB_ENTRY(0) 70 UNBOLT_SLB_ENTRY(0)
75 UNBOLT_SLB_ENTRY(1) 71 UNBOLT_SLB_ENTRY(1)
@@ -81,15 +77,10 @@ slb_exit_skip_ ## num:
81 77
82 /* Flush SLB */ 78 /* Flush SLB */
83 79
80 li r10, 0
81 slbmte r10, r10
84 slbia 82 slbia
85 83
86 /* r0 = esid & ESID_MASK */
87 rldicr r10, r10, 0, 35
88 /* r0 |= CLASS_BIT(VSID) */
89 rldic r12, r11, 56 - 36, 36
90 or r10, r10, r12
91 slbie r10
92
93 /* Fill SLB with our shadow */ 84 /* Fill SLB with our shadow */
94 85
95 lbz r12, SVCPU_SLB_MAX(r3) 86 lbz r12, SVCPU_SLB_MAX(r3)
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index bdc40b8e77d9..19498a567a81 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1239,8 +1239,7 @@ out:
1239#ifdef CONFIG_PPC64 1239#ifdef CONFIG_PPC64
1240int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) 1240int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
1241{ 1241{
1242 /* No flags */ 1242 info->flags = KVM_PPC_1T_SEGMENTS;
1243 info->flags = 0;
1244 1243
1245 /* SLB is always 64 entries */ 1244 /* SLB is always 64 entries */
1246 info->slb_size = 64; 1245 info->slb_size = 64;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1a1b51189773..dcc94f016007 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -796,7 +796,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
796 kvmppc_fill_pt_regs(&regs); 796 kvmppc_fill_pt_regs(&regs);
797 timer_interrupt(&regs); 797 timer_interrupt(&regs);
798 break; 798 break;
799#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) 799#if defined(CONFIG_PPC_DOORBELL)
800 case BOOKE_INTERRUPT_DOORBELL: 800 case BOOKE_INTERRUPT_DOORBELL:
801 kvmppc_fill_pt_regs(&regs); 801 kvmppc_fill_pt_regs(&regs);
802 doorbell_exception(&regs); 802 doorbell_exception(&regs);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 631a2650e4e4..2c52ada30775 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -169,6 +169,9 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
169 vcpu->arch.shared->sprg3 = spr_val; 169 vcpu->arch.shared->sprg3 = spr_val;
170 break; 170 break;
171 171
172 /* PIR can legally be written, but we ignore it */
173 case SPRN_PIR: break;
174
172 default: 175 default:
173 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, 176 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
174 spr_val); 177 spr_val);
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 16bd5d169cdb..3238d4004e84 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -62,13 +62,20 @@ struct sca_block {
62#define CPUSTAT_MCDS 0x00000100 62#define CPUSTAT_MCDS 0x00000100
63#define CPUSTAT_SM 0x00000080 63#define CPUSTAT_SM 0x00000080
64#define CPUSTAT_G 0x00000008 64#define CPUSTAT_G 0x00000008
65#define CPUSTAT_GED 0x00000004
65#define CPUSTAT_J 0x00000002 66#define CPUSTAT_J 0x00000002
66#define CPUSTAT_P 0x00000001 67#define CPUSTAT_P 0x00000001
67 68
68struct kvm_s390_sie_block { 69struct kvm_s390_sie_block {
69 atomic_t cpuflags; /* 0x0000 */ 70 atomic_t cpuflags; /* 0x0000 */
70 __u32 prefix; /* 0x0004 */ 71 __u32 prefix; /* 0x0004 */
71 __u8 reserved8[32]; /* 0x0008 */ 72 __u8 reserved08[4]; /* 0x0008 */
73#define PROG_IN_SIE (1<<0)
74 __u32 prog0c; /* 0x000c */
75 __u8 reserved10[16]; /* 0x0010 */
76#define PROG_BLOCK_SIE 0x00000001
77 atomic_t prog20; /* 0x0020 */
78 __u8 reserved24[4]; /* 0x0024 */
72 __u64 cputm; /* 0x0028 */ 79 __u64 cputm; /* 0x0028 */
73 __u64 ckc; /* 0x0030 */ 80 __u64 ckc; /* 0x0030 */
74 __u64 epoch; /* 0x0038 */ 81 __u64 epoch; /* 0x0038 */
@@ -90,7 +97,8 @@ struct kvm_s390_sie_block {
90 __u32 scaoh; /* 0x005c */ 97 __u32 scaoh; /* 0x005c */
91 __u8 reserved60; /* 0x0060 */ 98 __u8 reserved60; /* 0x0060 */
92 __u8 ecb; /* 0x0061 */ 99 __u8 ecb; /* 0x0061 */
93 __u8 reserved62[2]; /* 0x0062 */ 100 __u8 ecb2; /* 0x0062 */
101 __u8 reserved63[1]; /* 0x0063 */
94 __u32 scaol; /* 0x0064 */ 102 __u32 scaol; /* 0x0064 */
95 __u8 reserved68[4]; /* 0x0068 */ 103 __u8 reserved68[4]; /* 0x0068 */
96 __u32 todpr; /* 0x006c */ 104 __u32 todpr; /* 0x006c */
@@ -130,6 +138,7 @@ struct kvm_vcpu_stat {
130 u32 deliver_program_int; 138 u32 deliver_program_int;
131 u32 deliver_io_int; 139 u32 deliver_io_int;
132 u32 exit_wait_state; 140 u32 exit_wait_state;
141 u32 instruction_pfmf;
133 u32 instruction_stidp; 142 u32 instruction_stidp;
134 u32 instruction_spx; 143 u32 instruction_spx;
135 u32 instruction_stpx; 144 u32 instruction_stpx;
@@ -166,7 +175,7 @@ struct kvm_s390_ext_info {
166}; 175};
167 176
168#define PGM_OPERATION 0x01 177#define PGM_OPERATION 0x01
169#define PGM_PRIVILEGED_OPERATION 0x02 178#define PGM_PRIVILEGED_OP 0x02
170#define PGM_EXECUTE 0x03 179#define PGM_EXECUTE 0x03
171#define PGM_PROTECTION 0x04 180#define PGM_PROTECTION 0x04
172#define PGM_ADDRESSING 0x05 181#define PGM_ADDRESSING 0x05
@@ -219,7 +228,7 @@ struct kvm_s390_local_interrupt {
219 atomic_t active; 228 atomic_t active;
220 struct kvm_s390_float_interrupt *float_int; 229 struct kvm_s390_float_interrupt *float_int;
221 int timer_due; /* event indicator for waitqueue below */ 230 int timer_due; /* event indicator for waitqueue below */
222 wait_queue_head_t wq; 231 wait_queue_head_t *wq;
223 atomic_t *cpuflags; 232 atomic_t *cpuflags;
224 unsigned int action_bits; 233 unsigned int action_bits;
225}; 234};
@@ -266,4 +275,5 @@ struct kvm_arch{
266}; 275};
267 276
268extern int sie64a(struct kvm_s390_sie_block *, u64 *); 277extern int sie64a(struct kvm_s390_sie_block *, u64 *);
278extern char sie_exit;
269#endif 279#endif
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 5f0173a31693..1141fb3e7b21 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -14,3 +14,13 @@
14/* Per-CPU flags for PMU states */ 14/* Per-CPU flags for PMU states */
15#define PMU_F_RESERVED 0x1000 15#define PMU_F_RESERVED 0x1000
16#define PMU_F_ENABLED 0x2000 16#define PMU_F_ENABLED 0x2000
17
18#ifdef CONFIG_64BIT
19
20/* Perf callbacks */
21struct pt_regs;
22extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
23extern unsigned long perf_misc_flags(struct pt_regs *regs);
24#define perf_misc_flags(regs) perf_misc_flags(regs)
25
26#endif /* CONFIG_64BIT */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 9aefa3c64eb2..0ea4e591fa78 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -296,18 +296,16 @@ extern unsigned long MODULES_END;
296#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INV) 296#define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INV)
297 297
298/* Page status table bits for virtualization */ 298/* Page status table bits for virtualization */
299#define RCP_ACC_BITS 0xf0000000UL 299#define PGSTE_ACC_BITS 0xf0000000UL
300#define RCP_FP_BIT 0x08000000UL 300#define PGSTE_FP_BIT 0x08000000UL
301#define RCP_PCL_BIT 0x00800000UL 301#define PGSTE_PCL_BIT 0x00800000UL
302#define RCP_HR_BIT 0x00400000UL 302#define PGSTE_HR_BIT 0x00400000UL
303#define RCP_HC_BIT 0x00200000UL 303#define PGSTE_HC_BIT 0x00200000UL
304#define RCP_GR_BIT 0x00040000UL 304#define PGSTE_GR_BIT 0x00040000UL
305#define RCP_GC_BIT 0x00020000UL 305#define PGSTE_GC_BIT 0x00020000UL
306#define RCP_IN_BIT 0x00002000UL /* IPTE notify bit */ 306#define PGSTE_UR_BIT 0x00008000UL
307 307#define PGSTE_UC_BIT 0x00004000UL /* user dirty (migration) */
308/* User dirty / referenced bit for KVM's migration feature */ 308#define PGSTE_IN_BIT 0x00002000UL /* IPTE notify bit */
309#define KVM_UR_BIT 0x00008000UL
310#define KVM_UC_BIT 0x00004000UL
311 309
312#else /* CONFIG_64BIT */ 310#else /* CONFIG_64BIT */
313 311
@@ -364,18 +362,16 @@ extern unsigned long MODULES_END;
364 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO) 362 | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO)
365 363
366/* Page status table bits for virtualization */ 364/* Page status table bits for virtualization */
367#define RCP_ACC_BITS 0xf000000000000000UL 365#define PGSTE_ACC_BITS 0xf000000000000000UL
368#define RCP_FP_BIT 0x0800000000000000UL 366#define PGSTE_FP_BIT 0x0800000000000000UL
369#define RCP_PCL_BIT 0x0080000000000000UL 367#define PGSTE_PCL_BIT 0x0080000000000000UL
370#define RCP_HR_BIT 0x0040000000000000UL 368#define PGSTE_HR_BIT 0x0040000000000000UL
371#define RCP_HC_BIT 0x0020000000000000UL 369#define PGSTE_HC_BIT 0x0020000000000000UL
372#define RCP_GR_BIT 0x0004000000000000UL 370#define PGSTE_GR_BIT 0x0004000000000000UL
373#define RCP_GC_BIT 0x0002000000000000UL 371#define PGSTE_GC_BIT 0x0002000000000000UL
374#define RCP_IN_BIT 0x0000200000000000UL /* IPTE notify bit */ 372#define PGSTE_UR_BIT 0x0000800000000000UL
375 373#define PGSTE_UC_BIT 0x0000400000000000UL /* user dirty (migration) */
376/* User dirty / referenced bit for KVM's migration feature */ 374#define PGSTE_IN_BIT 0x0000200000000000UL /* IPTE notify bit */
377#define KVM_UR_BIT 0x0000800000000000UL
378#define KVM_UC_BIT 0x0000400000000000UL
379 375
380#endif /* CONFIG_64BIT */ 376#endif /* CONFIG_64BIT */
381 377
@@ -615,8 +611,8 @@ static inline pgste_t pgste_get_lock(pte_t *ptep)
615 asm( 611 asm(
616 " lg %0,%2\n" 612 " lg %0,%2\n"
617 "0: lgr %1,%0\n" 613 "0: lgr %1,%0\n"
618 " nihh %0,0xff7f\n" /* clear RCP_PCL_BIT in old */ 614 " nihh %0,0xff7f\n" /* clear PCL bit in old */
619 " oihh %1,0x0080\n" /* set RCP_PCL_BIT in new */ 615 " oihh %1,0x0080\n" /* set PCL bit in new */
620 " csg %0,%1,%2\n" 616 " csg %0,%1,%2\n"
621 " jl 0b\n" 617 " jl 0b\n"
622 : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) 618 : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
@@ -629,7 +625,7 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
629{ 625{
630#ifdef CONFIG_PGSTE 626#ifdef CONFIG_PGSTE
631 asm( 627 asm(
632 " nihh %1,0xff7f\n" /* clear RCP_PCL_BIT */ 628 " nihh %1,0xff7f\n" /* clear PCL bit */
633 " stg %1,%0\n" 629 " stg %1,%0\n"
634 : "=Q" (ptep[PTRS_PER_PTE]) 630 : "=Q" (ptep[PTRS_PER_PTE])
635 : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) 631 : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
@@ -662,14 +658,14 @@ static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste)
662 else if (bits) 658 else if (bits)
663 page_reset_referenced(address); 659 page_reset_referenced(address);
664 /* Transfer page changed & referenced bit to guest bits in pgste */ 660 /* Transfer page changed & referenced bit to guest bits in pgste */
665 pgste_val(pgste) |= bits << 48; /* RCP_GR_BIT & RCP_GC_BIT */ 661 pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */
666 /* Get host changed & referenced bits from pgste */ 662 /* Get host changed & referenced bits from pgste */
667 bits |= (pgste_val(pgste) & (RCP_HR_BIT | RCP_HC_BIT)) >> 52; 663 bits |= (pgste_val(pgste) & (PGSTE_HR_BIT | PGSTE_HC_BIT)) >> 52;
668 /* Transfer page changed & referenced bit to kvm user bits */ 664 /* Transfer page changed & referenced bit to kvm user bits */
669 pgste_val(pgste) |= bits << 45; /* KVM_UR_BIT & KVM_UC_BIT */ 665 pgste_val(pgste) |= bits << 45; /* PGSTE_UR_BIT & PGSTE_UC_BIT */
670 /* Clear relevant host bits in pgste. */ 666 /* Clear relevant host bits in pgste. */
671 pgste_val(pgste) &= ~(RCP_HR_BIT | RCP_HC_BIT); 667 pgste_val(pgste) &= ~(PGSTE_HR_BIT | PGSTE_HC_BIT);
672 pgste_val(pgste) &= ~(RCP_ACC_BITS | RCP_FP_BIT); 668 pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
673 /* Copy page access key and fetch protection bit to pgste */ 669 /* Copy page access key and fetch protection bit to pgste */
674 pgste_val(pgste) |= 670 pgste_val(pgste) |=
675 (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56; 671 (unsigned long) (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
@@ -690,15 +686,15 @@ static inline pgste_t pgste_update_young(pte_t *ptep, pgste_t pgste)
690 /* Get referenced bit from storage key */ 686 /* Get referenced bit from storage key */
691 young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); 687 young = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
692 if (young) 688 if (young)
693 pgste_val(pgste) |= RCP_GR_BIT; 689 pgste_val(pgste) |= PGSTE_GR_BIT;
694 /* Get host referenced bit from pgste */ 690 /* Get host referenced bit from pgste */
695 if (pgste_val(pgste) & RCP_HR_BIT) { 691 if (pgste_val(pgste) & PGSTE_HR_BIT) {
696 pgste_val(pgste) &= ~RCP_HR_BIT; 692 pgste_val(pgste) &= ~PGSTE_HR_BIT;
697 young = 1; 693 young = 1;
698 } 694 }
699 /* Transfer referenced bit to kvm user bits and pte */ 695 /* Transfer referenced bit to kvm user bits and pte */
700 if (young) { 696 if (young) {
701 pgste_val(pgste) |= KVM_UR_BIT; 697 pgste_val(pgste) |= PGSTE_UR_BIT;
702 pte_val(*ptep) |= _PAGE_SWR; 698 pte_val(*ptep) |= _PAGE_SWR;
703 } 699 }
704#endif 700#endif
@@ -720,7 +716,7 @@ static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry)
720 * The guest C/R information is still in the PGSTE, set real 716 * The guest C/R information is still in the PGSTE, set real
721 * key C/R to 0. 717 * key C/R to 0.
722 */ 718 */
723 nkey = (pgste_val(pgste) & (RCP_ACC_BITS | RCP_FP_BIT)) >> 56; 719 nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
724 page_set_storage_key(address, nkey, 0); 720 page_set_storage_key(address, nkey, 0);
725#endif 721#endif
726} 722}
@@ -750,6 +746,7 @@ struct gmap {
750 struct mm_struct *mm; 746 struct mm_struct *mm;
751 unsigned long *table; 747 unsigned long *table;
752 unsigned long asce; 748 unsigned long asce;
749 void *private;
753 struct list_head crst_list; 750 struct list_head crst_list;
754}; 751};
755 752
@@ -808,8 +805,8 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
808 pte_t *ptep, pgste_t pgste) 805 pte_t *ptep, pgste_t pgste)
809{ 806{
810#ifdef CONFIG_PGSTE 807#ifdef CONFIG_PGSTE
811 if (pgste_val(pgste) & RCP_IN_BIT) { 808 if (pgste_val(pgste) & PGSTE_IN_BIT) {
812 pgste_val(pgste) &= ~RCP_IN_BIT; 809 pgste_val(pgste) &= ~PGSTE_IN_BIT;
813 gmap_do_ipte_notify(mm, addr, ptep); 810 gmap_do_ipte_notify(mm, addr, ptep);
814 } 811 }
815#endif 812#endif
@@ -977,8 +974,8 @@ static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm,
977 if (mm_has_pgste(mm)) { 974 if (mm_has_pgste(mm)) {
978 pgste = pgste_get_lock(ptep); 975 pgste = pgste_get_lock(ptep);
979 pgste = pgste_update_all(ptep, pgste); 976 pgste = pgste_update_all(ptep, pgste);
980 dirty = !!(pgste_val(pgste) & KVM_UC_BIT); 977 dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
981 pgste_val(pgste) &= ~KVM_UC_BIT; 978 pgste_val(pgste) &= ~PGSTE_UC_BIT;
982 pgste_set_unlock(ptep, pgste); 979 pgste_set_unlock(ptep, pgste);
983 return dirty; 980 return dirty;
984 } 981 }
@@ -997,8 +994,8 @@ static inline int ptep_test_and_clear_user_young(struct mm_struct *mm,
997 if (mm_has_pgste(mm)) { 994 if (mm_has_pgste(mm)) {
998 pgste = pgste_get_lock(ptep); 995 pgste = pgste_get_lock(ptep);
999 pgste = pgste_update_young(ptep, pgste); 996 pgste = pgste_update_young(ptep, pgste);
1000 young = !!(pgste_val(pgste) & KVM_UR_BIT); 997 young = !!(pgste_val(pgste) & PGSTE_UR_BIT);
1001 pgste_val(pgste) &= ~KVM_UR_BIT; 998 pgste_val(pgste) &= ~PGSTE_UR_BIT;
1002 pgste_set_unlock(ptep, pgste); 999 pgste_set_unlock(ptep, pgste);
1003 } 1000 }
1004 return young; 1001 return young;
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index d6de844bc30a..2416138ebd3e 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
7#define ASM_OFFSETS_C 7#define ASM_OFFSETS_C
8 8
9#include <linux/kbuild.h> 9#include <linux/kbuild.h>
10#include <linux/kvm_host.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <asm/cputime.h> 12#include <asm/cputime.h>
12#include <asm/vdso.h> 13#include <asm/vdso.h>
@@ -162,6 +163,8 @@ int main(void)
162 DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb)); 163 DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb));
163 DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); 164 DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb));
164 DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce)); 165 DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce));
166 DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c));
167 DEFINE(__SIE_PROG20, offsetof(struct kvm_s390_sie_block, prog20));
165#endif /* CONFIG_32BIT */ 168#endif /* CONFIG_32BIT */
166 return 0; 169 return 0;
167} 170}
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index bc5864c5148b..1c039d0c24c7 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -47,7 +47,6 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
47 _TIF_MCCK_PENDING) 47 _TIF_MCCK_PENDING)
48_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ 48_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
49 _TIF_SYSCALL_TRACEPOINT) 49 _TIF_SYSCALL_TRACEPOINT)
50_TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
51 50
52#define BASED(name) name-system_call(%r13) 51#define BASED(name) name-system_call(%r13)
53 52
@@ -81,23 +80,27 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
81#endif 80#endif
82 .endm 81 .endm
83 82
84 .macro HANDLE_SIE_INTERCEPT scratch,pgmcheck 83 .macro HANDLE_SIE_INTERCEPT scratch,reason
85#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) 84#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
86 tmhh %r8,0x0001 # interrupting from user ? 85 tmhh %r8,0x0001 # interrupting from user ?
87 jnz .+42 86 jnz .+62
88 lgr \scratch,%r9 87 lgr \scratch,%r9
89 slg \scratch,BASED(.Lsie_loop) 88 slg \scratch,BASED(.Lsie_critical)
90 clg \scratch,BASED(.Lsie_length) 89 clg \scratch,BASED(.Lsie_critical_length)
91 .if \pgmcheck 90 .if \reason==1
92 # Some program interrupts are suppressing (e.g. protection). 91 # Some program interrupts are suppressing (e.g. protection).
93 # We must also check the instruction after SIE in that case. 92 # We must also check the instruction after SIE in that case.
94 # do_protection_exception will rewind to rewind_pad 93 # do_protection_exception will rewind to rewind_pad
95 jh .+22 94 jh .+42
96 .else 95 .else
97 jhe .+22 96 jhe .+42
98 .endif 97 .endif
99 lg %r9,BASED(.Lsie_loop) 98 lg %r14,__SF_EMPTY(%r15) # get control block pointer
100 LPP BASED(.Lhost_id) # set host id 99 LPP __SF_EMPTY+16(%r15) # set host id
100 ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
101 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
102 larl %r9,sie_exit # skip forward to sie_exit
103 mvi __SF_EMPTY+31(%r15),\reason # set exit reason
101#endif 104#endif
102 .endm 105 .endm
103 106
@@ -450,7 +453,7 @@ ENTRY(io_int_handler)
450 lg %r12,__LC_THREAD_INFO 453 lg %r12,__LC_THREAD_INFO
451 larl %r13,system_call 454 larl %r13,system_call
452 lmg %r8,%r9,__LC_IO_OLD_PSW 455 lmg %r8,%r9,__LC_IO_OLD_PSW
453 HANDLE_SIE_INTERCEPT %r14,0 456 HANDLE_SIE_INTERCEPT %r14,2
454 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT 457 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
455 tmhh %r8,0x0001 # interrupting from user? 458 tmhh %r8,0x0001 # interrupting from user?
456 jz io_skip 459 jz io_skip
@@ -603,7 +606,7 @@ ENTRY(ext_int_handler)
603 lg %r12,__LC_THREAD_INFO 606 lg %r12,__LC_THREAD_INFO
604 larl %r13,system_call 607 larl %r13,system_call
605 lmg %r8,%r9,__LC_EXT_OLD_PSW 608 lmg %r8,%r9,__LC_EXT_OLD_PSW
606 HANDLE_SIE_INTERCEPT %r14,0 609 HANDLE_SIE_INTERCEPT %r14,3
607 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT 610 SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT
608 tmhh %r8,0x0001 # interrupting from user ? 611 tmhh %r8,0x0001 # interrupting from user ?
609 jz ext_skip 612 jz ext_skip
@@ -651,7 +654,7 @@ ENTRY(mcck_int_handler)
651 lg %r12,__LC_THREAD_INFO 654 lg %r12,__LC_THREAD_INFO
652 larl %r13,system_call 655 larl %r13,system_call
653 lmg %r8,%r9,__LC_MCK_OLD_PSW 656 lmg %r8,%r9,__LC_MCK_OLD_PSW
654 HANDLE_SIE_INTERCEPT %r14,0 657 HANDLE_SIE_INTERCEPT %r14,4
655 tm __LC_MCCK_CODE,0x80 # system damage? 658 tm __LC_MCCK_CODE,0x80 # system damage?
656 jo mcck_panic # yes -> rest of mcck code invalid 659 jo mcck_panic # yes -> rest of mcck code invalid
657 lghi %r14,__LC_CPU_TIMER_SAVE_AREA 660 lghi %r14,__LC_CPU_TIMER_SAVE_AREA
@@ -945,56 +948,50 @@ ENTRY(sie64a)
945 stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers 948 stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
946 stg %r2,__SF_EMPTY(%r15) # save control block pointer 949 stg %r2,__SF_EMPTY(%r15) # save control block pointer
947 stg %r3,__SF_EMPTY+8(%r15) # save guest register save area 950 stg %r3,__SF_EMPTY+8(%r15) # save guest register save area
948 xc __SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # host id == 0 951 xc __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
949 lmg %r0,%r13,0(%r3) # load guest gprs 0-13 952 lmg %r0,%r13,0(%r3) # load guest gprs 0-13
950# some program checks are suppressing. C code (e.g. do_protection_exception)
951# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
952# instructions in the sie_loop should not cause program interrupts. So
 953# let's use a nop (47 00 00 00) as a landing pad.
954# See also HANDLE_SIE_INTERCEPT
955rewind_pad:
956 nop 0
957sie_loop:
958 lg %r14,__LC_THREAD_INFO # pointer thread_info struct
959 tm __TI_flags+7(%r14),_TIF_EXIT_SIE
960 jnz sie_exit
961 lg %r14,__LC_GMAP # get gmap pointer 953 lg %r14,__LC_GMAP # get gmap pointer
962 ltgr %r14,%r14 954 ltgr %r14,%r14
963 jz sie_gmap 955 jz sie_gmap
964 lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce 956 lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce
965sie_gmap: 957sie_gmap:
966 lg %r14,__SF_EMPTY(%r15) # get control block pointer 958 lg %r14,__SF_EMPTY(%r15) # get control block pointer
959 oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
960 tm __SIE_PROG20+3(%r14),1 # last exit...
961 jnz sie_done
967 LPP __SF_EMPTY(%r15) # set guest id 962 LPP __SF_EMPTY(%r15) # set guest id
968 sie 0(%r14) 963 sie 0(%r14)
969sie_done: 964sie_done:
970 LPP __SF_EMPTY+16(%r15) # set host id 965 LPP __SF_EMPTY+16(%r15) # set host id
971 lg %r14,__LC_THREAD_INFO # pointer thread_info struct 966 ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
972sie_exit:
973 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce 967 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
968# some program checks are suppressing. C code (e.g. do_protection_exception)
969# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other
 970# instructions between sie64a and sie_done should not cause program
 971# interrupts. So let's use a nop (47 00 00 00) as a landing pad.
972# See also HANDLE_SIE_INTERCEPT
973rewind_pad:
974 nop 0
975 .globl sie_exit
976sie_exit:
974 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area 977 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area
975 stmg %r0,%r13,0(%r14) # save guest gprs 0-13 978 stmg %r0,%r13,0(%r14) # save guest gprs 0-13
976 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers 979 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
977 lghi %r2,0 980 lg %r2,__SF_EMPTY+24(%r15) # return exit reason code
978 br %r14 981 br %r14
979sie_fault: 982sie_fault:
980 lctlg %c1,%c1,__LC_USER_ASCE # load primary asce 983 lghi %r14,-EFAULT
981 lg %r14,__LC_THREAD_INFO # pointer thread_info struct 984 stg %r14,__SF_EMPTY+24(%r15) # set exit reason code
982 lg %r14,__SF_EMPTY+8(%r15) # load guest register save area 985 j sie_exit
983 stmg %r0,%r13,0(%r14) # save guest gprs 0-13
984 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
985 lghi %r2,-EFAULT
986 br %r14
987 986
988 .align 8 987 .align 8
989.Lsie_loop: 988.Lsie_critical:
990 .quad sie_loop 989 .quad sie_gmap
991.Lsie_length: 990.Lsie_critical_length:
992 .quad sie_done - sie_loop 991 .quad sie_done - sie_gmap
993.Lhost_id:
994 .quad 0
995 992
996 EX_TABLE(rewind_pad,sie_fault) 993 EX_TABLE(rewind_pad,sie_fault)
997 EX_TABLE(sie_loop,sie_fault) 994 EX_TABLE(sie_exit,sie_fault)
998#endif 995#endif
999 996
1000 .section .rodata, "a" 997 .section .rodata, "a"
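
For reference, the __SF_EMPTY scratch slots of the sie64a stack frame after this rework, reconstructed from the offsets used above. This is a sketch only; the kernel defines no such struct and the field names are invented:

	struct sie_frame_scratch {		/* hypothetical, for illustration */
		void *sie_control_block;	/* __SF_EMPTY+0,  stg %r2 on entry */
		void *guest_gpr_save_area;	/* __SF_EMPTY+8,  stg %r3 on entry */
		unsigned long host_id;		/* __SF_EMPTY+16, zeroed by the xc */
		long exit_reason;		/* __SF_EMPTY+24; the mvi at +31
						 * writes its low byte (s390 is
						 * big-endian), and sie_exit
						 * returns it in %r2 */
	};

The reason values wired up above are 1 for program checks, 2, 3 and 4 for I/O, external and machine-check interrupts, plus -EFAULT stored by sie_fault.
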
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index f58f37f66824..a6fc037671b1 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/perf_event.h> 15#include <linux/perf_event.h>
16#include <linux/kvm_host.h>
16#include <linux/percpu.h> 17#include <linux/percpu.h>
17#include <linux/export.h> 18#include <linux/export.h>
18#include <asm/irq.h> 19#include <asm/irq.h>
@@ -39,6 +40,57 @@ int perf_num_counters(void)
39} 40}
40EXPORT_SYMBOL(perf_num_counters); 41EXPORT_SYMBOL(perf_num_counters);
41 42
43static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
44{
45 struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
46
47 if (!stack)
48 return NULL;
49
50 return (struct kvm_s390_sie_block *) stack->empty1[0];
51}
52
53static bool is_in_guest(struct pt_regs *regs)
54{
55 unsigned long ip = instruction_pointer(regs);
56
57 if (user_mode(regs))
58 return false;
59
60 return ip == (unsigned long) &sie_exit;
61}
62
63static unsigned long guest_is_user_mode(struct pt_regs *regs)
64{
65 return sie_block(regs)->gpsw.mask & PSW_MASK_PSTATE;
66}
67
68static unsigned long instruction_pointer_guest(struct pt_regs *regs)
69{
70 return sie_block(regs)->gpsw.addr & PSW_ADDR_INSN;
71}
72
73unsigned long perf_instruction_pointer(struct pt_regs *regs)
74{
75 return is_in_guest(regs) ? instruction_pointer_guest(regs)
76 : instruction_pointer(regs);
77}
78
79static unsigned long perf_misc_guest_flags(struct pt_regs *regs)
80{
81 return guest_is_user_mode(regs) ? PERF_RECORD_MISC_GUEST_USER
82 : PERF_RECORD_MISC_GUEST_KERNEL;
83}
84
85unsigned long perf_misc_flags(struct pt_regs *regs)
86{
87 if (is_in_guest(regs))
88 return perf_misc_guest_flags(regs);
89
90 return user_mode(regs) ? PERF_RECORD_MISC_USER
91 : PERF_RECORD_MISC_KERNEL;
92}
93
42void perf_event_print_debug(void) 94void perf_event_print_debug(void)
43{ 95{
44 struct cpumf_ctr_info cf_info; 96 struct cpumf_ctr_info cf_info;
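
These hooks work because, after the HANDLE_SIE_INTERCEPT change above, any interrupt taken inside SIE has its return PSW rewritten to sie_exit, so a kernel-mode sample whose IP equals that single address must have hit the guest. A compilable userspace model of the classification (names hypothetical):

	#include <stdbool.h>
	#include <stdio.h>

	enum ctx { HOST_USER, HOST_KERNEL, GUEST_USER, GUEST_KERNEL };

	static enum ctx classify(bool user_mode, bool ip_is_sie_exit,
				 bool guest_pstate)
	{
		if (!user_mode && ip_is_sie_exit)	/* cf. is_in_guest() */
			return guest_pstate ? GUEST_USER : GUEST_KERNEL;
		return user_mode ? HOST_USER : HOST_KERNEL;
	}

	int main(void)
	{
		printf("%d\n", classify(false, true, false));	/* 3 = GUEST_KERNEL */
		return 0;
	}
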
diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c
index 9bdbcef1da9e..3bac589844a7 100644
--- a/arch/s390/kernel/s390_ksyms.c
+++ b/arch/s390/kernel/s390_ksyms.c
@@ -7,6 +7,7 @@ EXPORT_SYMBOL(_mcount);
7#endif 7#endif
8#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) 8#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE)
9EXPORT_SYMBOL(sie64a); 9EXPORT_SYMBOL(sie64a);
10EXPORT_SYMBOL(sie_exit);
10#endif 11#endif
11EXPORT_SYMBOL(memcpy); 12EXPORT_SYMBOL(memcpy);
12EXPORT_SYMBOL(memset); 13EXPORT_SYMBOL(memset);
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 8fe9d65a4585..40b4c6470f88 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -6,7 +6,8 @@
6# it under the terms of the GNU General Public License (version 2 only) 6# it under the terms of the GNU General Public License (version 2 only)
7# as published by the Free Software Foundation. 7# as published by the Free Software Foundation.
8 8
9common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o) 9KVM := ../../../virt/kvm
10common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
10 11
11ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 12ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
12 13
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1c01a9912989..3074475c8ae0 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -132,6 +132,9 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
132{ 132{
133 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; 133 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
134 134
135 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
136 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
137
135 trace_kvm_s390_handle_diag(vcpu, code); 138 trace_kvm_s390_handle_diag(vcpu, code);
136 switch (code) { 139 switch (code) {
137 case 0x10: 140 case 0x10:
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index b7d1b2edeeb3..5ee56e5acc23 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -22,87 +22,6 @@
22#include "trace.h" 22#include "trace.h"
23#include "trace-s390.h" 23#include "trace-s390.h"
24 24
25static int handle_lctlg(struct kvm_vcpu *vcpu)
26{
27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
29 u64 useraddr;
30 int reg, rc;
31
32 vcpu->stat.instruction_lctlg++;
33
34 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
35
36 if (useraddr & 7)
37 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
38
39 reg = reg1;
40
41 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
42 useraddr);
43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
44
45 do {
46 rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
47 (u64 __user *) useraddr);
48 if (rc)
49 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
50 useraddr += 8;
51 if (reg == reg3)
52 break;
53 reg = (reg + 1) % 16;
54 } while (1);
55 return 0;
56}
57
58static int handle_lctl(struct kvm_vcpu *vcpu)
59{
60 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
61 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
62 u64 useraddr;
63 u32 val = 0;
64 int reg, rc;
65
66 vcpu->stat.instruction_lctl++;
67
68 useraddr = kvm_s390_get_base_disp_rs(vcpu);
69
70 if (useraddr & 3)
71 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
72
73 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
74 useraddr);
75 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
76
77 reg = reg1;
78 do {
79 rc = get_guest(vcpu, val, (u32 __user *) useraddr);
80 if (rc)
81 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
82 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
83 vcpu->arch.sie_block->gcr[reg] |= val;
84 useraddr += 4;
85 if (reg == reg3)
86 break;
87 reg = (reg + 1) % 16;
88 } while (1);
89 return 0;
90}
91
92static const intercept_handler_t eb_handlers[256] = {
93 [0x2f] = handle_lctlg,
94 [0x8a] = kvm_s390_handle_priv_eb,
95};
96
97static int handle_eb(struct kvm_vcpu *vcpu)
98{
99 intercept_handler_t handler;
100
101 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
102 if (handler)
103 return handler(vcpu);
104 return -EOPNOTSUPP;
105}
106 25
107static const intercept_handler_t instruction_handlers[256] = { 26static const intercept_handler_t instruction_handlers[256] = {
108 [0x01] = kvm_s390_handle_01, 27 [0x01] = kvm_s390_handle_01,
@@ -110,10 +29,10 @@ static const intercept_handler_t instruction_handlers[256] = {
110 [0x83] = kvm_s390_handle_diag, 29 [0x83] = kvm_s390_handle_diag,
111 [0xae] = kvm_s390_handle_sigp, 30 [0xae] = kvm_s390_handle_sigp,
112 [0xb2] = kvm_s390_handle_b2, 31 [0xb2] = kvm_s390_handle_b2,
113 [0xb7] = handle_lctl, 32 [0xb7] = kvm_s390_handle_lctl,
114 [0xb9] = kvm_s390_handle_b9, 33 [0xb9] = kvm_s390_handle_b9,
115 [0xe5] = kvm_s390_handle_e5, 34 [0xe5] = kvm_s390_handle_e5,
116 [0xeb] = handle_eb, 35 [0xeb] = kvm_s390_handle_eb,
117}; 36};
118 37
119static int handle_noop(struct kvm_vcpu *vcpu) 38static int handle_noop(struct kvm_vcpu *vcpu)
@@ -174,47 +93,12 @@ static int handle_stop(struct kvm_vcpu *vcpu)
174 93
175static int handle_validity(struct kvm_vcpu *vcpu) 94static int handle_validity(struct kvm_vcpu *vcpu)
176{ 95{
177 unsigned long vmaddr;
178 int viwhy = vcpu->arch.sie_block->ipb >> 16; 96 int viwhy = vcpu->arch.sie_block->ipb >> 16;
179 int rc;
180 97
181 vcpu->stat.exit_validity++; 98 vcpu->stat.exit_validity++;
182 trace_kvm_s390_intercept_validity(vcpu, viwhy); 99 trace_kvm_s390_intercept_validity(vcpu, viwhy);
183 if (viwhy == 0x37) { 100 WARN_ONCE(true, "kvm: unhandled validity intercept 0x%x\n", viwhy);
184 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix, 101 return -EOPNOTSUPP;
185 vcpu->arch.gmap);
186 if (IS_ERR_VALUE(vmaddr)) {
187 rc = -EOPNOTSUPP;
188 goto out;
189 }
190 rc = fault_in_pages_writeable((char __user *) vmaddr,
191 PAGE_SIZE);
192 if (rc) {
193 /* user will receive sigsegv, exit to user */
194 rc = -EOPNOTSUPP;
195 goto out;
196 }
197 vmaddr = gmap_fault(vcpu->arch.sie_block->prefix + PAGE_SIZE,
198 vcpu->arch.gmap);
199 if (IS_ERR_VALUE(vmaddr)) {
200 rc = -EOPNOTSUPP;
201 goto out;
202 }
203 rc = fault_in_pages_writeable((char __user *) vmaddr,
204 PAGE_SIZE);
205 if (rc) {
206 /* user will receive sigsegv, exit to user */
207 rc = -EOPNOTSUPP;
208 goto out;
209 }
210 } else
211 rc = -EOPNOTSUPP;
212
213out:
214 if (rc)
215 VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
216 viwhy);
217 return rc;
218} 102}
219 103
220static int handle_instruction(struct kvm_vcpu *vcpu) 104static int handle_instruction(struct kvm_vcpu *vcpu)
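
The removal moves handle_lctl/handle_lctlg into priv.c (further down in this diff) but keeps the dispatch scheme: the low opcode byte indexes a 256-entry designated-initializer table, and a NULL slot bounces the intercept to userspace. A self-contained sketch of the pattern:

	#include <stdio.h>

	typedef int (*intercept_handler_t)(int vcpu);

	static int handle_lctlg_sketch(int vcpu) { (void)vcpu; return 0; }

	static const intercept_handler_t eb_handlers[256] = {
		[0x2f] = handle_lctlg_sketch,	/* only set slots dispatch */
	};

	static int dispatch(int vcpu, unsigned char op)
	{
		intercept_handler_t h = eb_handlers[op];
		return h ? h(vcpu) : -95;	/* -EOPNOTSUPP: go to userspace */
	}

	int main(void)
	{
		printf("%d %d\n", dispatch(0, 0x2f), dispatch(0, 0x00)); /* 0 -95 */
		return 0;
	}
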
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5c948177529e..7f35cb33e510 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -438,7 +438,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
438no_timer: 438no_timer:
439 spin_lock(&vcpu->arch.local_int.float_int->lock); 439 spin_lock(&vcpu->arch.local_int.float_int->lock);
440 spin_lock_bh(&vcpu->arch.local_int.lock); 440 spin_lock_bh(&vcpu->arch.local_int.lock);
441 add_wait_queue(&vcpu->arch.local_int.wq, &wait); 441 add_wait_queue(&vcpu->wq, &wait);
442 while (list_empty(&vcpu->arch.local_int.list) && 442 while (list_empty(&vcpu->arch.local_int.list) &&
443 list_empty(&vcpu->arch.local_int.float_int->list) && 443 list_empty(&vcpu->arch.local_int.float_int->list) &&
444 (!vcpu->arch.local_int.timer_due) && 444 (!vcpu->arch.local_int.timer_due) &&
@@ -452,7 +452,7 @@ no_timer:
452 } 452 }
453 __unset_cpu_idle(vcpu); 453 __unset_cpu_idle(vcpu);
454 __set_current_state(TASK_RUNNING); 454 __set_current_state(TASK_RUNNING);
455 remove_wait_queue(&vcpu->arch.local_int.wq, &wait); 455 remove_wait_queue(&vcpu->wq, &wait);
456 spin_unlock_bh(&vcpu->arch.local_int.lock); 456 spin_unlock_bh(&vcpu->arch.local_int.lock);
457 spin_unlock(&vcpu->arch.local_int.float_int->lock); 457 spin_unlock(&vcpu->arch.local_int.float_int->lock);
458 hrtimer_try_to_cancel(&vcpu->arch.ckc_timer); 458 hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
@@ -465,8 +465,8 @@ void kvm_s390_tasklet(unsigned long parm)
465 465
466 spin_lock(&vcpu->arch.local_int.lock); 466 spin_lock(&vcpu->arch.local_int.lock);
467 vcpu->arch.local_int.timer_due = 1; 467 vcpu->arch.local_int.timer_due = 1;
468 if (waitqueue_active(&vcpu->arch.local_int.wq)) 468 if (waitqueue_active(&vcpu->wq))
469 wake_up_interruptible(&vcpu->arch.local_int.wq); 469 wake_up_interruptible(&vcpu->wq);
470 spin_unlock(&vcpu->arch.local_int.lock); 470 spin_unlock(&vcpu->arch.local_int.lock);
471} 471}
472 472
@@ -613,7 +613,7 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
613 spin_lock_bh(&li->lock); 613 spin_lock_bh(&li->lock);
614 list_add(&inti->list, &li->list); 614 list_add(&inti->list, &li->list);
615 atomic_set(&li->active, 1); 615 atomic_set(&li->active, 1);
616 BUG_ON(waitqueue_active(&li->wq)); 616 BUG_ON(waitqueue_active(li->wq));
617 spin_unlock_bh(&li->lock); 617 spin_unlock_bh(&li->lock);
618 return 0; 618 return 0;
619} 619}
@@ -746,8 +746,8 @@ int kvm_s390_inject_vm(struct kvm *kvm,
746 li = fi->local_int[sigcpu]; 746 li = fi->local_int[sigcpu];
747 spin_lock_bh(&li->lock); 747 spin_lock_bh(&li->lock);
748 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 748 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
749 if (waitqueue_active(&li->wq)) 749 if (waitqueue_active(li->wq))
750 wake_up_interruptible(&li->wq); 750 wake_up_interruptible(li->wq);
751 spin_unlock_bh(&li->lock); 751 spin_unlock_bh(&li->lock);
752 spin_unlock(&fi->lock); 752 spin_unlock(&fi->lock);
753 mutex_unlock(&kvm->lock); 753 mutex_unlock(&kvm->lock);
@@ -832,8 +832,8 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
832 if (inti->type == KVM_S390_SIGP_STOP) 832 if (inti->type == KVM_S390_SIGP_STOP)
833 li->action_bits |= ACTION_STOP_ON_STOP; 833 li->action_bits |= ACTION_STOP_ON_STOP;
834 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 834 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
835 if (waitqueue_active(&li->wq)) 835 if (waitqueue_active(&vcpu->wq))
836 wake_up_interruptible(&vcpu->arch.local_int.wq); 836 wake_up_interruptible(&vcpu->wq);
837 spin_unlock_bh(&li->lock); 837 spin_unlock_bh(&li->lock);
838 mutex_unlock(&vcpu->kvm->lock); 838 mutex_unlock(&vcpu->kvm->lock);
839 return 0; 839 return 0;
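
Every waitqueue site above switches from a private wait_queue_head_t embedded in local_int to a pointer that, per the kvm-s390.c hunk below, aliases the generic vcpu->wq, so common KVM code and the s390 interrupt paths now wake one and the same queue. A tiny model of the aliasing:

	#include <stdio.h>

	struct vcpu      { int wq; };	/* stands in for vcpu->wq */
	struct local_int { int *wq; };	/* now a pointer, not its own queue */

	int main(void)
	{
		struct vcpu vcpu = { .wq = 0 };
		struct local_int li = { .wq = &vcpu.wq };

		*li.wq = 1;			/* wakeup via the s390 path... */
		printf("%d\n", vcpu.wq);	/* ...lands on the common queue */
		return 0;
	}
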
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index c1c7c683fa26..ba694d2ba51e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -59,6 +59,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
59 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, 59 { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
60 { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, 60 { "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
61 { "exit_wait_state", VCPU_STAT(exit_wait_state) }, 61 { "exit_wait_state", VCPU_STAT(exit_wait_state) },
62 { "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
62 { "instruction_stidp", VCPU_STAT(instruction_stidp) }, 63 { "instruction_stidp", VCPU_STAT(instruction_stidp) },
63 { "instruction_spx", VCPU_STAT(instruction_spx) }, 64 { "instruction_spx", VCPU_STAT(instruction_spx) },
64 { "instruction_stpx", VCPU_STAT(instruction_stpx) }, 65 { "instruction_stpx", VCPU_STAT(instruction_stpx) },
@@ -84,6 +85,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
84}; 85};
85 86
86static unsigned long long *facilities; 87static unsigned long long *facilities;
88static struct gmap_notifier gmap_notifier;
87 89
88/* Section: not file related */ 90/* Section: not file related */
89int kvm_arch_hardware_enable(void *garbage) 91int kvm_arch_hardware_enable(void *garbage)
@@ -96,13 +98,18 @@ void kvm_arch_hardware_disable(void *garbage)
96{ 98{
97} 99}
98 100
101static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
102
99int kvm_arch_hardware_setup(void) 103int kvm_arch_hardware_setup(void)
100{ 104{
105 gmap_notifier.notifier_call = kvm_gmap_notifier;
106 gmap_register_ipte_notifier(&gmap_notifier);
101 return 0; 107 return 0;
102} 108}
103 109
104void kvm_arch_hardware_unsetup(void) 110void kvm_arch_hardware_unsetup(void)
105{ 111{
112 gmap_unregister_ipte_notifier(&gmap_notifier);
106} 113}
107 114
108void kvm_arch_check_processor_compat(void *rtn) 115void kvm_arch_check_processor_compat(void *rtn)
@@ -239,6 +246,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
239 kvm->arch.gmap = gmap_alloc(current->mm); 246 kvm->arch.gmap = gmap_alloc(current->mm);
240 if (!kvm->arch.gmap) 247 if (!kvm->arch.gmap)
241 goto out_nogmap; 248 goto out_nogmap;
249 kvm->arch.gmap->private = kvm;
242 } 250 }
243 251
244 kvm->arch.css_support = 0; 252 kvm->arch.css_support = 0;
@@ -270,7 +278,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
270 278
271 free_page((unsigned long)(vcpu->arch.sie_block)); 279 free_page((unsigned long)(vcpu->arch.sie_block));
272 kvm_vcpu_uninit(vcpu); 280 kvm_vcpu_uninit(vcpu);
273 kfree(vcpu); 281 kmem_cache_free(kvm_vcpu_cache, vcpu);
274} 282}
275 283
276static void kvm_free_vcpus(struct kvm *kvm) 284static void kvm_free_vcpus(struct kvm *kvm)
@@ -309,6 +317,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
309 vcpu->arch.gmap = gmap_alloc(current->mm); 317 vcpu->arch.gmap = gmap_alloc(current->mm);
310 if (!vcpu->arch.gmap) 318 if (!vcpu->arch.gmap)
311 return -ENOMEM; 319 return -ENOMEM;
320 vcpu->arch.gmap->private = vcpu->kvm;
312 return 0; 321 return 0;
313 } 322 }
314 323
@@ -373,8 +382,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
373{ 382{
374 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | 383 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
375 CPUSTAT_SM | 384 CPUSTAT_SM |
376 CPUSTAT_STOPPED); 385 CPUSTAT_STOPPED |
386 CPUSTAT_GED);
377 vcpu->arch.sie_block->ecb = 6; 387 vcpu->arch.sie_block->ecb = 6;
388 vcpu->arch.sie_block->ecb2 = 8;
378 vcpu->arch.sie_block->eca = 0xC1002001U; 389 vcpu->arch.sie_block->eca = 0xC1002001U;
379 vcpu->arch.sie_block->fac = (int) (long) facilities; 390 vcpu->arch.sie_block->fac = (int) (long) facilities;
380 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 391 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -397,7 +408,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
397 408
398 rc = -ENOMEM; 409 rc = -ENOMEM;
399 410
400 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 411 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
401 if (!vcpu) 412 if (!vcpu)
402 goto out; 413 goto out;
403 414
@@ -427,7 +438,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
427 vcpu->arch.local_int.float_int = &kvm->arch.float_int; 438 vcpu->arch.local_int.float_int = &kvm->arch.float_int;
428 spin_lock(&kvm->arch.float_int.lock); 439 spin_lock(&kvm->arch.float_int.lock);
429 kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int; 440 kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
430 init_waitqueue_head(&vcpu->arch.local_int.wq); 441 vcpu->arch.local_int.wq = &vcpu->wq;
431 vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; 442 vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
432 spin_unlock(&kvm->arch.float_int.lock); 443 spin_unlock(&kvm->arch.float_int.lock);
433 444
@@ -442,7 +453,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
442out_free_sie_block: 453out_free_sie_block:
443 free_page((unsigned long)(vcpu->arch.sie_block)); 454 free_page((unsigned long)(vcpu->arch.sie_block));
444out_free_cpu: 455out_free_cpu:
445 kfree(vcpu); 456 kmem_cache_free(kvm_vcpu_cache, vcpu);
446out: 457out:
447 return ERR_PTR(rc); 458 return ERR_PTR(rc);
448} 459}
@@ -454,6 +465,50 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
454 return 0; 465 return 0;
455} 466}
456 467
468void s390_vcpu_block(struct kvm_vcpu *vcpu)
469{
470 atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
471}
472
473void s390_vcpu_unblock(struct kvm_vcpu *vcpu)
474{
475 atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
476}
477
478/*
479 * Kick a guest cpu out of SIE and wait until SIE is not running.
480 * If the CPU is not running (e.g. waiting as idle) the function will
481 * return immediately. */
482void exit_sie(struct kvm_vcpu *vcpu)
483{
484 atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
485 while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
486 cpu_relax();
487}
488
489/* Kick a guest cpu out of SIE and prevent SIE-reentry */
490void exit_sie_sync(struct kvm_vcpu *vcpu)
491{
492 s390_vcpu_block(vcpu);
493 exit_sie(vcpu);
494}
495
496static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
497{
498 int i;
499 struct kvm *kvm = gmap->private;
500 struct kvm_vcpu *vcpu;
501
502 kvm_for_each_vcpu(i, vcpu, kvm) {
503 /* match against both prefix pages */
504 if (vcpu->arch.sie_block->prefix == (address & ~0x1000UL)) {
505 VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
506 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
507 exit_sie_sync(vcpu);
508 }
509 }
510}
511
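
kvm_gmap_notifier masks off 0x1000 because the prefix area spans two consecutive 4K pages at an 8K-aligned address (kvm_s390_set_prefix below masks with 0x7fffe000); clearing bit 12 folds either page of the pair onto the prefix base. A worked example:

	#include <stdio.h>

	int main(void)
	{
		unsigned long prefix = 0x20000;	/* 8K-aligned prefix base */
		unsigned long pages[] = { 0x20000, 0x21000, 0x22000 };

		for (int i = 0; i < 3; i++)
			printf("%#lx matches: %d\n", pages[i],
			       (pages[i] & ~0x1000UL) == prefix);	/* 1, 1, 0 */
		return 0;
	}
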
457int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) 512int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
458{ 513{
459 /* kvm common code refers to this, but never calls it */ 514 /* kvm common code refers to this, but never calls it */
@@ -606,6 +661,27 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
606 return -EINVAL; /* not implemented yet */ 661 return -EINVAL; /* not implemented yet */
607} 662}
608 663
664static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
665{
666 /*
667 * We use MMU_RELOAD just to re-arm the ipte notifier for the
668 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
669 * This ensures that the ipte instruction for this request has
670 * already finished. We might race against a second unmapper that
 671 * wants to set the blocking bit. Let's just retry the request loop.
672 */
673 while (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
674 int rc;
675 rc = gmap_ipte_notify(vcpu->arch.gmap,
676 vcpu->arch.sie_block->prefix,
677 PAGE_SIZE * 2);
678 if (rc)
679 return rc;
680 s390_vcpu_unblock(vcpu);
681 }
682 return 0;
683}
684
609static int __vcpu_run(struct kvm_vcpu *vcpu) 685static int __vcpu_run(struct kvm_vcpu *vcpu)
610{ 686{
611 int rc; 687 int rc;
@@ -621,6 +697,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
621 if (!kvm_is_ucontrol(vcpu->kvm)) 697 if (!kvm_is_ucontrol(vcpu->kvm))
622 kvm_s390_deliver_pending_interrupts(vcpu); 698 kvm_s390_deliver_pending_interrupts(vcpu);
623 699
700 rc = kvm_s390_handle_requests(vcpu);
701 if (rc)
702 return rc;
703
624 vcpu->arch.sie_block->icptcode = 0; 704 vcpu->arch.sie_block->icptcode = 0;
625 preempt_disable(); 705 preempt_disable();
626 kvm_guest_enter(); 706 kvm_guest_enter();
@@ -630,7 +710,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
630 trace_kvm_s390_sie_enter(vcpu, 710 trace_kvm_s390_sie_enter(vcpu,
631 atomic_read(&vcpu->arch.sie_block->cpuflags)); 711 atomic_read(&vcpu->arch.sie_block->cpuflags));
632 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); 712 rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
633 if (rc) { 713 if (rc > 0)
714 rc = 0;
715 if (rc < 0) {
634 if (kvm_is_ucontrol(vcpu->kvm)) { 716 if (kvm_is_ucontrol(vcpu->kvm)) {
635 rc = SIE_INTERCEPT_UCONTROL; 717 rc = SIE_INTERCEPT_UCONTROL;
636 } else { 718 } else {
@@ -1046,7 +1128,7 @@ static int __init kvm_s390_init(void)
1046 return -ENOMEM; 1128 return -ENOMEM;
1047 } 1129 }
1048 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 1130 memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
1049 facilities[0] &= 0xff00fff3f47c0000ULL; 1131 facilities[0] &= 0xff82fff3f47c0000ULL;
1050 facilities[1] &= 0x001c000000000000ULL; 1132 facilities[1] &= 0x001c000000000000ULL;
1051 return 0; 1133 return 0;
1052} 1134}
@@ -1059,3 +1141,12 @@ static void __exit kvm_s390_exit(void)
1059 1141
1060module_init(kvm_s390_init); 1142module_init(kvm_s390_init);
1061module_exit(kvm_s390_exit); 1143module_exit(kvm_s390_exit);
1144
1145/*
1146 * Enable autoloading of the kvm module.
1147 * Note that we add the module alias here instead of virt/kvm/kvm_main.c
1148 * since x86 takes a different approach.
1149 */
1150#include <linux/miscdevice.h>
1151MODULE_ALIAS_MISCDEV(KVM_MINOR);
1152MODULE_ALIAS("devname:kvm");
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index efc14f687265..028ca9fd2158 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -63,6 +63,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
63{ 63{
64 vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u; 64 vcpu->arch.sie_block->prefix = prefix & 0x7fffe000u;
65 vcpu->arch.sie_block->ihcpu = 0xffff; 65 vcpu->arch.sie_block->ihcpu = 0xffff;
66 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
66} 67}
67 68
68static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu) 69static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
@@ -85,6 +86,12 @@ static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
85 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2; 86 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
86} 87}
87 88
89static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2)
90{
91 *r1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 20;
92 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
93}
94
88static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu) 95static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
89{ 96{
90 u32 base2 = vcpu->arch.sie_block->ipb >> 28; 97 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
@@ -125,7 +132,8 @@ int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
125int kvm_s390_handle_01(struct kvm_vcpu *vcpu); 132int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
126int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); 133int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
127int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); 134int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
128int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu); 135int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
136int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
129 137
130/* implemented in sigp.c */ 138/* implemented in sigp.c */
131int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 139int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -133,6 +141,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
133/* implemented in kvm-s390.c */ 141/* implemented in kvm-s390.c */
134int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 142int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
135 unsigned long addr); 143 unsigned long addr);
144void s390_vcpu_block(struct kvm_vcpu *vcpu);
145void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
146void exit_sie(struct kvm_vcpu *vcpu);
147void exit_sie_sync(struct kvm_vcpu *vcpu);
136/* implemented in diag.c */ 148/* implemented in diag.c */
137int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); 149int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
138 150
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 6bbd7b5a0bbe..0da3e6eb6be6 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * handling privileged instructions 2 * handling privileged instructions
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008, 2013
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -20,6 +20,9 @@
20#include <asm/debug.h> 20#include <asm/debug.h>
21#include <asm/ebcdic.h> 21#include <asm/ebcdic.h>
22#include <asm/sysinfo.h> 22#include <asm/sysinfo.h>
23#include <asm/pgtable.h>
24#include <asm/pgalloc.h>
25#include <asm/io.h>
23#include <asm/ptrace.h> 26#include <asm/ptrace.h>
24#include <asm/compat.h> 27#include <asm/compat.h>
25#include "gaccess.h" 28#include "gaccess.h"
@@ -34,6 +37,9 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
34 37
35 vcpu->stat.instruction_spx++; 38 vcpu->stat.instruction_spx++;
36 39
40 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
41 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
42
37 operand2 = kvm_s390_get_base_disp_s(vcpu); 43 operand2 = kvm_s390_get_base_disp_s(vcpu);
38 44
39 /* must be word boundary */ 45 /* must be word boundary */
@@ -65,6 +71,9 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
65 71
66 vcpu->stat.instruction_stpx++; 72 vcpu->stat.instruction_stpx++;
67 73
74 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
75 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
76
68 operand2 = kvm_s390_get_base_disp_s(vcpu); 77 operand2 = kvm_s390_get_base_disp_s(vcpu);
69 78
70 /* must be word boundary */ 79 /* must be word boundary */
@@ -89,6 +98,9 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
89 98
90 vcpu->stat.instruction_stap++; 99 vcpu->stat.instruction_stap++;
91 100
101 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
102 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
103
92 useraddr = kvm_s390_get_base_disp_s(vcpu); 104 useraddr = kvm_s390_get_base_disp_s(vcpu);
93 105
94 if (useraddr & 1) 106 if (useraddr & 1)
@@ -105,7 +117,12 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
105static int handle_skey(struct kvm_vcpu *vcpu) 117static int handle_skey(struct kvm_vcpu *vcpu)
106{ 118{
107 vcpu->stat.instruction_storage_key++; 119 vcpu->stat.instruction_storage_key++;
108 vcpu->arch.sie_block->gpsw.addr -= 4; 120
121 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
122 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
123
124 vcpu->arch.sie_block->gpsw.addr =
125 __rewind_psw(vcpu->arch.sie_block->gpsw, 4);
109 VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); 126 VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
110 return 0; 127 return 0;
111} 128}
@@ -129,9 +146,10 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
129 * Store the two-word I/O interruption code into the 146 * Store the two-word I/O interruption code into the
130 * provided area. 147 * provided area.
131 */ 148 */
132 put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr); 149 if (put_guest(vcpu, inti->io.subchannel_id, (u16 __user *)addr)
133 put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2)); 150 || put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *)(addr + 2))
134 put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4)); 151 || put_guest(vcpu, inti->io.io_int_parm, (u32 __user *)(addr + 4)))
152 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
135 } else { 153 } else {
136 /* 154 /*
137 * Store the three-word I/O interruption code into 155 * Store the three-word I/O interruption code into
@@ -182,6 +200,9 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
182{ 200{
183 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction"); 201 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
184 202
203 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
204 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
205
185 if (vcpu->kvm->arch.css_support) { 206 if (vcpu->kvm->arch.css_support) {
186 /* 207 /*
187 * Most I/O instructions will be handled by userspace. 208 * Most I/O instructions will be handled by userspace.
@@ -210,8 +231,12 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
210 int rc; 231 int rc;
211 232
212 vcpu->stat.instruction_stfl++; 233 vcpu->stat.instruction_stfl++;
234
235 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
236 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
237
213 /* only pass the facility bits, which we can handle */ 238 /* only pass the facility bits, which we can handle */
214 facility_list = S390_lowcore.stfl_fac_list & 0xff00fff3; 239 facility_list = S390_lowcore.stfl_fac_list & 0xff82fff3;
215 240
216 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 241 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
217 &facility_list, sizeof(facility_list)); 242 &facility_list, sizeof(facility_list));
@@ -255,8 +280,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
255 u64 addr; 280 u64 addr;
256 281
257 if (gpsw->mask & PSW_MASK_PSTATE) 282 if (gpsw->mask & PSW_MASK_PSTATE)
258 return kvm_s390_inject_program_int(vcpu, 283 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
259 PGM_PRIVILEGED_OPERATION); 284
260 addr = kvm_s390_get_base_disp_s(vcpu); 285 addr = kvm_s390_get_base_disp_s(vcpu);
261 if (addr & 7) 286 if (addr & 7)
262 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 287 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -278,6 +303,9 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
278 psw_t new_psw; 303 psw_t new_psw;
279 u64 addr; 304 u64 addr;
280 305
306 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
307 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
308
281 addr = kvm_s390_get_base_disp_s(vcpu); 309 addr = kvm_s390_get_base_disp_s(vcpu);
282 if (addr & 7) 310 if (addr & 7)
283 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 311 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -296,6 +324,9 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
296 324
297 vcpu->stat.instruction_stidp++; 325 vcpu->stat.instruction_stidp++;
298 326
327 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
328 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
329
299 operand2 = kvm_s390_get_base_disp_s(vcpu); 330 operand2 = kvm_s390_get_base_disp_s(vcpu);
300 331
301 if (operand2 & 7) 332 if (operand2 & 7)
@@ -351,16 +382,30 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
351 vcpu->stat.instruction_stsi++; 382 vcpu->stat.instruction_stsi++;
352 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); 383 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
353 384
354 operand2 = kvm_s390_get_base_disp_s(vcpu); 385 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
386 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
387
388 if (fc > 3) {
389 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; /* cc 3 */
390 return 0;
391 }
355 392
356 if (operand2 & 0xfff && fc > 0) 393 if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
394 || vcpu->run->s.regs.gprs[1] & 0xffff0000)
357 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 395 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
358 396
359 switch (fc) { 397 if (fc == 0) {
360 case 0:
361 vcpu->run->s.regs.gprs[0] = 3 << 28; 398 vcpu->run->s.regs.gprs[0] = 3 << 28;
362 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 399 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); /* cc 0 */
363 return 0; 400 return 0;
401 }
402
403 operand2 = kvm_s390_get_base_disp_s(vcpu);
404
405 if (operand2 & 0xfff)
406 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
407
408 switch (fc) {
364 case 1: /* same handling for 1 and 2 */ 409 case 1: /* same handling for 1 and 2 */
365 case 2: 410 case 2:
366 mem = get_zeroed_page(GFP_KERNEL); 411 mem = get_zeroed_page(GFP_KERNEL);
@@ -377,8 +422,6 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
377 goto out_no_data; 422 goto out_no_data;
378 handle_stsi_3_2_2(vcpu, (void *) mem); 423 handle_stsi_3_2_2(vcpu, (void *) mem);
379 break; 424 break;
380 default:
381 goto out_no_data;
382 } 425 }
383 426
384 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { 427 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
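
handle_stsi now sets the guest condition code by manipulating the PSW mask directly: the cc lives in bits 18-19 of the 64-bit PSW, i.e. at value cc << 44 counted from the least significant bit. A compilable check of that arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned long mask = 0;

		mask |= 3ul << 44;			/* cc 3: fc out of range */
		printf("cc=%lu\n", (mask >> 44) & 3);	/* cc=3 */
		mask &= ~(3ul << 44);			/* cc 0: success */
		printf("cc=%lu\n", (mask >> 44) & 3);	/* cc=0 */
		return 0;
	}
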
@@ -432,20 +475,14 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
432 intercept_handler_t handler; 475 intercept_handler_t handler;
433 476
434 /* 477 /*
 435 * a lot of B2 instructions are privileged. We first check for 478 * A lot of B2 instructions are privileged. Here we check for
 436 * the privileged ones that we can handle in the kernel. If the 479 * the privileged ones that we can handle in the kernel.
437 * kernel can handle this instruction, we check for the problem 480 * Anything else goes to userspace.
438 * state bit and (a) handle the instruction or (b) send a code 2 481 */
439 * program check.
440 * Anything else goes to userspace.*/
441 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 482 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
442 if (handler) { 483 if (handler)
443 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 484 return handler(vcpu);
444 return kvm_s390_inject_program_int(vcpu, 485
445 PGM_PRIVILEGED_OPERATION);
446 else
447 return handler(vcpu);
448 }
449 return -EOPNOTSUPP; 486 return -EOPNOTSUPP;
450} 487}
451 488
@@ -453,8 +490,7 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
453{ 490{
454 int reg1, reg2; 491 int reg1, reg2;
455 492
456 reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24; 493 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
457 reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
458 494
459 /* This basically extracts the mask half of the psw. */ 495 /* This basically extracts the mask half of the psw. */
460 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000; 496 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
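
Besides deduplicating, the switch to kvm_s390_get_regs_rre (added in kvm-s390.h above) fixes the shift: the old `>> 24` applied to a value already masked with 0x00f00000 can only yield 0 for reg1. A stand-alone check of the corrected extraction:

	#include <stdint.h>
	#include <stdio.h>

	static void get_regs_rre(uint32_t ipb, int *r1, int *r2)
	{
		*r1 = (ipb & 0x00f00000) >> 20;	/* field the old >> 24 lost */
		*r2 = (ipb & 0x000f0000) >> 16;
	}

	int main(void)
	{
		int r1, r2;

		get_regs_rre(0x00340000, &r1, &r2);	/* hypothetical encoding */
		printf("r1=%d r2=%d\n", r1, r2);	/* prints r1=3 r2=4 */
		return 0;
	}
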
@@ -467,9 +503,88 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
467 return 0; 503 return 0;
468} 504}
469 505
506#define PFMF_RESERVED 0xfffc0101UL
507#define PFMF_SK 0x00020000UL
508#define PFMF_CF 0x00010000UL
509#define PFMF_UI 0x00008000UL
510#define PFMF_FSC 0x00007000UL
511#define PFMF_NQ 0x00000800UL
512#define PFMF_MR 0x00000400UL
513#define PFMF_MC 0x00000200UL
514#define PFMF_KEY 0x000000feUL
515
516static int handle_pfmf(struct kvm_vcpu *vcpu)
517{
518 int reg1, reg2;
519 unsigned long start, end;
520
521 vcpu->stat.instruction_pfmf++;
522
523 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
524
525 if (!MACHINE_HAS_PFMF)
526 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
527
528 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
529 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
530
531 if (vcpu->run->s.regs.gprs[reg1] & PFMF_RESERVED)
532 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
533
534 /* Only provide non-quiescing support if the host supports it */
535 if (vcpu->run->s.regs.gprs[reg1] & PFMF_NQ &&
536 S390_lowcore.stfl_fac_list & 0x00020000)
537 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
538
539 /* No support for conditional-SSKE */
540 if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
541 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
542
543 start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
544 switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
545 case 0x00000000:
546 end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
547 break;
548 case 0x00001000:
549 end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
550 break;
 551 /* We don't support EDAT2
552 case 0x00002000:
553 end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
554 break;*/
555 default:
556 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
557 }
558 while (start < end) {
559 unsigned long useraddr;
560
561 useraddr = gmap_translate(start, vcpu->arch.gmap);
562 if (IS_ERR((void *)useraddr))
563 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
564
565 if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
566 if (clear_user((void __user *)useraddr, PAGE_SIZE))
567 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
568 }
569
570 if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
571 if (set_guest_storage_key(current->mm, useraddr,
572 vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
573 vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
574 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
575 }
576
577 start += PAGE_SIZE;
578 }
579 if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
580 vcpu->run->s.regs.gprs[reg2] = end;
581 return 0;
582}
583
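
The frame-size code selects how far one PFMF iteration reaches: FSC 0 means 4K frames, 0x1000 means 1M frames, and end is start rounded up to the next frame boundary before the page-by-page loop. A worked example of the rounding:

	#include <stdio.h>

	int main(void)
	{
		unsigned long start = 0x12345000UL;
		unsigned long end4k = (start + (1UL << 12)) & ~((1UL << 12) - 1);
		unsigned long end1m = (start + (1UL << 20)) & ~((1UL << 20) - 1);

		printf("4K frame ends at %#lx\n", end4k);	/* 0x12346000 */
		printf("1M frame ends at %#lx\n", end1m);	/* 0x12400000 */
		return 0;
	}
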
470static const intercept_handler_t b9_handlers[256] = { 584static const intercept_handler_t b9_handlers[256] = {
471 [0x8d] = handle_epsw, 585 [0x8d] = handle_epsw,
472 [0x9c] = handle_io_inst, 586 [0x9c] = handle_io_inst,
587 [0xaf] = handle_pfmf,
473}; 588};
474 589
475int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) 590int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
@@ -478,29 +593,96 @@ int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
478 593
479 /* This is handled just as for the B2 instructions. */ 594 /* This is handled just as for the B2 instructions. */
480 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 595 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
481 if (handler) { 596 if (handler)
482 if ((handler != handle_epsw) && 597 return handler(vcpu);
483 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)) 598
484 return kvm_s390_inject_program_int(vcpu,
485 PGM_PRIVILEGED_OPERATION);
486 else
487 return handler(vcpu);
488 }
489 return -EOPNOTSUPP; 599 return -EOPNOTSUPP;
490} 600}
491 601
602int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
603{
604 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
605 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
606 u64 useraddr;
607 u32 val = 0;
608 int reg, rc;
609
610 vcpu->stat.instruction_lctl++;
611
612 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
613 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
614
615 useraddr = kvm_s390_get_base_disp_rs(vcpu);
616
617 if (useraddr & 3)
618 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
619
620 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
621 useraddr);
622 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
623
624 reg = reg1;
625 do {
626 rc = get_guest(vcpu, val, (u32 __user *) useraddr);
627 if (rc)
628 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
629 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
630 vcpu->arch.sie_block->gcr[reg] |= val;
631 useraddr += 4;
632 if (reg == reg3)
633 break;
634 reg = (reg + 1) % 16;
635 } while (1);
636
637 return 0;
638}
639
640static int handle_lctlg(struct kvm_vcpu *vcpu)
641{
642 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
643 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
644 u64 useraddr;
645 int reg, rc;
646
647 vcpu->stat.instruction_lctlg++;
648
649 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
650 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
651
652 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
653
654 if (useraddr & 7)
655 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
656
657 reg = reg1;
658
659 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
660 useraddr);
661 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
662
663 do {
664 rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
665 (u64 __user *) useraddr);
666 if (rc)
667 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
668 useraddr += 8;
669 if (reg == reg3)
670 break;
671 reg = (reg + 1) % 16;
672 } while (1);
673
674 return 0;
675}
676
492static const intercept_handler_t eb_handlers[256] = { 677static const intercept_handler_t eb_handlers[256] = {
678 [0x2f] = handle_lctlg,
493 [0x8a] = handle_io_inst, 679 [0x8a] = handle_io_inst,
494}; 680};
495 681
496int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu) 682int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
497{ 683{
498 intercept_handler_t handler; 684 intercept_handler_t handler;
499 685
500 /* All eb instructions that end up here are privileged. */
501 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
502 return kvm_s390_inject_program_int(vcpu,
503 PGM_PRIVILEGED_OPERATION);
504 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; 686 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
505 if (handler) 687 if (handler)
506 return handler(vcpu); 688 return handler(vcpu);
@@ -515,6 +697,9 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
515 697
516 vcpu->stat.instruction_tprot++; 698 vcpu->stat.instruction_tprot++;
517 699
700 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
701 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
702
518 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2); 703 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
519 704
520 /* we only handle the Linux memory detection case: 705 /* we only handle the Linux memory detection case:
@@ -560,8 +745,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
560 u32 value; 745 u32 value;
561 746
562 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 747 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
563 return kvm_s390_inject_program_int(vcpu, 748 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
564 PGM_PRIVILEGED_OPERATION);
565 749
566 if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000) 750 if (vcpu->run->s.regs.gprs[0] & 0x00000000ffff0000)
567 return kvm_s390_inject_program_int(vcpu, 751 return kvm_s390_inject_program_int(vcpu,
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 1c48ab2845e0..bec398c57acf 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -79,8 +79,8 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
79 list_add_tail(&inti->list, &li->list); 79 list_add_tail(&inti->list, &li->list);
80 atomic_set(&li->active, 1); 80 atomic_set(&li->active, 1);
81 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 81 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
82 if (waitqueue_active(&li->wq)) 82 if (waitqueue_active(li->wq))
83 wake_up_interruptible(&li->wq); 83 wake_up_interruptible(li->wq);
84 spin_unlock_bh(&li->lock); 84 spin_unlock_bh(&li->lock);
85 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 85 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
86 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr); 86 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
@@ -117,8 +117,8 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
117 list_add_tail(&inti->list, &li->list); 117 list_add_tail(&inti->list, &li->list);
118 atomic_set(&li->active, 1); 118 atomic_set(&li->active, 1);
119 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); 119 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
120 if (waitqueue_active(&li->wq)) 120 if (waitqueue_active(li->wq))
121 wake_up_interruptible(&li->wq); 121 wake_up_interruptible(li->wq);
122 spin_unlock_bh(&li->lock); 122 spin_unlock_bh(&li->lock);
123 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 123 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
124 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr); 124 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
@@ -145,8 +145,8 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
145 atomic_set(&li->active, 1); 145 atomic_set(&li->active, 1);
146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
147 li->action_bits |= action; 147 li->action_bits |= action;
148 if (waitqueue_active(&li->wq)) 148 if (waitqueue_active(li->wq))
149 wake_up_interruptible(&li->wq); 149 wake_up_interruptible(li->wq);
150out: 150out:
151 spin_unlock_bh(&li->lock); 151 spin_unlock_bh(&li->lock);
152 152
@@ -250,8 +250,8 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
250 250
251 list_add_tail(&inti->list, &li->list); 251 list_add_tail(&inti->list, &li->list);
252 atomic_set(&li->active, 1); 252 atomic_set(&li->active, 1);
253 if (waitqueue_active(&li->wq)) 253 if (waitqueue_active(li->wq))
254 wake_up_interruptible(&li->wq); 254 wake_up_interruptible(li->wq);
255 rc = SIGP_CC_ORDER_CODE_ACCEPTED; 255 rc = SIGP_CC_ORDER_CODE_ACCEPTED;
256 256
257 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address); 257 VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
@@ -333,8 +333,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
333 333
334 /* sigp in userspace can exit */ 334 /* sigp in userspace can exit */
335 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 335 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
336 return kvm_s390_inject_program_int(vcpu, 336 return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
337 PGM_PRIVILEGED_OPERATION);
338 337
339 order_code = kvm_s390_get_base_disp_rs(vcpu); 338 order_code = kvm_s390_get_base_disp_rs(vcpu);
340 339
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 74c29d922458..17bf4d3d303a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -689,7 +689,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
689 entry = *ptep; 689 entry = *ptep;
690 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) { 690 if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
691 pgste = pgste_get_lock(ptep); 691 pgste = pgste_get_lock(ptep);
692 pgste_val(pgste) |= RCP_IN_BIT; 692 pgste_val(pgste) |= PGSTE_IN_BIT;
693 pgste_set_unlock(ptep, pgste); 693 pgste_set_unlock(ptep, pgste);
694 start += PAGE_SIZE; 694 start += PAGE_SIZE;
695 len -= PAGE_SIZE; 695 len -= PAGE_SIZE;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af9c5525434d..f87f7fcefa0a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,14 +222,22 @@ struct kvm_mmu_page {
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
+
+	/* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
+	unsigned long mmu_valid_gen;
+
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
+	/*
+	 * Used out of the mmu-lock to avoid reading spte values while an
+	 * update is in progress; see the comments in __get_spte_lockless().
+	 */
 	int clear_spte_count;
 #endif
 
+	/* Number of writes since the last time traversal visited this page. */
 	int write_flooding_count;
-	bool mmio_cached;
 };
 
 struct kvm_pio_request {
@@ -529,11 +537,14 @@ struct kvm_arch {
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	struct list_head zapped_obsolete_pages;
+
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
@@ -769,7 +780,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 				     struct kvm_memory_slot *slot,
 				     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
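The paired mmu_valid_gen fields give O(1) invalidation: bumping the per-VM counter makes every existing shadow page compare unequal, so pages can be zapped lazily afterwards. A minimal userspace model of the idea (names are illustrative, not kernel API):

#include <stdio.h>

struct vm    { unsigned long mmu_valid_gen; };
struct spage { unsigned long mmu_valid_gen; };

static int is_obsolete(const struct vm *vm, const struct spage *sp)
{
	return sp->mmu_valid_gen != vm->mmu_valid_gen;
}

int main(void)
{
	struct vm vm = { 0 };
	struct spage sp = { vm.mmu_valid_gen };	/* page created "now" */

	vm.mmu_valid_gen++;	/* invalidate everything in one step */

	printf("obsolete: %d\n", is_obsolete(&vm, &sp));
	return 0;
}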
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d609e1d84048..bf4fb04d0112 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
 CFLAGS_vmx.o := -I.
 
-kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-				coalesced_mmio.o irq_comm.o eventfd.o \
-				irqchip.o)
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(addprefix ../../../virt/kvm/, \
-				assigned-dev.o iommu.o)
-kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
+KVM := ../../../virt/kvm
+
+kvm-y			+= $(KVM)/kvm_main.o $(KVM)/ioapic.o \
+				$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+				$(KVM)/eventfd.o $(KVM)/irqchip.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(KVM)/assigned-dev.o $(KVM)/iommu.o
+kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o cpuid.o pmu.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5953dcea752d..2bc1e81045b0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -61,6 +61,8 @@
 #define OpMem8            26ull  /* 8-bit zero extended memory operand */
 #define OpImm64           27ull  /* Sign extended 16/32/64-bit immediate */
 #define OpXLat            28ull  /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo           29ull  /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi           30ull  /* High part of extended acc (-/DX/EDX/RDX) */
 
 #define OpBits             5  /* Width of operand field */
 #define OpMask             ((1ull << OpBits) - 1)
@@ -86,6 +88,7 @@
 #define DstMem64    (OpMem64 << DstShift)
 #define DstImmUByte (OpImmUByte << DstShift)
 #define DstDX       (OpDX << DstShift)
+#define DstAccLo    (OpAccLo << DstShift)
 #define DstMask     (OpMask << DstShift)
 /* Source operand type. */
 #define SrcShift    6
@@ -108,6 +111,7 @@
 #define SrcImm64    (OpImm64 << SrcShift)
 #define SrcDX       (OpDX << SrcShift)
 #define SrcMem8     (OpMem8 << SrcShift)
+#define SrcAccHi    (OpAccHi << SrcShift)
 #define SrcMask     (OpMask << SrcShift)
 #define BitOp       (1<<11)
 #define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
@@ -138,6 +142,7 @@
 /* Source 2 operand type */
 #define Src2Shift   (31)
 #define Src2None    (OpNone << Src2Shift)
+#define Src2Mem     (OpMem << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
 #define Src2One     (OpOne << Src2Shift)
@@ -155,6 +160,9 @@
 #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
 #define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
+#define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
+
+#define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
 #define X2(x...) x, x
 #define X3(x...) X2(x), x
@@ -171,10 +179,11 @@
 /*
  * fastop functions have a special calling convention:
  *
- * dst:    [rdx]:rax  (in/out)
- * src:    rbx        (in/out)
+ * dst:    rax        (in/out)
+ * src:    rdx        (in/out)
  * src2:   rcx        (in)
  * flags:  rflags     (in/out)
+ * ex:     rsi        (in:fastop pointer, out:zero if exception)
  *
  * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
  * different operand sizes can be reached by calculation, rather than a jump
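Under the reworked convention the destination rides in rax and the source moves from rbx to rdx, while rsi carries the fastop pointer in and is zeroed to report an exception. A userspace sketch of the register/flags contract, using GCC's flag-output constraints in place of the kernel's pushf/popf dance (illustrative only, not kernel code; needs GCC 6+ on x86-64):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dst = UINT64_MAX;	/* dst: rax (in/out) */
	uint64_t src = 1;		/* src: rdx (in)     */
	int cf, zf;

	/* one "fastop"-style binary op with flags captured */
	asm("addq %[src], %[dst]"
	    : [dst] "+a" (dst), "=@ccc" (cf), "=@ccz" (zf)
	    : [src] "d" (src));

	printf("dst=%llu CF=%d ZF=%d\n", (unsigned long long)dst, cf, zf);
	return 0;	/* prints dst=0 CF=1 ZF=1 */
}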
@@ -276,174 +285,17 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
 }
 
 /*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
-
-#if defined(CONFIG_X86_64)
-#define _LO32 "k"		/* force 32-bit operand */
-#define _STK  "%%rsp"		/* stack pointer */
-#elif defined(__i386__)
-#define _LO32 ""		/* force 32-bit operand */
-#define _STK  "%%esp"		/* stack pointer */
-#endif
-
-/*
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
 #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
 
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
-	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
-	"movl %"_sav",%"_LO32 _tmp"; " \
-	"push %"_tmp"; " \
-	"push %"_tmp"; " \
-	"movl %"_msk",%"_LO32 _tmp"; " \
-	"andl %"_LO32 _tmp",("_STK"); " \
-	"pushf; " \
-	"notl %"_LO32 _tmp"; " \
-	"andl %"_LO32 _tmp",("_STK"); " \
-	"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
-	"pop %"_tmp"; " \
-	"orl %"_LO32 _tmp",("_STK"); " \
-	"popf; " \
-	"pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
-	/* _sav |= EFLAGS & _msk; */ \
-	"pushf; " \
-	"pop %"_tmp"; " \
-	"andl %"_msk",%"_LO32 _tmp"; " \
-	"orl %"_LO32 _tmp",%"_sav"; "
-
 #ifdef CONFIG_X86_64
 #define ON64(x) x
 #else
 #define ON64(x)
 #endif
 
-#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
-	do { \
-		__asm__ __volatile__ ( \
-			_PRE_EFLAGS("0", "4", "2") \
-			_op _suffix " %"_x"3,%1; " \
-			_POST_EFLAGS("0", "4", "2") \
-			: "=m" ((ctxt)->eflags), \
-			  "+q" (*(_dsttype*)&(ctxt)->dst.val), \
-			  "=&r" (_tmp) \
-			: _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
-	} while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do { \
-		unsigned long _tmp; \
-		\
-		switch ((ctxt)->dst.bytes) { \
-		case 2: \
-			____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
-			break; \
-		case 4: \
-			____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
-			break; \
-		case 8: \
-			ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
-			break; \
-		} \
-	} while (0)
-
-#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do { \
-		unsigned long _tmp; \
-		switch ((ctxt)->dst.bytes) { \
-		case 1: \
-			____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
-			break; \
-		default: \
-			__emulate_2op_nobyte(ctxt, _op, \
-					     _wx, _wy, _lx, _ly, _qx, _qy); \
-			break; \
-		} \
-	} while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(ctxt, _op) \
-	__emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(ctxt, _op) \
-	__emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(ctxt, _op) \
-	__emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
-	do { \
-		unsigned long _tmp; \
-		_type _clv = (ctxt)->src2.val; \
-		_type _srcv = (ctxt)->src.val; \
-		_type _dstv = (ctxt)->dst.val; \
-		\
-		__asm__ __volatile__ ( \
-			_PRE_EFLAGS("0", "5", "2") \
-			_op _suffix " %4,%1 \n" \
-			_POST_EFLAGS("0", "5", "2") \
-			: "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
-			: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
-			); \
-		\
-		(ctxt)->src2.val = (unsigned long) _clv; \
-		(ctxt)->src2.val = (unsigned long) _srcv; \
-		(ctxt)->dst.val = (unsigned long) _dstv; \
-	} while (0)
-
-#define emulate_2op_cl(ctxt, _op) \
-	do { \
-		switch ((ctxt)->dst.bytes) { \
-		case 2: \
-			__emulate_2op_cl(ctxt, _op, "w", u16); \
-			break; \
-		case 4: \
-			__emulate_2op_cl(ctxt, _op, "l", u32); \
-			break; \
-		case 8: \
-			ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
-			break; \
-		} \
-	} while (0)
-
-#define __emulate_1op(ctxt, _op, _suffix) \
-	do { \
-		unsigned long _tmp; \
-		\
-		__asm__ __volatile__ ( \
-			_PRE_EFLAGS("0", "3", "2") \
-			_op _suffix " %1; " \
-			_POST_EFLAGS("0", "3", "2") \
-			: "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
-			  "=&r" (_tmp) \
-			: "i" (EFLAGS_MASK)); \
-	} while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(ctxt, _op) \
-	do { \
-		switch ((ctxt)->dst.bytes) { \
-		case 1: __emulate_1op(ctxt, _op, "b"); break; \
-		case 2: __emulate_1op(ctxt, _op, "w"); break; \
-		case 4: __emulate_1op(ctxt, _op, "l"); break; \
-		case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
-		} \
-	} while (0)
-
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 
 #define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
@@ -462,7 +314,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FOPNOP() FOP_ALIGN FOP_RET
 
 #define FOP1E(op, dst) \
-	FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
+	FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op, dst) \
+	FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
 
 #define FASTOP1(op) \
 	FOP_START(op) \
@@ -472,24 +327,42 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	ON64(FOP1E(op##q, rax)) \
 	FOP_END
 
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+	FOP_START(name) \
+	FOP1E(op, cl) \
+	FOP1E(op, cx) \
+	FOP1E(op, ecx) \
+	ON64(FOP1E(op, rcx)) \
+	FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+	FOP_START(name) \
+	FOP1EEX(op, cl) \
+	FOP1EEX(op, cx) \
+	FOP1EEX(op, ecx) \
+	ON64(FOP1EEX(op, rcx)) \
+	FOP_END
+
 #define FOP2E(op, dst, src) \
 	FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
 
 #define FASTOP2(op) \
 	FOP_START(op) \
-	FOP2E(op##b, al, bl) \
-	FOP2E(op##w, ax, bx) \
-	FOP2E(op##l, eax, ebx) \
-	ON64(FOP2E(op##q, rax, rbx)) \
+	FOP2E(op##b, al, dl) \
+	FOP2E(op##w, ax, dx) \
+	FOP2E(op##l, eax, edx) \
+	ON64(FOP2E(op##q, rax, rdx)) \
 	FOP_END
 
 /* 2 operand, word only */
 #define FASTOP2W(op) \
 	FOP_START(op) \
 	FOPNOP() \
-	FOP2E(op##w, ax, bx) \
-	FOP2E(op##l, eax, ebx) \
-	ON64(FOP2E(op##q, rax, rbx)) \
+	FOP2E(op##w, ax, dx) \
+	FOP2E(op##l, eax, edx) \
+	ON64(FOP2E(op##q, rax, rdx)) \
 	FOP_END
 
 /* 2 operand, src is CL */
@@ -508,14 +381,17 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 #define FASTOP3WCL(op) \
 	FOP_START(op) \
 	FOPNOP() \
-	FOP3E(op##w, ax, bx, cl) \
-	FOP3E(op##l, eax, ebx, cl) \
-	ON64(FOP3E(op##q, rax, rbx, cl)) \
+	FOP3E(op##w, ax, dx, cl) \
+	FOP3E(op##l, eax, edx, cl) \
+	ON64(FOP3E(op##q, rax, rdx, cl)) \
 	FOP_END
 
 /* Special case for SETcc - 1 instruction per cc */
 #define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
 
+asm(".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret");
+
 FOP_START(setcc)
 FOP_SETCC(seto)
 FOP_SETCC(setno)
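kvm_fastop_exception reports a fault by zeroing %esi, the register that carried the fastop pointer in, so the caller can treat a cleared pointer like a NULL return. A rough userspace analogue of that error-signalling idiom (hypothetical helper names, for illustration):

#include <stdio.h>

typedef int (*binop_t)(int, int);

static int udiv(int a, int b) { return a / b; }

/* returns the op pointer, or NULL to signal an "exception" */
static binop_t run_checked(binop_t op, int a, int b, int *res)
{
	if (b == 0)
		return NULL;	/* analogue of "xor %esi, %esi" */
	*res = op(a, b);
	return op;
}

int main(void)
{
	int r;

	if (!run_checked(udiv, 10, 0, &r))
		printf("divide error signalled via cleared pointer\n");
	return 0;
}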
@@ -538,47 +414,6 @@ FOP_END;
 FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
 FOP_END;
 
-#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
-	do { \
-		unsigned long _tmp; \
-		ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
-		ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
-		\
-		__asm__ __volatile__ ( \
-			_PRE_EFLAGS("0", "5", "1") \
-			"1: \n\t" \
-			_op _suffix " %6; " \
-			"2: \n\t" \
-			_POST_EFLAGS("0", "5", "1") \
-			".pushsection .fixup,\"ax\" \n\t" \
-			"3: movb $1, %4 \n\t" \
-			"jmp 2b \n\t" \
-			".popsection \n\t" \
-			_ASM_EXTABLE(1b, 3b) \
-			: "=m" ((ctxt)->eflags), "=&r" (_tmp), \
-			  "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
-			: "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \
-	} while (0)
-
-/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
-	do { \
-		switch((ctxt)->src.bytes) { \
-		case 1: \
-			__emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
-			break; \
-		case 2: \
-			__emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
-			break; \
-		case 4: \
-			__emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
-			break; \
-		case 8: ON64( \
-			__emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
-			break; \
-		} \
-	} while (0)
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
@@ -988,6 +823,11 @@ FASTOP2(xor);
 FASTOP2(cmp);
 FASTOP2(test);
 
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
 FASTOP3WCL(shld);
 FASTOP3WCL(shrd);
 
@@ -1013,6 +853,8 @@ FASTOP2W(bts);
 FASTOP2W(btr);
 FASTOP2W(btc);
 
+FASTOP2(xadd);
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
 	u8 rc;
@@ -1726,45 +1568,42 @@ static void write_register_operand(struct operand *op)
 	}
 }
 
-static int writeback(struct x86_emulate_ctxt *ctxt)
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
 {
 	int rc;
 
-	if (ctxt->d & NoWrite)
-		return X86EMUL_CONTINUE;
-
-	switch (ctxt->dst.type) {
+	switch (op->type) {
 	case OP_REG:
-		write_register_operand(&ctxt->dst);
+		write_register_operand(op);
 		break;
 	case OP_MEM:
 		if (ctxt->lock_prefix)
 			rc = segmented_cmpxchg(ctxt,
-					       ctxt->dst.addr.mem,
-					       &ctxt->dst.orig_val,
-					       &ctxt->dst.val,
-					       ctxt->dst.bytes);
+					       op->addr.mem,
+					       &op->orig_val,
+					       &op->val,
+					       op->bytes);
 		else
 			rc = segmented_write(ctxt,
-					     ctxt->dst.addr.mem,
-					     &ctxt->dst.val,
-					     ctxt->dst.bytes);
+					     op->addr.mem,
+					     &op->val,
+					     op->bytes);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_MEM_STR:
 		rc = segmented_write(ctxt,
-				     ctxt->dst.addr.mem,
-				     ctxt->dst.data,
-				     ctxt->dst.bytes * ctxt->dst.count);
+				     op->addr.mem,
+				     op->data,
+				     op->bytes * op->count);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_XMM:
-		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
+		write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
 		break;
 	case OP_MM:
-		write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+		write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
 		break;
 	case OP_NONE:
 		/* no writeback */
@@ -2117,42 +1956,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 ex = 0;
-
-	emulate_1op_rax_rdx(ctxt, "mul", ex);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 ex = 0;
-
-	emulate_1op_rax_rdx(ctxt, "imul", ex);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_div_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 de = 0;
-
-	emulate_1op_rax_rdx(ctxt, "div", de);
-	if (de)
-		return emulate_de(ctxt);
-	return X86EMUL_CONTINUE;
-}
-
-static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
-{
-	u8 de = 0;
-
-	emulate_1op_rax_rdx(ctxt, "idiv", de);
-	if (de)
-		return emulate_de(ctxt);
-	return X86EMUL_CONTINUE;
-}
-
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
 	int rc = X86EMUL_CONTINUE;
@@ -3734,10 +3537,10 @@ static const struct opcode group3[] = {
 	F(DstMem | SrcImm | NoWrite, em_test),
 	F(DstMem | SrcNone | Lock, em_not),
 	F(DstMem | SrcNone | Lock, em_neg),
-	I(SrcMem, em_mul_ex),
-	I(SrcMem, em_imul_ex),
-	I(SrcMem, em_div_ex),
-	I(SrcMem, em_idiv_ex),
+	F(DstXacc | Src2Mem, em_mul_ex),
+	F(DstXacc | Src2Mem, em_imul_ex),
+	F(DstXacc | Src2Mem, em_div_ex),
+	F(DstXacc | Src2Mem, em_idiv_ex),
 };
 
 static const struct opcode group4[] = {
@@ -4064,7 +3867,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
 	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xC7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
 	N, D(DstMem | SrcReg | ModRM | Mov),
 	N, N, N, GD(0, &group9),
 	/* 0xC8 - 0xCF */
@@ -4172,6 +3975,24 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 		fetch_register_operand(op);
 		op->orig_val = op->val;
 		break;
+	case OpAccLo:
+		op->type = OP_REG;
+		op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+		fetch_register_operand(op);
+		op->orig_val = op->val;
+		break;
+	case OpAccHi:
+		if (ctxt->d & ByteOp) {
+			op->type = OP_NONE;
+			break;
+		}
+		op->type = OP_REG;
+		op->bytes = ctxt->op_bytes;
+		op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+		fetch_register_operand(op);
+		op->orig_val = op->val;
+		break;
 	case OpDI:
 		op->type = OP_MEM;
 		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
@@ -4553,11 +4374,15 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+	if (!(ctxt->d & ByteOp))
+		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
 	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
-	    : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
-	    : "c"(ctxt->src2.val), [fastop]"S"(fop));
+	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+	      [fastop]"+S"(fop)
+	    : "c"(ctxt->src2.val));
 	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+	if (!fop) /* exception is returned in fop variable */
+		return emulate_de(ctxt);
 	return X86EMUL_CONTINUE;
 }
 
@@ -4773,9 +4598,17 @@ special_insn:
 		goto done;
 
 writeback:
-	rc = writeback(ctxt);
-	if (rc != X86EMUL_CONTINUE)
-		goto done;
+	if (!(ctxt->d & NoWrite)) {
+		rc = writeback(ctxt, &ctxt->dst);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
+	if (ctxt->d & SrcWrite) {
+		BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+		rc = writeback(ctxt, &ctxt->src);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
 
 	/*
 	 * restore dst type in case the decoding will be reused
@@ -4872,12 +4705,6 @@ twobyte_insn:
 		ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
 							(s16) ctxt->src.val;
 		break;
-	case 0xc0 ... 0xc1:	/* xadd */
-		fastop(ctxt, em_add);
-		/* Write back the register source. */
-		ctxt->src.val = ctxt->dst.orig_val;
-		write_register_operand(&ctxt->src);
-		break;
 	case 0xc3:		/* movnti */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
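The DstXacc/OpAccHi machinery models x86's widening multiply: MUL r/m64 leaves the low half of the product in RAX and the high half in RDX, which is exactly the register pair the decoder now writes back. A standalone C check of that split, using the GCC/Clang __int128 extension (illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t a = 0xfedcba9876543210ull;
	uint64_t b = 0x0123456789abcdefull;
	unsigned __int128 prod = (unsigned __int128)a * b;

	/* what the emulator writes back via OpAccLo/OpAccHi */
	uint64_t lo = (uint64_t)prod;		/* would land in RAX */
	uint64_t hi = (uint64_t)(prod >> 64);	/* would land in RDX */

	printf("lo=%#llx hi=%#llx\n",
	       (unsigned long long)lo, (unsigned long long)hi);
	return 0;
}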
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0eee2c8b64d1..afc11245827c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1608,8 +1608,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 		return;
 
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
-		if (kvm_apic_local_deliver(apic, APIC_LVTT))
-			atomic_dec(&apic->lapic_timer.pending);
+		kvm_apic_local_deliver(apic, APIC_LVTT);
+		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
 
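The fix injects at most one timer interrupt per pass and then drops the coalesced backlog, instead of decrementing only on successful delivery, which let stale ticks pile up behind a masked LVT entry. A toy model of the new accounting (illustrative):

#include <stdio.h>
#include <stdbool.h>

static bool deliver(bool masked)
{
	return !masked;		/* delivery fails while the entry is masked */
}

int main(void)
{
	int pending = 3;		/* coalesced timer ticks */
	bool lvtt_masked = true;

	if (pending > 0) {
		deliver(lvtt_masked);	/* one attempt ... */
		pending = 0;		/* ... then drop the backlog */
	}

	printf("pending=%d\n", pending);
	return 0;
}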
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT		3
+#define MMIO_SPTE_GEN_HIGH_SHIFT	52
+
+#define MMIO_GEN_SHIFT			19
+#define MMIO_GEN_LOW_SHIFT		9
+#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN			((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
-	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	u64 mask;
+
+	WARN_ON(gen > MMIO_MAX_GEN);
+
+	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+	return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+	unsigned int gen;
+
+	spte &= ~shadow_mmio_mask;
+
+	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+	return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+	/*
+	 * Init kvm generation close to MMIO_MAX_GEN to easily test the
+	 * code of handling generation number wrap-around.
+	 */
+	return (kvm_memslots(kvm)->generation +
+		      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+			   unsigned access)
+{
+	unsigned int gen = kvm_current_mmio_generation(kvm);
+	u64 mask = generation_mmio_spte_mask(gen);
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+	mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
-	sp->mmio_cached = true;
-	trace_mark_mmio_spte(sptep, gfn, access);
-	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+	trace_mark_mmio_spte(sptep, gfn, access, gen);
+	mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+	return (spte & ~mask) >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
-	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+	u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+	return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+			  pfn_t pfn, unsigned access)
 {
 	if (unlikely(is_noslot_pfn(pfn))) {
-		mark_mmio_spte(sptep, gfn, access);
+		mark_mmio_spte(kvm, sptep, gfn, access);
 		return true;
 	}
 
 	return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+	unsigned int kvm_gen, spte_gen;
+
+	kvm_gen = kvm_current_mmio_generation(kvm);
+	spte_gen = get_mmio_spte_generation(spte);
+
+	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+	return likely(kvm_gen == spte_gen);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
 	return ((1ULL << (e - s + 1)) - 1) << s;
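The 19-bit generation is split across spte bits 3-11 (low 9 bits) and 52-61 (high 10 bits). This standalone check exercises the pack/unpack pair above over every possible generation (userspace sketch; shadow_mmio_mask is omitted because the kernel masks it off before unpacking):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GEN_LOW_SHIFT	3	/* spte bits 3..11 */
#define GEN_HIGH_SHIFT	52	/* spte bits 52..61 */
#define GEN_LOW_BITS	9
#define GEN_BITS	19
#define GEN_LOW_MASK	((1u << GEN_LOW_BITS) - 1)
#define MAX_GEN		((1u << GEN_BITS) - 1)

static uint64_t pack(unsigned int gen)
{
	uint64_t mask;

	mask  = (uint64_t)(gen & GEN_LOW_MASK) << GEN_LOW_SHIFT;
	mask |= ((uint64_t)gen >> GEN_LOW_BITS) << GEN_HIGH_SHIFT;
	return mask;
}

static unsigned int unpack(uint64_t spte)
{
	unsigned int gen;

	gen  = (spte >> GEN_LOW_SHIFT) & GEN_LOW_MASK;
	gen |= (unsigned int)(spte >> GEN_HIGH_SHIFT) << GEN_LOW_BITS;
	return gen;
}

int main(void)
{
	unsigned int gen;

	for (gen = 0; gen <= MAX_GEN; gen++)
		assert(unpack(pack(gen)) == gen);
	printf("all %u generations round-trip\n", MAX_GEN + 1u);
	return 0;
}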
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 /*
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
  */
 static u64 __get_spte_lockless(u64 *sptep)
 {
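clear_spte_count works like a small sequence counter: a reader retries if a present->non-present clear raced with its two 32-bit loads. A single-threaded userspace analogue of the retry loop (illustrative; the kernel variant in __get_spte_lockless() orders the loads more carefully):

#include <stdint.h>
#include <stdio.h>

struct spte32 {
	volatile uint32_t lo, hi;
	volatile unsigned int clear_count;	/* bumped on each clear */
};

static uint64_t read_lockless(const struct spte32 *s)
{
	unsigned int count;
	uint32_t lo, hi;

	do {
		count = s->clear_count;
		lo = s->lo;
		hi = s->hi;
	} while (count != s->clear_count);	/* a clear raced: retry */

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct spte32 s = { .lo = 0xdeadbeefu, .hi = 0x12345678u };

	printf("spte=%#llx\n", (unsigned long long)read_lockless(&s));
	return 0;
}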
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	/*
+	 * The active_mmu_pages list is the FIFO list, do not move the
+	 * page until it is zapped. kvm_zap_obsolete_pages depends on
+	 * this feature. See the comments in kvm_zap_obsolete_pages().
+	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_gfn_indirect_valid_sp has skipped that kind of page and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
+ * all the obsolete pages.
+ */
 #define for_each_gfn_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		role.quadrant = quadrant;
 	}
 	for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+		if (is_obsolete_sp(vcpu->kvm, sp))
+			continue;
+
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		account_shadowed(vcpu->kvm, gfn);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	init_shadow_page_table(sp);
 	trace_kvm_mmu_get_page(sp, true);
 	return sp;
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
+
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
+
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 		kvm_mod_used_mmu_pages(kvm, -1);
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
-		kvm_reload_remote_mmus(kvm);
+
+		/*
+		 * The obsolete pages can not be used on any vcpus.
+		 * See the comments in kvm_mmu_invalidate_zap_all_pages().
+		 */
+		if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+			kvm_reload_remote_mmus(kvm);
 	}
 
 	sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte;
 	int ret = 0;
 
-	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+	if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
 		return 0;
 
 	spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
-	spin_lock(&vcpu->kvm->mmu_lock);
+
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
 	    (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
 	     vcpu->arch.mmu.direct_map)) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = page_header(root);
 		--sp->root_count;
 		if (!sp->root_count && sp->role.invalid) {
 			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		}
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		return;
 	}
+
+	spin_lock(&vcpu->kvm->mmu_lock);
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
 	return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
 	u64 spte;
 
 	if (quickly_check_mmio_pf(vcpu, addr, direct))
-		return 1;
+		return RET_MMIO_PF_EMULATE;
 
 	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 		gfn_t gfn = get_mmio_spte_gfn(spte);
 		unsigned access = get_mmio_spte_access(spte);
 
+		if (!check_mmio_spte(vcpu->kvm, spte))
+			return RET_MMIO_PF_INVALID;
+
 		if (direct)
 			addr = 0;
 
 		trace_handle_mmio_page_fault(addr, gfn, access);
 		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-		return 1;
+		return RET_MMIO_PF_EMULATE;
 	}
 
 	/*
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 	 * it's a BUG if the gfn is not a mmio page.
 	 */
 	if (direct && !check_direct_spte_mmio_pf(spte))
-		return -1;
+		return RET_MMIO_PF_BUG;
 
 	/*
 	 * If the page table is zapped by other cpus, let CPU fault again on
 	 * the address.
 	 */
-	return 0;
+	return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
 	int ret;
 
 	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-	WARN_ON(ret < 0);
+	WARN_ON(ret == RET_MMIO_PF_BUG);
 	return ret;
 }
 
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 	*access &= mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-			   int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+			   unsigned access, int *nr_present)
 {
 	if (unlikely(is_mmio_spte(*sptep))) {
 		if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
 		}
 
 		(*nr_present)++;
-		mark_mmio_spte(sptep, gfn, access);
+		mark_mmio_spte(kvm, sptep, gfn, access);
 		return true;
 	}
 
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	if (r)
 		goto out;
 	r = mmu_alloc_roots(vcpu);
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_roots(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_sync_roots(vcpu);
 	if (r)
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 	spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES	10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
+	int batch = 0;
 
-	spin_lock(&kvm->mmu_lock);
 restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+	list_for_each_entry_safe_reverse(sp, node,
+	      &kvm->arch.active_mmu_pages, link) {
+		int ret;
+
+		/*
+		 * No obsolete page exists before new created page since
+		 * active_mmu_pages is the FIFO list.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
+			break;
+
+		/*
+		 * Since we are reversely walking the list and the invalid
+		 * list will be moved to the head, skip the invalid page
+		 * can help us to avoid the infinity list walking.
+		 */
+		if (sp->role.invalid)
+			continue;
+
+		/*
+		 * Need not flush tlb since we only zap the sp with invalid
+		 * generation number.
+		 */
+		if (batch >= BATCH_ZAP_PAGES &&
+		      cond_resched_lock(&kvm->mmu_lock)) {
+			batch = 0;
+			goto restart;
+		}
+
+		ret = kvm_mmu_prepare_zap_page(kvm, sp,
+				&kvm->arch.zapped_obsolete_pages);
+		batch += ret;
+
+		if (ret)
 			goto restart;
+	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * Should flush tlb before free page tables since lockless-walking
+	 * may use the pages.
	 */
+	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
-	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
-
 	spin_lock(&kvm->mmu_lock);
-restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-		if (!sp->mmio_cached)
-			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
-	}
+	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+	kvm->arch.mmu_valid_gen++;
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	/*
+	 * Notify all vcpus to reload its shadow page table
+	 * and flush TLB. Then all vcpus will switch to new
+	 * shadow page table with the new mmu_valid_gen.
+	 *
+	 * Note: we should do this under the protection of
+	 * mmu-lock, otherwise, vcpu would purge shadow page
+	 * but miss tlb flush.
+	 */
+	kvm_reload_remote_mmus(kvm);
+
+	kvm_zap_obsolete_pages(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+	/*
+	 * The very rare case: if the generation-number is round,
+	 * zap all shadow pages.
+	 *
+	 * The max value is MMIO_MAX_GEN - 1 since it is not called
+	 * when mark memslot invalid.
+	 */
+	if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+		printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+		kvm_mmu_invalidate_zap_all_pages(kvm);
+	}
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
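kvm_zap_obsolete_pages() drops and retakes mmu_lock every BATCH_ZAP_PAGES zaps so contending vcpus are not starved during a bulk invalidation. The generic lock-break skeleton, reduced to userspace pthreads (illustrative):

#include <pthread.h>
#include <stdio.h>

#define BATCH 10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	int batch = 0, todo = 37, zapped = 0;

	pthread_mutex_lock(&lock);
	while (zapped < todo) {
		if (batch >= BATCH) {
			/* lock break: waiters can run here */
			pthread_mutex_unlock(&lock);
			batch = 0;
			pthread_mutex_lock(&lock);
		}
		zapped++;	/* "zap one obsolete page" */
		batch++;
	}
	pthread_mutex_unlock(&lock);

	printf("zapped %d pages\n", zapped);
	return 0;
}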
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		 * want to shrink a VM that only started to populate its MMU
 		 * anyway.
 		 */
-		if (!kvm->arch.n_used_mmu_pages)
+		if (!kvm->arch.n_used_mmu_pages &&
+		    !kvm_has_zapped_obsolete_pages(kvm))
 			continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
+		if (kvm_has_zapped_obsolete_pages(kvm)) {
+			kvm_mmu_commit_zap_page(kvm,
+			      &kvm->arch.zapped_obsolete_pages);
+			goto unlock;
+		}
+
 		prepare_zap_oldest_mmu_page(kvm, &invalid_list);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2cac6d..5b59c573aba7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,23 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+
+/*
+ * Return values of handle_mmio_page_fault_common:
+ * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
+ *			directly.
+ * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
+ *			fault path update the mmio spte.
+ * RET_MMIO_PF_RETRY: let CPU fault again on the address.
+ * RET_MMIO_PF_BUG: bug is detected.
+ */
+enum {
+	RET_MMIO_PF_EMULATE = 1,
+	RET_MMIO_PF_INVALID = 2,
+	RET_MMIO_PF_RETRY = 0,
+	RET_MMIO_PF_BUG = -1
+};
+
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
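Callers are expected to switch on these codes, as the nonpaging/tdp fault paths elsewhere in this series do. A compact dispatch sketch (illustrative):

#include <stdio.h>

enum {
	RET_MMIO_PF_EMULATE = 1,
	RET_MMIO_PF_INVALID = 2,
	RET_MMIO_PF_RETRY = 0,
	RET_MMIO_PF_BUG = -1
};

static const char *dispatch(int ret)
{
	switch (ret) {
	case RET_MMIO_PF_EMULATE: return "emulate the access";
	case RET_MMIO_PF_INVALID: return "take the normal page-fault path";
	case RET_MMIO_PF_RETRY:   return "let the CPU fault again";
	default:                  return "bug: warn and bail out";
	}
}

int main(void)
{
	int ret;

	for (ret = RET_MMIO_PF_BUG; ret <= RET_MMIO_PF_INVALID; ret++)
		printf("%d -> %s\n", ret, dispatch(ret));
	return 0;
}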
@@ -97,4 +114,5 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
 	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 #endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172f4174..9d2e0ffcb190 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
 #define KVM_MMU_PAGE_FIELDS \
+	__field(unsigned long, mmu_valid_gen) \
 	__field(__u64, gfn) \
 	__field(__u32, role) \
 	__field(__u32, root_count) \
 	__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp) \
+	__entry->mmu_valid_gen = sp->mmu_valid_gen; \
 	__entry->gfn = sp->gfn; \
 	__entry->role = sp->role.word; \
 	__entry->root_count = sp->root_count; \
 	__entry->unsync = sp->unsync;
 
@@ -28,8 +30,8 @@
 	\
 	role.word = __entry->role; \
 	\
-	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
-			 " %snxe root %u %s%c", \
+	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \
+			 " %snxe root %u %s%c", __entry->mmu_valid_gen, \
 			 __entry->gfn, role.level, \
 			 role.cr4_pae ? " pae" : "", \
 			 role.quadrant, \
@@ -197,23 +199,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 TRACE_EVENT(
 	mark_mmio_spte,
-	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
-	TP_ARGS(sptep, gfn, access),
+	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+	TP_ARGS(sptep, gfn, access, gen),
 
 	TP_STRUCT__entry(
 		__field(void *, sptep)
 		__field(gfn_t, gfn)
 		__field(unsigned, access)
+		__field(unsigned int, gen)
 	),
 
 	TP_fast_assign(
 		__entry->sptep = sptep;
 		__entry->gfn = gfn;
 		__entry->access = access;
+		__entry->gen = gen;
 	),
 
-	TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
-		  __entry->access)
+	TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
		  __entry->gfn, __entry->access, __entry->gen)
 );
 
 TRACE_EVENT(
@@ -274,6 +278,50 @@ TRACE_EVENT(
 		__spte_satisfied(old_spte), __spte_satisfied(new_spte)
 	)
 );
+
+TRACE_EVENT(
+	kvm_mmu_invalidate_zap_all_pages,
+	TP_PROTO(struct kvm *kvm),
+	TP_ARGS(kvm),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, mmu_valid_gen)
+		__field(unsigned int, mmu_used_pages)
+	),
+
+	TP_fast_assign(
+		__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+		__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+	),
+
+	TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+		  __entry->mmu_valid_gen, __entry->mmu_used_pages
+	)
+);
+
+
+TRACE_EVENT(
+	check_mmio_spte,
+	TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+	TP_ARGS(spte, kvm_gen, spte_gen),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, kvm_gen)
+		__field(unsigned int, spte_gen)
+		__field(u64, spte)
+	),
+
+	TP_fast_assign(
+		__entry->kvm_gen = kvm_gen;
+		__entry->spte_gen = spte_gen;
+		__entry->spte = spte;
+	),
+
+	TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+		  __entry->kvm_gen, __entry->spte_gen,
+		  __entry->kvm_gen == __entry->spte_gen
+	)
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860b457a..7769699d48a8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -552,9 +552,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK))
-		return handle_mmio_page_fault(vcpu, addr, error_code,
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, addr, error_code,
 					      mmu_is_nested(vcpu));
+		if (likely(r != RET_MMIO_PF_INVALID))
+			return r;
+	};
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -792,7 +795,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		pte_access &= gpte_access(vcpu, gpte);
 		protect_clean_gpte(&pte_access, gpte);
 
-		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+		    &nr_present))
 			continue;
 
 		if (gfn != sp->gfns[i]) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a14a6eaf871d..c0bc80391e40 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1026,7 +1026,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 		g_tsc_offset = svm->vmcb->control.tsc_offset -
 			       svm->nested.hsave->control.tsc_offset;
 		svm->nested.hsave->control.tsc_offset = offset;
-	}
+	} else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+					   svm->vmcb->control.tsc_offset,
+					   offset);
 
 	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 
@@ -1044,6 +1047,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 	svm->vmcb->control.tsc_offset += adjustment;
 	if (is_guest_mode(vcpu))
 		svm->nested.hsave->control.tsc_offset += adjustment;
+	else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+				     svm->vmcb->control.tsc_offset - adjustment,
+				     svm->vmcb->control.tsc_offset);
+
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index fe5e00ed7036..545245d7cc63 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -756,6 +756,27 @@ TRACE_EVENT(
 		  __entry->gpa_match ? "GPA" : "GVA")
 );
 
+TRACE_EVENT(kvm_write_tsc_offset,
+	TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+		 __u64 next_tsc_offset),
+	TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+	TP_STRUCT__entry(
+		__field( unsigned int, vcpu_id             )
+		__field( __u64,        previous_tsc_offset )
+		__field( __u64,        next_tsc_offset     )
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id             = vcpu_id;
+		__entry->previous_tsc_offset = previous_tsc_offset;
+		__entry->next_tsc_offset     = next_tsc_offset;
+	),
+
+	TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+		  __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
 #ifdef CONFIG_X86_64
 
 #define host_clocks \
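Logging the previous and next offset is enough to reconstruct what the guest observes, because on this generation of hardware (TSC offsetting, no TSC scaling yet) a guest RDTSC simply returns the host TSC plus the active offset. A one-line model of that relationship, with names invented for illustration:

    #include <stdint.h>

    /* With VMX/SVM TSC offsetting and no scaling, the guest reads
     * host_tsc + tsc_offset; the offset is signed and may be negative. */
    static inline uint64_t observed_guest_tsc(uint64_t host_tsc,
                                              int64_t tsc_offset)
    {
        return host_tsc + (uint64_t)tsc_offset;
    }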
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b30f5a54a2ab..a7e18551c968 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2096,6 +2096,8 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
 			 vmcs12->tsc_offset : 0));
 	} else {
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+					   vmcs_read64(TSC_OFFSET), offset);
 		vmcs_write64(TSC_OFFSET, offset);
 	}
 }
@@ -2103,11 +2105,14 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
+
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
 	if (is_guest_mode(vcpu)) {
 		/* Even when running L2, the adjustment needs to apply to L1 */
 		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
-	}
+	} else
+		trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
+					   offset + adjustment);
 }
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -4176,10 +4181,10 @@ static void ept_set_mmio_spte_mask(void)
 	/*
 	 * EPT Misconfigurations can be generated if the value of bits 2:0
 	 * of an EPT paging-structure entry is 110b (write/execute).
-	 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
 	 * spte.
 	 */
-	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
 /*
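The MMIO tag shrinks from bits 56:49 to just the top two bits, which keeps the middle of the spte free (this series stores the MMIO generation number in spte bits). A standalone self-check of the new mask, assuming nothing beyond the constants in the hunk above:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t mask = (0x3ull << 62) | 0x6ull;

        /* Bits 2:0 are 110b: write/execute without read, an EPT misconfig. */
        assert((mask & 0x7) == 0x6);
        /* Bits 63:62 are the quick "this is an MMIO spte" tag. */
        assert((mask >> 62) == 0x3);
        /* Everything in between is left free for other uses. */
        assert((mask & ~((0x3ull << 62) | 0x7ull)) == 0);
        return 0;
    }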
@@ -5366,10 +5371,14 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
 	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
-	if (likely(ret == 1))
+	if (likely(ret == RET_MMIO_PF_EMULATE))
 		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
 					      EMULATE_DONE;
-	if (unlikely(!ret))
+
+	if (unlikely(ret == RET_MMIO_PF_INVALID))
+		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
+
+	if (unlikely(ret == RET_MMIO_PF_RETRY))
 		return 1;
 
 	/* It is the real ept misconfig */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 292e6ca89f42..d21bce505315 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1193,20 +1193,37 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
+		int faulted = 0;
+
 		/* n.b - signed multiplication and division required */
 		usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
 		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
 		/* do_div() only does unsigned */
-		asm("idivl %2; xor %%edx, %%edx"
-		    : "=A"(usdiff)
-		    : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+		asm("1: idivl %[divisor]\n"
+		    "2: xor %%edx, %%edx\n"
+		    "   movl $0, %[faulted]\n"
+		    "3:\n"
+		    ".section .fixup,\"ax\"\n"
+		    "4: movl $1, %[faulted]\n"
+		    "   jmp  3b\n"
+		    ".previous\n"
+
+		    _ASM_EXTABLE(1b, 4b)
+
+		    : "=A"(usdiff), [faulted] "=r" (faulted)
+		    : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
+
 #endif
 		do_div(elapsed, 1000);
 		usdiff -= elapsed;
 		if (usdiff < 0)
 			usdiff = -usdiff;
+
+		/* idivl overflow => difference is larger than USEC_PER_SEC */
+		if (faulted)
+			usdiff = USEC_PER_SEC;
 	} else
 		usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
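idivl divides the signed 64-bit value in edx:eax by a 32-bit divisor and raises #DE whenever the quotient does not fit in 32 bits; with a large TSC delta the multiply by 1000 easily produces such a quotient, which is why the fixup section above turns the trap into faulted = 1. A userspace sketch of the same condition, checked arithmetically instead of via the exception table:

    #include <stdbool.h>
    #include <stdint.h>

    /* idivl traps (#DE) when the signed 64/32-bit quotient exceeds the
     * 32-bit range; this predicate mirrors that condition in plain C. */
    static bool idivl_would_fault(int64_t dividend, int32_t divisor)
    {
        if (divisor == 0 || (divisor == -1 && dividend == INT64_MIN))
            return true;    /* divide error / quotient overflow */

        int64_t quotient = dividend / divisor;
        return quotient > INT32_MAX || quotient < INT32_MIN;
    }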
@@ -1587,6 +1604,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * The worst case for a remote vcpu to update its kvmclock
+ * is then bounded by maximum nohz sleep latency.
+ */
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+	int i;
+	struct kvm *kvm = v->kvm;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+		kvm_vcpu_kick(vcpu);
+	}
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
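The helper above is the standard KVM request/kick idiom: post a request bit on each target vcpu, then kick it so it drops out of guest mode and services the bit at the top of vcpu_enter_guest. A compact userspace model of the two halves, with C11 atomics standing in for the kernel's set_bit and kvm_check_request (the kick itself would be an IPI or wakeup):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct vcpu {
        _Atomic unsigned long requests;    /* one bit per request type */
    };

    /* kvm_make_request() analogue: post the request. */
    static void make_request(struct vcpu *v, unsigned int req)
    {
        atomic_fetch_or(&v->requests, 1UL << req);
        /* the kernel follows up with kvm_vcpu_kick() (IPI/wakeup) */
    }

    /* kvm_check_request() analogue: consume the request exactly once. */
    static bool check_request(struct vcpu *v, unsigned int req)
    {
        unsigned long bit = 1UL << req;

        return atomic_fetch_and(&v->requests, ~bit) & bit;
    }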
@@ -1984,7 +2025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -2701,7 +2742,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	 * kvmclock on vcpu->cpu migration
 	 */
 	if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 	if (vcpu->cpu != cpu)
 		kvm_migrate_timers(vcpu);
 	vcpu->cpu = cpu;
@@ -5238,7 +5279,13 @@ static void kvm_set_mmio_spte_mask(void)
 	 * Set the reserved bits and the present bit of an paging-structure
 	 * entry to generate page fault with PFER.RSV = 1.
 	 */
-	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	/* Mask the reserved physical address bits. */
+	mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+	/* Bit 62 is always reserved for 32bit host. */
+	mask |= 0x3ull << 62;
+
+	/* Set the present bit. */
 	mask |= 1ull;
 
 #ifdef CONFIG_X86_64
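The fix stops the reserved-bit window at bit 51 (the architectural limit for physical-address bits) instead of 62, then adds bits 63:62 explicitly. A standalone sketch of the resulting mask for one plausible maxphyaddr value (the real code queries CPUID for it):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int maxphyaddr = 46;    /* sample value, illustrative only */

        /* reserved PA bits maxphyaddr..51, as in the fixed kernel code */
        uint64_t mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
        mask |= 0x3ull << 62;            /* bits 63:62 */
        mask |= 1ull;                    /* present bit */

        printf("mmio spte mask: %#llx\n", (unsigned long long)mask);
        return 0;
    }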
@@ -5498,13 +5545,6 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 	char instruction[3];
 	unsigned long rip = kvm_rip_read(vcpu);
 
-	/*
-	 * Blow out the MMU to ensure that no other VCPU has an active mapping
-	 * to ensure that the updated hypercall appears atomically across all
-	 * VCPUs.
-	 */
-	kvm_mmu_zap_all(vcpu->kvm);
-
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
 	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
@@ -5702,6 +5742,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			__kvm_migrate_timers(vcpu);
 		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
 			kvm_gen_update_masterclock(vcpu->kvm);
+		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+			kvm_gen_kvmclock_update(vcpu);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -6812,6 +6854,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		return -EINVAL;
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -7040,22 +7083,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.
 	 */
-	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
-		kvm_mmu_zap_mmio_sptes(kvm);
-		kvm_reload_remote_mmus(kvm);
-	}
+	kvm_mmu_invalidate_mmio_sptes(kvm);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_mmu_zap_all(kvm);
-	kvm_reload_remote_mmus(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
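kvm_mmu_invalidate_zap_all_pages replaces the synchronous zap-plus-remote-reload pair with the same generation trick applied to shadow pages: bump a VM-wide valid generation, treat any page stamped with an older one as obsolete, and reclaim it lazily (the zapped_obsolete_pages list initialized earlier holds pages on their way out). A minimal model, with names patterned on but not guaranteed to match the series:

    #include <stdbool.h>

    struct kvm_arch_model     { unsigned long mmu_valid_gen; };
    struct kvm_mmu_page_model { unsigned long mmu_valid_gen; };

    /* O(1) "zap all": existing shadow pages simply become obsolete. */
    static void invalidate_all_pages(struct kvm_arch_model *arch)
    {
        arch->mmu_valid_gen++;
    }

    /* Obsolete pages are skipped on use and freed in the background. */
    static bool is_obsolete_sp(const struct kvm_arch_model *arch,
                               const struct kvm_mmu_page_model *sp)
    {
        return sp->mmu_valid_gen != arch->mmu_valid_gen;
    }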
@@ -7263,3 +7302,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);