Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/include/asm/idmap.h | 1
-rw-r--r--  arch/arm/include/asm/kvm_host.h | 47
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h | 28
-rw-r--r--  arch/arm/kernel/asm-offsets.c | 2
-rw-r--r--  arch/arm/kernel/vmlinux.lds.S | 7
-rw-r--r--  arch/arm/kvm/Kconfig | 6
-rw-r--r--  arch/arm/kvm/Makefile | 2
-rw-r--r--  arch/arm/kvm/arch_timer.c | 7
-rw-r--r--  arch/arm/kvm/arm.c | 129
-rw-r--r--  arch/arm/kvm/init.S | 78
-rw-r--r--  arch/arm/kvm/mmu.c | 455
-rw-r--r--  arch/arm/kvm/perf.c | 68
-rw-r--r--  arch/arm/mm/idmap.c | 32
-rw-r--r--  arch/ia64/include/asm/kvm_host.h | 1
-rw-r--r--  arch/ia64/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/ia64/kvm/Kconfig | 14
-rw-r--r--  arch/ia64/kvm/Makefile | 6
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 35
-rw-r--r--  arch/ia64/kvm/lapic.h | 6
-rw-r--r--  arch/powerpc/include/asm/hvcall.h | 3
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h | 13
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 8
-rw-r--r--  arch/powerpc/include/asm/kvm_booke.h | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 41
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 114
-rw-r--r--  arch/powerpc/include/asm/reg.h | 1
-rw-r--r--  arch/powerpc/include/uapi/asm/kvm.h | 94
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 4
-rw-r--r--  arch/powerpc/kvm/44x.c | 12
-rw-r--r--  arch/powerpc/kvm/Kconfig | 26
-rw-r--r--  arch/powerpc/kvm/Makefile | 12
-rw-r--r--  arch/powerpc/kvm/book3s.c | 36
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c | 120
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 92
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 11
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c | 406
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 228
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 7
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c | 21
-rw-r--r--  arch/powerpc/kvm/book3s_rtas.c | 274
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c | 1270
-rw-r--r--  arch/powerpc/kvm/book3s_xics.h | 130
-rw-r--r--  arch/powerpc/kvm/booke.c | 158
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S | 42
-rw-r--r--  arch/powerpc/kvm/e500.c | 14
-rw-r--r--  arch/powerpc/kvm/e500.h | 22
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c | 19
-rw-r--r--  arch/powerpc/kvm/e500_mmu.c | 192
-rw-r--r--  arch/powerpc/kvm/e500mc.c | 16
-rw-r--r--  arch/powerpc/kvm/emulate.c | 2
-rw-r--r--  arch/powerpc/kvm/irq.h | 20
-rw-r--r--  arch/powerpc/kvm/mpic.c | 1853
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 133
-rw-r--r--  arch/powerpc/sysdev/xics/icp-native.c | 8
-rw-r--r--  arch/s390/include/uapi/asm/Kbuild | 1
-rw-r--r--  arch/s390/include/uapi/asm/virtio-ccw.h | 21
-rw-r--r--  arch/s390/kvm/Kconfig | 1
-rw-r--r--  arch/s390/kvm/Makefile | 2
-rw-r--r--  arch/s390/kvm/diag.c | 26
-rw-r--r--  arch/s390/kvm/gaccess.h | 429
-rw-r--r--  arch/s390/kvm/intercept.c | 18
-rw-r--r--  arch/s390/kvm/interrupt.c | 245
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 43
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 12
-rw-r--r--  arch/s390/kvm/priv.c | 270
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 4
-rw-r--r--  arch/x86/include/asm/hardirq.h | 3
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 1
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 26
-rw-r--r--  arch/x86/include/asm/vmx.h | 18
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h | 5
-rw-r--r--  arch/x86/kernel/entry_64.S | 5
-rw-r--r--  arch/x86/kernel/irq.c | 22
-rw-r--r--  arch/x86/kernel/irqinit.c | 4
-rw-r--r--  arch/x86/kernel/kvmclock.c | 9
-rw-r--r--  arch/x86/kvm/Kconfig | 14
-rw-r--r--  arch/x86/kvm/Makefile | 5
-rw-r--r--  arch/x86/kvm/emulate.c | 31
-rw-r--r--  arch/x86/kvm/i8254.c | 4
-rw-r--r--  arch/x86/kvm/lapic.c | 189
-rw-r--r--  arch/x86/kvm/lapic.h | 22
-rw-r--r--  arch/x86/kvm/mmu.c | 108
-rw-r--r--  arch/x86/kvm/mmu.h | 11
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 2
-rw-r--r--  arch/x86/kvm/pmu.c | 14
-rw-r--r--  arch/x86/kvm/svm.c | 40
-rw-r--r--  arch/x86/kvm/vmx.c | 1077
-rw-r--r--  arch/x86/kvm/x86.c | 243
93 files changed, 7424 insertions(+), 1849 deletions(-)
diff --git a/arch/arm/include/asm/idmap.h b/arch/arm/include/asm/idmap.h
index 1a66f907e5cc..bf863edb517d 100644
--- a/arch/arm/include/asm/idmap.h
+++ b/arch/arm/include/asm/idmap.h
@@ -8,7 +8,6 @@
8#define __idmap __section(.idmap.text) noinline notrace 8#define __idmap __section(.idmap.text) noinline notrace
9 9
10extern pgd_t *idmap_pgd; 10extern pgd_t *idmap_pgd;
11extern pgd_t *hyp_pgd;
12 11
13void setup_mm_for_reboot(void); 12void setup_mm_for_reboot(void);
14 13
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 0c4e643d939e..57cb786a6203 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -87,7 +87,7 @@ struct kvm_vcpu_fault_info {
87 u32 hyp_pc; /* PC when exception was taken from Hyp mode */ 87 u32 hyp_pc; /* PC when exception was taken from Hyp mode */
88}; 88};
89 89
90typedef struct vfp_hard_struct kvm_kernel_vfp_t; 90typedef struct vfp_hard_struct kvm_cpu_context_t;
91 91
92struct kvm_vcpu_arch { 92struct kvm_vcpu_arch {
93 struct kvm_regs regs; 93 struct kvm_regs regs;
@@ -105,8 +105,10 @@ struct kvm_vcpu_arch {
105 struct kvm_vcpu_fault_info fault; 105 struct kvm_vcpu_fault_info fault;
106 106
107 /* Floating point registers (VFP and Advanced SIMD/NEON) */ 107 /* Floating point registers (VFP and Advanced SIMD/NEON) */
108 kvm_kernel_vfp_t vfp_guest; 108 struct vfp_hard_struct vfp_guest;
109 kvm_kernel_vfp_t *vfp_host; 109
110 /* Host FP context */
111 kvm_cpu_context_t *host_cpu_context;
110 112
111 /* VGIC state */ 113 /* VGIC state */
112 struct vgic_cpu vgic_cpu; 114 struct vgic_cpu vgic_cpu;
@@ -188,23 +190,38 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
188int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, 190int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
189 int exception_index); 191 int exception_index);
190 192
191static inline void __cpu_init_hyp_mode(unsigned long long pgd_ptr, 193static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr,
194 unsigned long long pgd_ptr,
192 unsigned long hyp_stack_ptr, 195 unsigned long hyp_stack_ptr,
193 unsigned long vector_ptr) 196 unsigned long vector_ptr)
194{ 197{
195 unsigned long pgd_low, pgd_high;
196
197 pgd_low = (pgd_ptr & ((1ULL << 32) - 1));
198 pgd_high = (pgd_ptr >> 32ULL);
199
200 /* 198 /*
201 * Call initialization code, and switch to the full blown 199 * Call initialization code, and switch to the full blown HYP
202 * HYP code. The init code doesn't need to preserve these registers as 200 * code. The init code doesn't need to preserve these
203 * r1-r3 and r12 are already callee save according to the AAPCS. 201 * registers as r0-r3 are already callee saved according to
204 * Note that we slightly misuse the prototype by casing the pgd_low to 202 * the AAPCS.
205 * a void *. 203 * Note that we slightly misuse the prototype by casing the
204 * stack pointer to a void *.
205 *
206 * We don't have enough registers to perform the full init in
207 * one go. Install the boot PGD first, and then install the
208 * runtime PGD, stack pointer and vectors. The PGDs are always
209 * passed as the third argument, in order to be passed into
210 * r2-r3 to the init code (yes, this is compliant with the
211 * PCS!).
206 */ 212 */
207 kvm_call_hyp((void *)pgd_low, pgd_high, hyp_stack_ptr, vector_ptr); 213
214 kvm_call_hyp(NULL, 0, boot_pgd_ptr);
215
216 kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
208} 217}
209 218
219static inline int kvm_arch_dev_ioctl_check_extension(long ext)
220{
221 return 0;
222}
223
224int kvm_perf_init(void);
225int kvm_perf_teardown(void);
226
210#endif /* __ARM_KVM_HOST_H__ */ 227#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 970f3b5fa109..472ac7091003 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -19,21 +19,33 @@
19#ifndef __ARM_KVM_MMU_H__ 19#ifndef __ARM_KVM_MMU_H__
20#define __ARM_KVM_MMU_H__ 20#define __ARM_KVM_MMU_H__
21 21
22#include <asm/cacheflush.h> 22#include <asm/memory.h>
23#include <asm/pgalloc.h> 23#include <asm/page.h>
24#include <asm/idmap.h>
25 24
26/* 25/*
27 * We directly use the kernel VA for the HYP, as we can directly share 26 * We directly use the kernel VA for the HYP, as we can directly share
28 * the mapping (HTTBR "covers" TTBR1). 27 * the mapping (HTTBR "covers" TTBR1).
29 */ 28 */
30#define HYP_PAGE_OFFSET_MASK (~0UL) 29#define HYP_PAGE_OFFSET_MASK UL(~0)
31#define HYP_PAGE_OFFSET PAGE_OFFSET 30#define HYP_PAGE_OFFSET PAGE_OFFSET
32#define KERN_TO_HYP(kva) (kva) 31#define KERN_TO_HYP(kva) (kva)
33 32
33/*
34 * Our virtual mapping for the boot-time MMU-enable code. Must be
35 * shared across all the page-tables. Conveniently, we use the vectors
36 * page, where no kernel data will ever be shared with HYP.
37 */
38#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE)
39
40#ifndef __ASSEMBLY__
41
42#include <asm/cacheflush.h>
43#include <asm/pgalloc.h>
44
34int create_hyp_mappings(void *from, void *to); 45int create_hyp_mappings(void *from, void *to);
35int create_hyp_io_mappings(void *from, void *to, phys_addr_t); 46int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
36void free_hyp_pmds(void); 47void free_boot_hyp_pgd(void);
48void free_hyp_pgds(void);
37 49
38int kvm_alloc_stage2_pgd(struct kvm *kvm); 50int kvm_alloc_stage2_pgd(struct kvm *kvm);
39void kvm_free_stage2_pgd(struct kvm *kvm); 51void kvm_free_stage2_pgd(struct kvm *kvm);
@@ -45,6 +57,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
45void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); 57void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
46 58
47phys_addr_t kvm_mmu_get_httbr(void); 59phys_addr_t kvm_mmu_get_httbr(void);
60phys_addr_t kvm_mmu_get_boot_httbr(void);
61phys_addr_t kvm_get_idmap_vector(void);
48int kvm_mmu_init(void); 62int kvm_mmu_init(void);
49void kvm_clear_hyp_idmap(void); 63void kvm_clear_hyp_idmap(void);
50 64
@@ -114,4 +128,8 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
114 } 128 }
115} 129}
116 130
131#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l))
132
133#endif /* !__ASSEMBLY__ */
134
117#endif /* __ARM_KVM_MMU_H__ */ 135#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index a53efa993690..ee68cce6b48e 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -158,7 +158,7 @@ int main(void)
158 DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr)); 158 DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr));
159 DEFINE(VCPU_CP15, offsetof(struct kvm_vcpu, arch.cp15)); 159 DEFINE(VCPU_CP15, offsetof(struct kvm_vcpu, arch.cp15));
160 DEFINE(VCPU_VFP_GUEST, offsetof(struct kvm_vcpu, arch.vfp_guest)); 160 DEFINE(VCPU_VFP_GUEST, offsetof(struct kvm_vcpu, arch.vfp_guest));
161 DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.vfp_host)); 161 DEFINE(VCPU_VFP_HOST, offsetof(struct kvm_vcpu, arch.host_cpu_context));
162 DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs)); 162 DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs));
163 DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs)); 163 DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs));
164 DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs)); 164 DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs));
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index b571484e9f03..a871b8e00fca 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -20,7 +20,7 @@
20 VMLINUX_SYMBOL(__idmap_text_start) = .; \ 20 VMLINUX_SYMBOL(__idmap_text_start) = .; \
21 *(.idmap.text) \ 21 *(.idmap.text) \
22 VMLINUX_SYMBOL(__idmap_text_end) = .; \ 22 VMLINUX_SYMBOL(__idmap_text_end) = .; \
23 ALIGN_FUNCTION(); \ 23 . = ALIGN(32); \
24 VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \ 24 VMLINUX_SYMBOL(__hyp_idmap_text_start) = .; \
25 *(.hyp.idmap.text) \ 25 *(.hyp.idmap.text) \
26 VMLINUX_SYMBOL(__hyp_idmap_text_end) = .; 26 VMLINUX_SYMBOL(__hyp_idmap_text_end) = .;
@@ -315,3 +315,8 @@ SECTIONS
315 */ 315 */
316ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support") 316ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support")
317ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") 317ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined")
318/*
319 * The HYP init code can't be more than a page long.
320 * The above comment applies as well.
321 */
322ASSERT(((__hyp_idmap_text_end - __hyp_idmap_text_start) <= PAGE_SIZE), "HYP init code too big")
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 49dd64e579c2..370e1a8af6ac 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -41,9 +41,9 @@ config KVM_ARM_HOST
41 Provides host support for ARM processors. 41 Provides host support for ARM processors.
42 42
43config KVM_ARM_MAX_VCPUS 43config KVM_ARM_MAX_VCPUS
44 int "Number maximum supported virtual CPUs per VM" 44 int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
45 depends on KVM_ARM_HOST 45 default 4 if KVM_ARM_HOST
46 default 4 46 default 0
47 help 47 help
48 Static number of max supported virtual CPUs per VM. 48 Static number of max supported virtual CPUs per VM.
49 49
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 8dc5e76cb789..53c5ed83d16f 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
18 18
19obj-y += kvm-arm.o init.o interrupts.o 19obj-y += kvm-arm.o init.o interrupts.o
20obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o 20obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
21obj-y += coproc.o coproc_a15.o mmio.o psci.o 21obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
22obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o 22obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
23obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o 23obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
diff --git a/arch/arm/kvm/arch_timer.c b/arch/arm/kvm/arch_timer.c
index 6ac938d46297..c55b6089e923 100644
--- a/arch/arm/kvm/arch_timer.c
+++ b/arch/arm/kvm/arch_timer.c
@@ -22,6 +22,7 @@
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <linux/interrupt.h> 23#include <linux/interrupt.h>
24 24
25#include <clocksource/arm_arch_timer.h>
25#include <asm/arch_timer.h> 26#include <asm/arch_timer.h>
26 27
27#include <asm/kvm_vgic.h> 28#include <asm/kvm_vgic.h>
@@ -64,7 +65,7 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
64{ 65{
65 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; 66 struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
66 67
67 timer->cntv_ctl |= 1 << 1; /* Mask the interrupt in the guest */ 68 timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
68 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, 69 kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
69 vcpu->arch.timer_cpu.irq->irq, 70 vcpu->arch.timer_cpu.irq->irq,
70 vcpu->arch.timer_cpu.irq->level); 71 vcpu->arch.timer_cpu.irq->level);
@@ -133,8 +134,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
133 cycle_t cval, now; 134 cycle_t cval, now;
134 u64 ns; 135 u64 ns;
135 136
136 /* Check if the timer is enabled and unmasked first */ 137 if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
137 if ((timer->cntv_ctl & 3) != 1) 138 !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
138 return; 139 return;
139 140
140 cval = timer->cntv_cval; 141 cval = timer->cntv_cval;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a0dfc2a53f91..37d216d814cd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -16,6 +16,7 @@
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */ 17 */
18 18
19#include <linux/cpu.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/err.h> 21#include <linux/err.h>
21#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
@@ -48,7 +49,7 @@ __asm__(".arch_extension virt");
48#endif 49#endif
49 50
50static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); 51static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
51static kvm_kernel_vfp_t __percpu *kvm_host_vfp_state; 52static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
52static unsigned long hyp_default_vectors; 53static unsigned long hyp_default_vectors;
53 54
54/* Per-CPU variable containing the currently running vcpu. */ 55/* Per-CPU variable containing the currently running vcpu. */
@@ -206,7 +207,7 @@ int kvm_dev_ioctl_check_extension(long ext)
206 r = KVM_MAX_VCPUS; 207 r = KVM_MAX_VCPUS;
207 break; 208 break;
208 default: 209 default:
209 r = 0; 210 r = kvm_arch_dev_ioctl_check_extension(ext);
210 break; 211 break;
211 } 212 }
212 return r; 213 return r;
@@ -218,27 +219,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
218 return -EINVAL; 219 return -EINVAL;
219} 220}
220 221
221int kvm_arch_set_memory_region(struct kvm *kvm,
222 struct kvm_userspace_memory_region *mem,
223 struct kvm_memory_slot old,
224 int user_alloc)
225{
226 return 0;
227}
228
229int kvm_arch_prepare_memory_region(struct kvm *kvm, 222int kvm_arch_prepare_memory_region(struct kvm *kvm,
230 struct kvm_memory_slot *memslot, 223 struct kvm_memory_slot *memslot,
231 struct kvm_memory_slot old,
232 struct kvm_userspace_memory_region *mem, 224 struct kvm_userspace_memory_region *mem,
233 bool user_alloc) 225 enum kvm_mr_change change)
234{ 226{
235 return 0; 227 return 0;
236} 228}
237 229
238void kvm_arch_commit_memory_region(struct kvm *kvm, 230void kvm_arch_commit_memory_region(struct kvm *kvm,
239 struct kvm_userspace_memory_region *mem, 231 struct kvm_userspace_memory_region *mem,
240 struct kvm_memory_slot old, 232 const struct kvm_memory_slot *old,
241 bool user_alloc) 233 enum kvm_mr_change change)
242{ 234{
243} 235}
244 236
@@ -326,7 +318,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
326void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 318void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
327{ 319{
328 vcpu->cpu = cpu; 320 vcpu->cpu = cpu;
329 vcpu->arch.vfp_host = this_cpu_ptr(kvm_host_vfp_state); 321 vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
330 322
331 /* 323 /*
332 * Check whether this vcpu requires the cache to be flushed on 324 * Check whether this vcpu requires the cache to be flushed on
@@ -639,7 +631,8 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
639 return 0; 631 return 0;
640} 632}
641 633
642int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level) 634int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
635 bool line_status)
643{ 636{
644 u32 irq = irq_level->irq; 637 u32 irq = irq_level->irq;
645 unsigned int irq_type, vcpu_idx, irq_num; 638 unsigned int irq_type, vcpu_idx, irq_num;
@@ -794,30 +787,48 @@ long kvm_arch_vm_ioctl(struct file *filp,
794 } 787 }
795} 788}
796 789
797static void cpu_init_hyp_mode(void *vector) 790static void cpu_init_hyp_mode(void *dummy)
798{ 791{
792 unsigned long long boot_pgd_ptr;
799 unsigned long long pgd_ptr; 793 unsigned long long pgd_ptr;
800 unsigned long hyp_stack_ptr; 794 unsigned long hyp_stack_ptr;
801 unsigned long stack_page; 795 unsigned long stack_page;
802 unsigned long vector_ptr; 796 unsigned long vector_ptr;
803 797
804 /* Switch from the HYP stub to our own HYP init vector */ 798 /* Switch from the HYP stub to our own HYP init vector */
805 __hyp_set_vectors((unsigned long)vector); 799 __hyp_set_vectors(kvm_get_idmap_vector());
806 800
801 boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
807 pgd_ptr = (unsigned long long)kvm_mmu_get_httbr(); 802 pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
808 stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); 803 stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
809 hyp_stack_ptr = stack_page + PAGE_SIZE; 804 hyp_stack_ptr = stack_page + PAGE_SIZE;
810 vector_ptr = (unsigned long)__kvm_hyp_vector; 805 vector_ptr = (unsigned long)__kvm_hyp_vector;
811 806
812 __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); 807 __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
808}
809
810static int hyp_init_cpu_notify(struct notifier_block *self,
811 unsigned long action, void *cpu)
812{
813 switch (action) {
814 case CPU_STARTING:
815 case CPU_STARTING_FROZEN:
816 cpu_init_hyp_mode(NULL);
817 break;
818 }
819
820 return NOTIFY_OK;
813} 821}
814 822
823static struct notifier_block hyp_init_cpu_nb = {
824 .notifier_call = hyp_init_cpu_notify,
825};
826
815/** 827/**
816 * Inits Hyp-mode on all online CPUs 828 * Inits Hyp-mode on all online CPUs
817 */ 829 */
818static int init_hyp_mode(void) 830static int init_hyp_mode(void)
819{ 831{
820 phys_addr_t init_phys_addr;
821 int cpu; 832 int cpu;
822 int err = 0; 833 int err = 0;
823 834
@@ -850,24 +861,6 @@ static int init_hyp_mode(void)
850 } 861 }
851 862
852 /* 863 /*
853 * Execute the init code on each CPU.
854 *
855 * Note: The stack is not mapped yet, so don't do anything else than
856 * initializing the hypervisor mode on each CPU using a local stack
857 * space for temporary storage.
858 */
859 init_phys_addr = virt_to_phys(__kvm_hyp_init);
860 for_each_online_cpu(cpu) {
861 smp_call_function_single(cpu, cpu_init_hyp_mode,
862 (void *)(long)init_phys_addr, 1);
863 }
864
865 /*
866 * Unmap the identity mapping
867 */
868 kvm_clear_hyp_idmap();
869
870 /*
871 * Map the Hyp-code called directly from the host 864 * Map the Hyp-code called directly from the host
872 */ 865 */
873 err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); 866 err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
@@ -890,33 +883,38 @@ static int init_hyp_mode(void)
890 } 883 }
891 884
892 /* 885 /*
893 * Map the host VFP structures 886 * Map the host CPU structures
894 */ 887 */
895 kvm_host_vfp_state = alloc_percpu(kvm_kernel_vfp_t); 888 kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
896 if (!kvm_host_vfp_state) { 889 if (!kvm_host_cpu_state) {
897 err = -ENOMEM; 890 err = -ENOMEM;
898 kvm_err("Cannot allocate host VFP state\n"); 891 kvm_err("Cannot allocate host CPU state\n");
899 goto out_free_mappings; 892 goto out_free_mappings;
900 } 893 }
901 894
902 for_each_possible_cpu(cpu) { 895 for_each_possible_cpu(cpu) {
903 kvm_kernel_vfp_t *vfp; 896 kvm_cpu_context_t *cpu_ctxt;
904 897
905 vfp = per_cpu_ptr(kvm_host_vfp_state, cpu); 898 cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
906 err = create_hyp_mappings(vfp, vfp + 1); 899 err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
907 900
908 if (err) { 901 if (err) {
909 kvm_err("Cannot map host VFP state: %d\n", err); 902 kvm_err("Cannot map host CPU state: %d\n", err);
910 goto out_free_vfp; 903 goto out_free_context;
911 } 904 }
912 } 905 }
913 906
914 /* 907 /*
908 * Execute the init code on each CPU.
909 */
910 on_each_cpu(cpu_init_hyp_mode, NULL, 1);
911
912 /*
915 * Init HYP view of VGIC 913 * Init HYP view of VGIC
916 */ 914 */
917 err = kvm_vgic_hyp_init(); 915 err = kvm_vgic_hyp_init();
918 if (err) 916 if (err)
919 goto out_free_vfp; 917 goto out_free_context;
920 918
921#ifdef CONFIG_KVM_ARM_VGIC 919#ifdef CONFIG_KVM_ARM_VGIC
922 vgic_present = true; 920 vgic_present = true;
@@ -929,12 +927,19 @@ static int init_hyp_mode(void)
929 if (err) 927 if (err)
930 goto out_free_mappings; 928 goto out_free_mappings;
931 929
930#ifndef CONFIG_HOTPLUG_CPU
931 free_boot_hyp_pgd();
932#endif
933
934 kvm_perf_init();
935
932 kvm_info("Hyp mode initialized successfully\n"); 936 kvm_info("Hyp mode initialized successfully\n");
937
933 return 0; 938 return 0;
934out_free_vfp: 939out_free_context:
935 free_percpu(kvm_host_vfp_state); 940 free_percpu(kvm_host_cpu_state);
936out_free_mappings: 941out_free_mappings:
937 free_hyp_pmds(); 942 free_hyp_pgds();
938out_free_stack_pages: 943out_free_stack_pages:
939 for_each_possible_cpu(cpu) 944 for_each_possible_cpu(cpu)
940 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); 945 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
@@ -943,27 +948,42 @@ out_err:
943 return err; 948 return err;
944} 949}
945 950
951static void check_kvm_target_cpu(void *ret)
952{
953 *(int *)ret = kvm_target_cpu();
954}
955
946/** 956/**
947 * Initialize Hyp-mode and memory mappings on all CPUs. 957 * Initialize Hyp-mode and memory mappings on all CPUs.
948 */ 958 */
949int kvm_arch_init(void *opaque) 959int kvm_arch_init(void *opaque)
950{ 960{
951 int err; 961 int err;
962 int ret, cpu;
952 963
953 if (!is_hyp_mode_available()) { 964 if (!is_hyp_mode_available()) {
954 kvm_err("HYP mode not available\n"); 965 kvm_err("HYP mode not available\n");
955 return -ENODEV; 966 return -ENODEV;
956 } 967 }
957 968
958 if (kvm_target_cpu() < 0) { 969 for_each_online_cpu(cpu) {
959 kvm_err("Target CPU not supported!\n"); 970 smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
960 return -ENODEV; 971 if (ret < 0) {
972 kvm_err("Error, CPU %d not supported!\n", cpu);
973 return -ENODEV;
974 }
961 } 975 }
962 976
963 err = init_hyp_mode(); 977 err = init_hyp_mode();
964 if (err) 978 if (err)
965 goto out_err; 979 goto out_err;
966 980
981 err = register_cpu_notifier(&hyp_init_cpu_nb);
982 if (err) {
983 kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
984 goto out_err;
985 }
986
967 kvm_coproc_table_init(); 987 kvm_coproc_table_init();
968 return 0; 988 return 0;
969out_err: 989out_err:
@@ -973,6 +993,7 @@ out_err:
973/* NOP: Compiling as a module not supported */ 993/* NOP: Compiling as a module not supported */
974void kvm_arch_exit(void) 994void kvm_arch_exit(void)
975{ 995{
996 kvm_perf_teardown();
976} 997}
977 998
978static int arm_init(void) 999static int arm_init(void)
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 9f37a79b880b..f048338135f7 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -21,13 +21,33 @@
21#include <asm/asm-offsets.h> 21#include <asm/asm-offsets.h>
22#include <asm/kvm_asm.h> 22#include <asm/kvm_asm.h>
23#include <asm/kvm_arm.h> 23#include <asm/kvm_arm.h>
24#include <asm/kvm_mmu.h>
24 25
25/******************************************************************** 26/********************************************************************
26 * Hypervisor initialization 27 * Hypervisor initialization
27 * - should be called with: 28 * - should be called with:
28 * r0,r1 = Hypervisor pgd pointer 29 * r0 = top of Hyp stack (kernel VA)
29 * r2 = top of Hyp stack (kernel VA) 30 * r1 = pointer to hyp vectors
30 * r3 = pointer to hyp vectors 31 * r2,r3 = Hypervisor pgd pointer
32 *
33 * The init scenario is:
34 * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
35 * runtime stack, runtime vectors
36 * - Enable the MMU with the boot pgd
37 * - Jump to a target into the trampoline page (remember, this is the same
38 * physical page!)
39 * - Now switch to the runtime pgd (same VA, and still the same physical
40 * page!)
41 * - Invalidate TLBs
42 * - Set stack and vectors
43 * - Profit! (or eret, if you only care about the code).
44 *
45 * As we only have four registers available to pass parameters (and we
46 * need six), we split the init in two phases:
47 * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
48 * Provides the basic HYP init, and enable the MMU.
49 * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
50 * Switches to the runtime PGD, set stack and vectors.
31 */ 51 */
32 52
33 .text 53 .text
@@ -47,22 +67,25 @@ __kvm_hyp_init:
47 W(b) . 67 W(b) .
48 68
49__do_hyp_init: 69__do_hyp_init:
70 cmp r0, #0 @ We have a SP?
71 bne phase2 @ Yes, second stage init
72
50 @ Set the HTTBR to point to the hypervisor PGD pointer passed 73 @ Set the HTTBR to point to the hypervisor PGD pointer passed
51 mcrr p15, 4, r0, r1, c2 74 mcrr p15, 4, r2, r3, c2
52 75
53 @ Set the HTCR and VTCR to the same shareability and cacheability 76 @ Set the HTCR and VTCR to the same shareability and cacheability
54 @ settings as the non-secure TTBCR and with T0SZ == 0. 77 @ settings as the non-secure TTBCR and with T0SZ == 0.
55 mrc p15, 4, r0, c2, c0, 2 @ HTCR 78 mrc p15, 4, r0, c2, c0, 2 @ HTCR
56 ldr r12, =HTCR_MASK 79 ldr r2, =HTCR_MASK
57 bic r0, r0, r12 80 bic r0, r0, r2
58 mrc p15, 0, r1, c2, c0, 2 @ TTBCR 81 mrc p15, 0, r1, c2, c0, 2 @ TTBCR
59 and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ) 82 and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ)
60 orr r0, r0, r1 83 orr r0, r0, r1
61 mcr p15, 4, r0, c2, c0, 2 @ HTCR 84 mcr p15, 4, r0, c2, c0, 2 @ HTCR
62 85
63 mrc p15, 4, r1, c2, c1, 2 @ VTCR 86 mrc p15, 4, r1, c2, c1, 2 @ VTCR
64 ldr r12, =VTCR_MASK 87 ldr r2, =VTCR_MASK
65 bic r1, r1, r12 88 bic r1, r1, r2
66 bic r0, r0, #(~VTCR_HTCR_SH) @ clear non-reusable HTCR bits 89 bic r0, r0, #(~VTCR_HTCR_SH) @ clear non-reusable HTCR bits
67 orr r1, r0, r1 90 orr r1, r0, r1
68 orr r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S) 91 orr r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S)
@@ -85,24 +108,41 @@ __do_hyp_init:
85 @ - Memory alignment checks: enabled 108 @ - Memory alignment checks: enabled
86 @ - MMU: enabled (this code must be run from an identity mapping) 109 @ - MMU: enabled (this code must be run from an identity mapping)
87 mrc p15, 4, r0, c1, c0, 0 @ HSCR 110 mrc p15, 4, r0, c1, c0, 0 @ HSCR
88 ldr r12, =HSCTLR_MASK 111 ldr r2, =HSCTLR_MASK
89 bic r0, r0, r12 112 bic r0, r0, r2
90 mrc p15, 0, r1, c1, c0, 0 @ SCTLR 113 mrc p15, 0, r1, c1, c0, 0 @ SCTLR
91 ldr r12, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) 114 ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
92 and r1, r1, r12 115 and r1, r1, r2
93 ARM( ldr r12, =(HSCTLR_M | HSCTLR_A) ) 116 ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) )
94 THUMB( ldr r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) 117 THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) )
95 orr r1, r1, r12 118 orr r1, r1, r2
96 orr r0, r0, r1 119 orr r0, r0, r1
97 isb 120 isb
98 mcr p15, 4, r0, c1, c0, 0 @ HSCR 121 mcr p15, 4, r0, c1, c0, 0 @ HSCR
99 isb
100 122
101 @ Set stack pointer and return to the kernel 123 @ End of init phase-1
102 mov sp, r2 124 eret
125
126phase2:
127 @ Set stack pointer
128 mov sp, r0
103 129
104 @ Set HVBAR to point to the HYP vectors 130 @ Set HVBAR to point to the HYP vectors
105 mcr p15, 4, r3, c12, c0, 0 @ HVBAR 131 mcr p15, 4, r1, c12, c0, 0 @ HVBAR
132
133 @ Jump to the trampoline page
134 ldr r0, =TRAMPOLINE_VA
135 adr r1, target
136 bfi r0, r1, #0, #PAGE_SHIFT
137 mov pc, r0
138
139target: @ We're now in the trampoline code, switch page tables
140 mcrr p15, 4, r2, r3, c2
141 isb
142
143 @ Invalidate the old TLBs
144 mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH
145 dsb
106 146
107 eret 147 eret
108 148
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 2f12e4056408..965706578f13 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,8 +32,15 @@
32 32
33extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; 33extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
34 34
35static pgd_t *boot_hyp_pgd;
36static pgd_t *hyp_pgd;
35static DEFINE_MUTEX(kvm_hyp_pgd_mutex); 37static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
36 38
39static void *init_bounce_page;
40static unsigned long hyp_idmap_start;
41static unsigned long hyp_idmap_end;
42static phys_addr_t hyp_idmap_vector;
43
37static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) 44static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
38{ 45{
39 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa); 46 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
@@ -71,172 +78,224 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
71 return p; 78 return p;
72} 79}
73 80
74static void free_ptes(pmd_t *pmd, unsigned long addr) 81static void clear_pud_entry(pud_t *pud)
75{ 82{
76 pte_t *pte; 83 pmd_t *pmd_table = pmd_offset(pud, 0);
77 unsigned int i; 84 pud_clear(pud);
85 pmd_free(NULL, pmd_table);
86 put_page(virt_to_page(pud));
87}
78 88
79 for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) { 89static void clear_pmd_entry(pmd_t *pmd)
80 if (!pmd_none(*pmd) && pmd_table(*pmd)) { 90{
81 pte = pte_offset_kernel(pmd, addr); 91 pte_t *pte_table = pte_offset_kernel(pmd, 0);
82 pte_free_kernel(NULL, pte); 92 pmd_clear(pmd);
83 } 93 pte_free_kernel(NULL, pte_table);
84 pmd++; 94 put_page(virt_to_page(pmd));
95}
96
97static bool pmd_empty(pmd_t *pmd)
98{
99 struct page *pmd_page = virt_to_page(pmd);
100 return page_count(pmd_page) == 1;
101}
102
103static void clear_pte_entry(pte_t *pte)
104{
105 if (pte_present(*pte)) {
106 kvm_set_pte(pte, __pte(0));
107 put_page(virt_to_page(pte));
85 } 108 }
86} 109}
87 110
88static void free_hyp_pgd_entry(unsigned long addr) 111static bool pte_empty(pte_t *pte)
112{
113 struct page *pte_page = virt_to_page(pte);
114 return page_count(pte_page) == 1;
115}
116
117static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size)
89{ 118{
90 pgd_t *pgd; 119 pgd_t *pgd;
91 pud_t *pud; 120 pud_t *pud;
92 pmd_t *pmd; 121 pmd_t *pmd;
93 unsigned long hyp_addr = KERN_TO_HYP(addr); 122 pte_t *pte;
123 unsigned long long addr = start, end = start + size;
124 u64 range;
125
126 while (addr < end) {
127 pgd = pgdp + pgd_index(addr);
128 pud = pud_offset(pgd, addr);
129 if (pud_none(*pud)) {
130 addr += PUD_SIZE;
131 continue;
132 }
94 133
95 pgd = hyp_pgd + pgd_index(hyp_addr); 134 pmd = pmd_offset(pud, addr);
96 pud = pud_offset(pgd, hyp_addr); 135 if (pmd_none(*pmd)) {
136 addr += PMD_SIZE;
137 continue;
138 }
97 139
98 if (pud_none(*pud)) 140 pte = pte_offset_kernel(pmd, addr);
99 return; 141 clear_pte_entry(pte);
100 BUG_ON(pud_bad(*pud)); 142 range = PAGE_SIZE;
101 143
102 pmd = pmd_offset(pud, hyp_addr); 144 /* If we emptied the pte, walk back up the ladder */
103 free_ptes(pmd, addr); 145 if (pte_empty(pte)) {
104 pmd_free(NULL, pmd); 146 clear_pmd_entry(pmd);
105 pud_clear(pud); 147 range = PMD_SIZE;
148 if (pmd_empty(pmd)) {
149 clear_pud_entry(pud);
150 range = PUD_SIZE;
151 }
152 }
153
154 addr += range;
155 }
106} 156}
107 157
108/** 158/**
109 * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables 159 * free_boot_hyp_pgd - free HYP boot page tables
110 * 160 *
111 * Assumes this is a page table used strictly in Hyp-mode and therefore contains 161 * Free the HYP boot page tables. The bounce page is also freed.
112 * either mappings in the kernel memory area (above PAGE_OFFSET), or
113 * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END).
114 */ 162 */
115void free_hyp_pmds(void) 163void free_boot_hyp_pgd(void)
116{ 164{
117 unsigned long addr;
118
119 mutex_lock(&kvm_hyp_pgd_mutex); 165 mutex_lock(&kvm_hyp_pgd_mutex);
120 for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) 166
121 free_hyp_pgd_entry(addr); 167 if (boot_hyp_pgd) {
122 for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) 168 unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
123 free_hyp_pgd_entry(addr); 169 unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
170 kfree(boot_hyp_pgd);
171 boot_hyp_pgd = NULL;
172 }
173
174 if (hyp_pgd)
175 unmap_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
176
177 kfree(init_bounce_page);
178 init_bounce_page = NULL;
179
124 mutex_unlock(&kvm_hyp_pgd_mutex); 180 mutex_unlock(&kvm_hyp_pgd_mutex);
125} 181}
126 182
127static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, 183/**
128 unsigned long end) 184 * free_hyp_pgds - free Hyp-mode page tables
185 *
186 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
187 * therefore contains either mappings in the kernel memory area (above
188 * PAGE_OFFSET), or device mappings in the vmalloc range (from
189 * VMALLOC_START to VMALLOC_END).
190 *
191 * boot_hyp_pgd should only map two pages for the init code.
192 */
193void free_hyp_pgds(void)
129{ 194{
130 pte_t *pte;
131 unsigned long addr; 195 unsigned long addr;
132 struct page *page;
133 196
134 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 197 free_boot_hyp_pgd();
135 unsigned long hyp_addr = KERN_TO_HYP(addr); 198
199 mutex_lock(&kvm_hyp_pgd_mutex);
136 200
137 pte = pte_offset_kernel(pmd, hyp_addr); 201 if (hyp_pgd) {
138 BUG_ON(!virt_addr_valid(addr)); 202 for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
139 page = virt_to_page(addr); 203 unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
140 kvm_set_pte(pte, mk_pte(page, PAGE_HYP)); 204 for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
205 unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
206 kfree(hyp_pgd);
207 hyp_pgd = NULL;
141 } 208 }
209
210 mutex_unlock(&kvm_hyp_pgd_mutex);
142} 211}
143 212
144static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start, 213static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
145 unsigned long end, 214 unsigned long end, unsigned long pfn,
146 unsigned long *pfn_base) 215 pgprot_t prot)
147{ 216{
148 pte_t *pte; 217 pte_t *pte;
149 unsigned long addr; 218 unsigned long addr;
150 219
151 for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { 220 addr = start;
152 unsigned long hyp_addr = KERN_TO_HYP(addr); 221 do {
153 222 pte = pte_offset_kernel(pmd, addr);
154 pte = pte_offset_kernel(pmd, hyp_addr); 223 kvm_set_pte(pte, pfn_pte(pfn, prot));
155 BUG_ON(pfn_valid(*pfn_base)); 224 get_page(virt_to_page(pte));
156 kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE)); 225 kvm_flush_dcache_to_poc(pte, sizeof(*pte));
157 (*pfn_base)++; 226 pfn++;
158 } 227 } while (addr += PAGE_SIZE, addr != end);
159} 228}
160 229
161static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, 230static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
162 unsigned long end, unsigned long *pfn_base) 231 unsigned long end, unsigned long pfn,
232 pgprot_t prot)
163{ 233{
164 pmd_t *pmd; 234 pmd_t *pmd;
165 pte_t *pte; 235 pte_t *pte;
166 unsigned long addr, next; 236 unsigned long addr, next;
167 237
168 for (addr = start; addr < end; addr = next) { 238 addr = start;
169 unsigned long hyp_addr = KERN_TO_HYP(addr); 239 do {
170 pmd = pmd_offset(pud, hyp_addr); 240 pmd = pmd_offset(pud, addr);
171 241
172 BUG_ON(pmd_sect(*pmd)); 242 BUG_ON(pmd_sect(*pmd));
173 243
174 if (pmd_none(*pmd)) { 244 if (pmd_none(*pmd)) {
175 pte = pte_alloc_one_kernel(NULL, hyp_addr); 245 pte = pte_alloc_one_kernel(NULL, addr);
176 if (!pte) { 246 if (!pte) {
177 kvm_err("Cannot allocate Hyp pte\n"); 247 kvm_err("Cannot allocate Hyp pte\n");
178 return -ENOMEM; 248 return -ENOMEM;
179 } 249 }
180 pmd_populate_kernel(NULL, pmd, pte); 250 pmd_populate_kernel(NULL, pmd, pte);
251 get_page(virt_to_page(pmd));
252 kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
181 } 253 }
182 254
183 next = pmd_addr_end(addr, end); 255 next = pmd_addr_end(addr, end);
184 256
185 /* 257 create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
186 * If pfn_base is NULL, we map kernel pages into HYP with the 258 pfn += (next - addr) >> PAGE_SHIFT;
187 * virtual address. Otherwise, this is considered an I/O 259 } while (addr = next, addr != end);
188 * mapping and we map the physical region starting at
189 * *pfn_base to [start, end[.
190 */
191 if (!pfn_base)
192 create_hyp_pte_mappings(pmd, addr, next);
193 else
194 create_hyp_io_pte_mappings(pmd, addr, next, pfn_base);
195 }
196 260
197 return 0; 261 return 0;
198} 262}
199 263
200static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base) 264static int __create_hyp_mappings(pgd_t *pgdp,
265 unsigned long start, unsigned long end,
266 unsigned long pfn, pgprot_t prot)
201{ 267{
202 unsigned long start = (unsigned long)from;
203 unsigned long end = (unsigned long)to;
204 pgd_t *pgd; 268 pgd_t *pgd;
205 pud_t *pud; 269 pud_t *pud;
206 pmd_t *pmd; 270 pmd_t *pmd;
207 unsigned long addr, next; 271 unsigned long addr, next;
208 int err = 0; 272 int err = 0;
209 273
210 if (start >= end)
211 return -EINVAL;
212 /* Check for a valid kernel memory mapping */
213 if (!pfn_base && (!virt_addr_valid(from) || !virt_addr_valid(to - 1)))
214 return -EINVAL;
215 /* Check for a valid kernel IO mapping */
216 if (pfn_base && (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)))
217 return -EINVAL;
218
219 mutex_lock(&kvm_hyp_pgd_mutex); 274 mutex_lock(&kvm_hyp_pgd_mutex);
220 for (addr = start; addr < end; addr = next) { 275 addr = start & PAGE_MASK;
221 unsigned long hyp_addr = KERN_TO_HYP(addr); 276 end = PAGE_ALIGN(end);
222 pgd = hyp_pgd + pgd_index(hyp_addr); 277 do {
223 pud = pud_offset(pgd, hyp_addr); 278 pgd = pgdp + pgd_index(addr);
279 pud = pud_offset(pgd, addr);
224 280
225 if (pud_none_or_clear_bad(pud)) { 281 if (pud_none_or_clear_bad(pud)) {
226 pmd = pmd_alloc_one(NULL, hyp_addr); 282 pmd = pmd_alloc_one(NULL, addr);
227 if (!pmd) { 283 if (!pmd) {
228 kvm_err("Cannot allocate Hyp pmd\n"); 284 kvm_err("Cannot allocate Hyp pmd\n");
229 err = -ENOMEM; 285 err = -ENOMEM;
230 goto out; 286 goto out;
231 } 287 }
232 pud_populate(NULL, pud, pmd); 288 pud_populate(NULL, pud, pmd);
289 get_page(virt_to_page(pud));
290 kvm_flush_dcache_to_poc(pud, sizeof(*pud));
233 } 291 }
234 292
235 next = pgd_addr_end(addr, end); 293 next = pgd_addr_end(addr, end);
236 err = create_hyp_pmd_mappings(pud, addr, next, pfn_base); 294 err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
237 if (err) 295 if (err)
238 goto out; 296 goto out;
239 } 297 pfn += (next - addr) >> PAGE_SHIFT;
298 } while (addr = next, addr != end);
240out: 299out:
241 mutex_unlock(&kvm_hyp_pgd_mutex); 300 mutex_unlock(&kvm_hyp_pgd_mutex);
242 return err; 301 return err;
@@ -250,27 +309,41 @@ out:
250 * The same virtual address as the kernel virtual address is also used 309 * The same virtual address as the kernel virtual address is also used
251 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying 310 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
252 * physical pages. 311 * physical pages.
253 *
254 * Note: Wrapping around zero in the "to" address is not supported.
255 */ 312 */
256int create_hyp_mappings(void *from, void *to) 313int create_hyp_mappings(void *from, void *to)
257{ 314{
258 return __create_hyp_mappings(from, to, NULL); 315 unsigned long phys_addr = virt_to_phys(from);
316 unsigned long start = KERN_TO_HYP((unsigned long)from);
317 unsigned long end = KERN_TO_HYP((unsigned long)to);
318
319 /* Check for a valid kernel memory mapping */
320 if (!virt_addr_valid(from) || !virt_addr_valid(to - 1))
321 return -EINVAL;
322
323 return __create_hyp_mappings(hyp_pgd, start, end,
324 __phys_to_pfn(phys_addr), PAGE_HYP);
259} 325}
260 326
261/** 327/**
262 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode 328 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
263 * @from: The kernel start VA of the range 329 * @from: The kernel start VA of the range
264 * @to: The kernel end VA of the range (exclusive) 330 * @to: The kernel end VA of the range (exclusive)
265 * @addr: The physical start address which gets mapped 331 * @phys_addr: The physical start address which gets mapped
266 * 332 *
267 * The resulting HYP VA is the same as the kernel VA, modulo 333 * The resulting HYP VA is the same as the kernel VA, modulo
268 * HYP_PAGE_OFFSET. 334 * HYP_PAGE_OFFSET.
269 */ 335 */
270int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr) 336int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
271{ 337{
272 unsigned long pfn = __phys_to_pfn(addr); 338 unsigned long start = KERN_TO_HYP((unsigned long)from);
273 return __create_hyp_mappings(from, to, &pfn); 339 unsigned long end = KERN_TO_HYP((unsigned long)to);
340
341 /* Check for a valid kernel IO mapping */
342 if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
343 return -EINVAL;
344
345 return __create_hyp_mappings(hyp_pgd, start, end,
346 __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
274} 347}
275 348
276/** 349/**
@@ -307,42 +380,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
307 return 0; 380 return 0;
308} 381}
309 382
310static void clear_pud_entry(pud_t *pud)
311{
312 pmd_t *pmd_table = pmd_offset(pud, 0);
313 pud_clear(pud);
314 pmd_free(NULL, pmd_table);
315 put_page(virt_to_page(pud));
316}
317
318static void clear_pmd_entry(pmd_t *pmd)
319{
320 pte_t *pte_table = pte_offset_kernel(pmd, 0);
321 pmd_clear(pmd);
322 pte_free_kernel(NULL, pte_table);
323 put_page(virt_to_page(pmd));
324}
325
326static bool pmd_empty(pmd_t *pmd)
327{
328 struct page *pmd_page = virt_to_page(pmd);
329 return page_count(pmd_page) == 1;
330}
331
332static void clear_pte_entry(pte_t *pte)
333{
334 if (pte_present(*pte)) {
335 kvm_set_pte(pte, __pte(0));
336 put_page(virt_to_page(pte));
337 }
338}
339
340static bool pte_empty(pte_t *pte)
341{
342 struct page *pte_page = virt_to_page(pte);
343 return page_count(pte_page) == 1;
344}
345
346/** 383/**
347 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range 384 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
348 * @kvm: The VM pointer 385 * @kvm: The VM pointer
@@ -356,43 +393,7 @@ static bool pte_empty(pte_t *pte)
356 */ 393 */
357static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) 394static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
358{ 395{
359 pgd_t *pgd; 396 unmap_range(kvm->arch.pgd, start, size);
360 pud_t *pud;
361 pmd_t *pmd;
362 pte_t *pte;
363 phys_addr_t addr = start, end = start + size;
364 u64 range;
365
366 while (addr < end) {
367 pgd = kvm->arch.pgd + pgd_index(addr);
368 pud = pud_offset(pgd, addr);
369 if (pud_none(*pud)) {
370 addr += PUD_SIZE;
371 continue;
372 }
373
374 pmd = pmd_offset(pud, addr);
375 if (pmd_none(*pmd)) {
376 addr += PMD_SIZE;
377 continue;
378 }
379
380 pte = pte_offset_kernel(pmd, addr);
381 clear_pte_entry(pte);
382 range = PAGE_SIZE;
383
384 /* If we emptied the pte, walk back up the ladder */
385 if (pte_empty(pte)) {
386 clear_pmd_entry(pmd);
387 range = PMD_SIZE;
388 if (pmd_empty(pmd)) {
389 clear_pud_entry(pud);
390 range = PUD_SIZE;
391 }
392 }
393
394 addr += range;
395 }
396} 397}
397 398
398/** 399/**
@@ -728,47 +729,105 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
728 729
729phys_addr_t kvm_mmu_get_httbr(void) 730phys_addr_t kvm_mmu_get_httbr(void)
730{ 731{
731 VM_BUG_ON(!virt_addr_valid(hyp_pgd));
732 return virt_to_phys(hyp_pgd); 732 return virt_to_phys(hyp_pgd);
733} 733}
734 734
735phys_addr_t kvm_mmu_get_boot_httbr(void)
736{
737 return virt_to_phys(boot_hyp_pgd);
738}
739
740phys_addr_t kvm_get_idmap_vector(void)
741{
742 return hyp_idmap_vector;
743}
744
735int kvm_mmu_init(void) 745int kvm_mmu_init(void)
736{ 746{
737 if (!hyp_pgd) { 747 int err;
748
749 hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start);
750 hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end);
751 hyp_idmap_vector = virt_to_phys(__kvm_hyp_init);
752
753 if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
754 /*
755 * Our init code is crossing a page boundary. Allocate
756 * a bounce page, copy the code over and use that.
757 */
758 size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
759 phys_addr_t phys_base;
760
761 init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
762 if (!init_bounce_page) {
763 kvm_err("Couldn't allocate HYP init bounce page\n");
764 err = -ENOMEM;
765 goto out;
766 }
767
768 memcpy(init_bounce_page, __hyp_idmap_text_start, len);
769 /*
770 * Warning: the code we just copied to the bounce page
771 * must be flushed to the point of coherency.
772 * Otherwise, the data may be sitting in L2, and HYP
773 * mode won't be able to observe it as it runs with
774 * caches off at that point.
775 */
776 kvm_flush_dcache_to_poc(init_bounce_page, len);
777
778 phys_base = virt_to_phys(init_bounce_page);
779 hyp_idmap_vector += phys_base - hyp_idmap_start;
780 hyp_idmap_start = phys_base;
781 hyp_idmap_end = phys_base + len;
782
783 kvm_info("Using HYP init bounce page @%lx\n",
784 (unsigned long)phys_base);
785 }
786
787 hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
788 boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
789 if (!hyp_pgd || !boot_hyp_pgd) {
738 kvm_err("Hyp mode PGD not allocated\n"); 790 kvm_err("Hyp mode PGD not allocated\n");
739 return -ENOMEM; 791 err = -ENOMEM;
792 goto out;
740 } 793 }
741 794
742 return 0; 795 /* Create the idmap in the boot page tables */
743} 796 err = __create_hyp_mappings(boot_hyp_pgd,
797 hyp_idmap_start, hyp_idmap_end,
798 __phys_to_pfn(hyp_idmap_start),
799 PAGE_HYP);
744 800
745/** 801 if (err) {
746 * kvm_clear_idmap - remove all idmaps from the hyp pgd 802 kvm_err("Failed to idmap %lx-%lx\n",
747 * 803 hyp_idmap_start, hyp_idmap_end);
748 * Free the underlying pmds for all pgds in range and clear the pgds (but 804 goto out;
749 * don't free them) afterwards. 805 }
750 */
751void kvm_clear_hyp_idmap(void)
752{
753 unsigned long addr, end;
754 unsigned long next;
755 pgd_t *pgd = hyp_pgd;
756 pud_t *pud;
757 pmd_t *pmd;
758 806
759 addr = virt_to_phys(__hyp_idmap_text_start); 807 /* Map the very same page at the trampoline VA */
760 end = virt_to_phys(__hyp_idmap_text_end); 808 err = __create_hyp_mappings(boot_hyp_pgd,
809 TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
810 __phys_to_pfn(hyp_idmap_start),
811 PAGE_HYP);
812 if (err) {
813 kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
814 TRAMPOLINE_VA);
815 goto out;
816 }
761 817
762 pgd += pgd_index(addr); 818 /* Map the same page again into the runtime page tables */
763 do { 819 err = __create_hyp_mappings(hyp_pgd,
764 next = pgd_addr_end(addr, end); 820 TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
765 if (pgd_none_or_clear_bad(pgd)) 821 __phys_to_pfn(hyp_idmap_start),
766 continue; 822 PAGE_HYP);
767 pud = pud_offset(pgd, addr); 823 if (err) {
768 pmd = pmd_offset(pud, addr); 824 kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
825 TRAMPOLINE_VA);
826 goto out;
827 }
769 828
770 pud_clear(pud); 829 return 0;
771 kvm_clean_pmd_entry(pmd); 830out:
772 pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK)); 831 free_hyp_pgds();
773 } while (pgd++, addr = next, addr < end); 832 return err;
774} 833}
diff --git a/arch/arm/kvm/perf.c b/arch/arm/kvm/perf.c
new file mode 100644
index 000000000000..1a3849da0b4b
--- /dev/null
+++ b/arch/arm/kvm/perf.c
@@ -0,0 +1,68 @@
1/*
2 * Based on the x86 implementation.
3 *
4 * Copyright (C) 2012 ARM Ltd.
5 * Author: Marc Zyngier <marc.zyngier@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include <linux/perf_event.h>
21#include <linux/kvm_host.h>
22
23#include <asm/kvm_emulate.h>
24
25static int kvm_is_in_guest(void)
26{
27 return kvm_arm_get_running_vcpu() != NULL;
28}
29
30static int kvm_is_user_mode(void)
31{
32 struct kvm_vcpu *vcpu;
33
34 vcpu = kvm_arm_get_running_vcpu();
35
36 if (vcpu)
37 return !vcpu_mode_priv(vcpu);
38
39 return 0;
40}
41
42static unsigned long kvm_get_guest_ip(void)
43{
44 struct kvm_vcpu *vcpu;
45
46 vcpu = kvm_arm_get_running_vcpu();
47
48 if (vcpu)
49 return *vcpu_pc(vcpu);
50
51 return 0;
52}
53
54static struct perf_guest_info_callbacks kvm_guest_cbs = {
55 .is_in_guest = kvm_is_in_guest,
56 .is_user_mode = kvm_is_user_mode,
57 .get_guest_ip = kvm_get_guest_ip,
58};
59
60int kvm_perf_init(void)
61{
62 return perf_register_guest_info_callbacks(&kvm_guest_cbs);
63}
64
65int kvm_perf_teardown(void)
66{
67 return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
68}
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 5ee505c937d1..83cb3ac27095 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -8,7 +8,6 @@
8#include <asm/pgtable.h> 8#include <asm/pgtable.h>
9#include <asm/sections.h> 9#include <asm/sections.h>
10#include <asm/system_info.h> 10#include <asm/system_info.h>
11#include <asm/virt.h>
12 11
13pgd_t *idmap_pgd; 12pgd_t *idmap_pgd;
14 13
@@ -83,37 +82,10 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
83 } while (pgd++, addr = next, addr != end); 82 } while (pgd++, addr = next, addr != end);
84} 83}
85 84
86#if defined(CONFIG_ARM_VIRT_EXT) && defined(CONFIG_ARM_LPAE)
87pgd_t *hyp_pgd;
88
89extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
90
91static int __init init_static_idmap_hyp(void)
92{
93 hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
94 if (!hyp_pgd)
95 return -ENOMEM;
96
97 pr_info("Setting up static HYP identity map for 0x%p - 0x%p\n",
98 __hyp_idmap_text_start, __hyp_idmap_text_end);
99 identity_mapping_add(hyp_pgd, __hyp_idmap_text_start,
100 __hyp_idmap_text_end, PMD_SECT_AP1);
101
102 return 0;
103}
104#else
105static int __init init_static_idmap_hyp(void)
106{
107 return 0;
108}
109#endif
110
111extern char __idmap_text_start[], __idmap_text_end[]; 85extern char __idmap_text_start[], __idmap_text_end[];
112 86
113static int __init init_static_idmap(void) 87static int __init init_static_idmap(void)
114{ 88{
115 int ret;
116
117 idmap_pgd = pgd_alloc(&init_mm); 89 idmap_pgd = pgd_alloc(&init_mm);
118 if (!idmap_pgd) 90 if (!idmap_pgd)
119 return -ENOMEM; 91 return -ENOMEM;
@@ -123,12 +95,10 @@ static int __init init_static_idmap(void)
123 identity_mapping_add(idmap_pgd, __idmap_text_start, 95 identity_mapping_add(idmap_pgd, __idmap_text_start,
124 __idmap_text_end, 0); 96 __idmap_text_end, 0);
125 97
126 ret = init_static_idmap_hyp();
127
128 /* Flush L1 for the hardware to see this page table content */ 98 /* Flush L1 for the hardware to see this page table content */
129 flush_cache_louis(); 99 flush_cache_louis();
130 100
131 return ret; 101 return 0;
132} 102}
133early_initcall(init_static_idmap); 103early_initcall(init_static_idmap);
134 104
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index cfa74983c675..989dd3fe8de1 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
26#define KVM_USER_MEM_SLOTS 32 26#define KVM_USER_MEM_SLOTS 32
27 27
28#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 28#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
29#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
29 30
30/* define exit reasons from vmm to kvm*/ 31/* define exit reasons from vmm to kvm*/
31#define EXIT_REASON_VM_PANIC 0 32#define EXIT_REASON_VM_PANIC 0
diff --git a/arch/ia64/include/uapi/asm/kvm.h b/arch/ia64/include/uapi/asm/kvm.h
index ec6c6b301238..99503c284400 100644
--- a/arch/ia64/include/uapi/asm/kvm.h
+++ b/arch/ia64/include/uapi/asm/kvm.h
@@ -27,7 +27,6 @@
27/* Select x86 specific features in <linux/kvm.h> */ 27/* Select x86 specific features in <linux/kvm.h> */
28#define __KVM_HAVE_IOAPIC 28#define __KVM_HAVE_IOAPIC
29#define __KVM_HAVE_IRQ_LINE 29#define __KVM_HAVE_IRQ_LINE
30#define __KVM_HAVE_DEVICE_ASSIGNMENT
31 30
32/* Architectural interrupt line count. */ 31/* Architectural interrupt line count. */
33#define KVM_NR_INTERRUPTS 256 32#define KVM_NR_INTERRUPTS 256
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 2cd225f8c68d..990b86420cc6 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -21,12 +21,11 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on BROKEN 22 depends on BROKEN
23 depends on HAVE_KVM && MODULES 23 depends on HAVE_KVM && MODULES
24 # for device assignment:
25 depends on PCI
26 depends on BROKEN 24 depends on BROKEN
27 select PREEMPT_NOTIFIERS 25 select PREEMPT_NOTIFIERS
28 select ANON_INODES 26 select ANON_INODES
29 select HAVE_KVM_IRQCHIP 27 select HAVE_KVM_IRQCHIP
28 select HAVE_KVM_IRQ_ROUTING
30 select KVM_APIC_ARCHITECTURE 29 select KVM_APIC_ARCHITECTURE
31 select KVM_MMIO 30 select KVM_MMIO
32 ---help--- 31 ---help---
@@ -50,6 +49,17 @@ config KVM_INTEL
50 Provides support for KVM on Itanium 2 processors equipped with the VT 49 Provides support for KVM on Itanium 2 processors equipped with the VT
51 extensions. 50 extensions.
52 51
52config KVM_DEVICE_ASSIGNMENT
53 bool "KVM legacy PCI device assignment support"
54 depends on KVM && PCI && IOMMU_API
55 default y
56 ---help---
57 Provide support for legacy PCI device assignment through KVM. The
58 kernel now also supports a full featured userspace device driver
59 framework through VFIO, which supersedes much of this support.
60
61 If unsure, say Y.
62
53source drivers/vhost/Kconfig 63source drivers/vhost/Kconfig
54 64
55endif # VIRTUALIZATION 65endif # VIRTUALIZATION
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index db3d7c5d1071..1a4053789d01 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -49,10 +49,10 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
49asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/ 49asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
50 50
51common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 51common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
52 coalesced_mmio.o irq_comm.o assigned-dev.o) 52 coalesced_mmio.o irq_comm.o)
53 53
54ifeq ($(CONFIG_IOMMU_API),y) 54ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
55common-objs += $(addprefix ../../../virt/kvm/, iommu.o) 55common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o)
56endif 56endif
57 57
58kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o 58kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ad3126a58644..5b2dc0d10c8f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -204,9 +204,11 @@ int kvm_dev_ioctl_check_extension(long ext)
204 case KVM_CAP_COALESCED_MMIO: 204 case KVM_CAP_COALESCED_MMIO:
205 r = KVM_COALESCED_MMIO_PAGE_OFFSET; 205 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
206 break; 206 break;
207#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
207 case KVM_CAP_IOMMU: 208 case KVM_CAP_IOMMU:
208 r = iommu_present(&pci_bus_type); 209 r = iommu_present(&pci_bus_type);
209 break; 210 break;
211#endif
210 default: 212 default:
211 r = 0; 213 r = 0;
212 } 214 }
@@ -924,13 +926,15 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
924 return 0; 926 return 0;
925} 927}
926 928
927int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) 929int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
930 bool line_status)
928{ 931{
929 if (!irqchip_in_kernel(kvm)) 932 if (!irqchip_in_kernel(kvm))
930 return -ENXIO; 933 return -ENXIO;
931 934
932 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 935 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
933 irq_event->irq, irq_event->level); 936 irq_event->irq, irq_event->level,
937 line_status);
934 return 0; 938 return 0;
935} 939}
936 940
@@ -942,24 +946,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
942 int r = -ENOTTY; 946 int r = -ENOTTY;
943 947
944 switch (ioctl) { 948 switch (ioctl) {
945 case KVM_SET_MEMORY_REGION: {
946 struct kvm_memory_region kvm_mem;
947 struct kvm_userspace_memory_region kvm_userspace_mem;
948
949 r = -EFAULT;
950 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
951 goto out;
952 kvm_userspace_mem.slot = kvm_mem.slot;
953 kvm_userspace_mem.flags = kvm_mem.flags;
954 kvm_userspace_mem.guest_phys_addr =
955 kvm_mem.guest_phys_addr;
956 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
957 r = kvm_vm_ioctl_set_memory_region(kvm,
958 &kvm_userspace_mem, false);
959 if (r)
960 goto out;
961 break;
962 }
963 case KVM_CREATE_IRQCHIP: 949 case KVM_CREATE_IRQCHIP:
964 r = -EFAULT; 950 r = -EFAULT;
965 r = kvm_ioapic_init(kvm); 951 r = kvm_ioapic_init(kvm);
@@ -1384,9 +1370,7 @@ void kvm_arch_sync_events(struct kvm *kvm)
1384void kvm_arch_destroy_vm(struct kvm *kvm) 1370void kvm_arch_destroy_vm(struct kvm *kvm)
1385{ 1371{
1386 kvm_iommu_unmap_guest(kvm); 1372 kvm_iommu_unmap_guest(kvm);
1387#ifdef KVM_CAP_DEVICE_ASSIGNMENT
1388 kvm_free_all_assigned_devices(kvm); 1373 kvm_free_all_assigned_devices(kvm);
1389#endif
1390 kfree(kvm->arch.vioapic); 1374 kfree(kvm->arch.vioapic);
1391 kvm_release_vm_pages(kvm); 1375 kvm_release_vm_pages(kvm);
1392} 1376}
@@ -1578,9 +1562,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
1578 1562
1579int kvm_arch_prepare_memory_region(struct kvm *kvm, 1563int kvm_arch_prepare_memory_region(struct kvm *kvm,
1580 struct kvm_memory_slot *memslot, 1564 struct kvm_memory_slot *memslot,
1581 struct kvm_memory_slot old,
1582 struct kvm_userspace_memory_region *mem, 1565 struct kvm_userspace_memory_region *mem,
1583 bool user_alloc) 1566 enum kvm_mr_change change)
1584{ 1567{
1585 unsigned long i; 1568 unsigned long i;
1586 unsigned long pfn; 1569 unsigned long pfn;
@@ -1610,8 +1593,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1610 1593
1611void kvm_arch_commit_memory_region(struct kvm *kvm, 1594void kvm_arch_commit_memory_region(struct kvm *kvm,
1612 struct kvm_userspace_memory_region *mem, 1595 struct kvm_userspace_memory_region *mem,
1613 struct kvm_memory_slot old, 1596 const struct kvm_memory_slot *old,
1614 bool user_alloc) 1597 enum kvm_mr_change change)
1615{ 1598{
1616 return; 1599 return;
1617} 1600}
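For context, the extra line_status argument only changes the in-kernel plumbing; the userspace side of this path is still the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS ioctl. A minimal userspace sketch, assuming vm_fd is an open VM descriptor on which KVM_CREATE_IRQCHIP has already been issued:

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        /* Hedged sketch: assert or deassert a GSI on the in-kernel ioapic */
        static int set_gsi(int vm_fd, unsigned int gsi, int level)
        {
                struct kvm_irq_level irq = {
                        .irq   = gsi,
                        .level = level,
                };

                /* KVM_IRQ_LINE_STATUS additionally returns the kvm_set_irq()
                 * result in the status field that shares a union with .irq */
                return ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq);
        }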
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c3e2935b6db4..c5f92a926a9a 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,10 +27,4 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
27#define kvm_apic_present(x) (true) 27#define kvm_apic_present(x) (true)
28#define kvm_lapic_enabled(x) (true) 28#define kvm_lapic_enabled(x) (true)
29 29
30static inline bool kvm_apic_vid_enabled(void)
31{
32 /* IA64 has no apicv supporting, do nothing here */
33 return false;
34}
35
36#endif 30#endif
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 4bc2c3dad6ad..cf4df8e2139a 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -270,6 +270,9 @@
270#define H_SET_MODE 0x31C 270#define H_SET_MODE 0x31C
271#define MAX_HCALL_OPCODE H_SET_MODE 271#define MAX_HCALL_OPCODE H_SET_MODE
272 272
273/* Platform specific hcalls, used by KVM */
274#define H_RTAS 0xf000
275
273#ifndef __ASSEMBLY__ 276#ifndef __ASSEMBLY__
274 277
275/** 278/**
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 5a56e1c5f851..349ed85c7d61 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void);
142extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 142extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
143extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 143extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
144extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); 144extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
145extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
146 unsigned int vec);
145extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags); 147extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
146extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, 148extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
147 bool upper, u32 val); 149 bool upper, u32 val);
@@ -156,7 +158,8 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
156 unsigned long pte_index); 158 unsigned long pte_index);
157extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, 159extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
158 unsigned long *nb_ret); 160 unsigned long *nb_ret);
159extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); 161extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
162 unsigned long gpa, bool dirty);
160extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 163extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
161 long pte_index, unsigned long pteh, unsigned long ptel); 164 long pte_index, unsigned long pteh, unsigned long ptel);
162extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 165extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
@@ -458,6 +461,8 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
458#define OSI_SC_MAGIC_R4 0x77810F9B 461#define OSI_SC_MAGIC_R4 0x77810F9B
459 462
460#define INS_DCBZ 0x7c0007ec 463#define INS_DCBZ 0x7c0007ec
464/* TO = 31 for unconditional trap */
465#define INS_TW 0x7fe00008
461 466
462/* LPIDs we support with this build -- runtime limit may be lower */ 467/* LPIDs we support with this build -- runtime limit may be lower */
463#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) 468#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
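The new INS_TW constant is the opcode later advertised as the software breakpoint instruction (see the KVM_REG_PPC_DEBUG_INST handling further down). As a sanity check, 0x7fe00008 decomposes as tw 31,0,0 under the usual Power ISA X-form layout; the field macros below are illustrative only, not kernel API:

        /* Hedged sketch: rebuild INS_TW from its instruction fields */
        #define X_OPCD(x)       ((x) << 26)     /* primary opcode        */
        #define X_TO(x)         ((x) << 21)     /* trap condition bits   */
        #define X_RA(x)         ((x) << 16)
        #define X_RB(x)         ((x) << 11)
        #define X_XO(x)         ((x) << 1)      /* extended opcode       */

        /* tw 31,0,0: TO = 31 traps unconditionally */
        #define INS_TW_REBUILT  (X_OPCD(31) | X_TO(31) | X_RA(0) | X_RB(0) | X_XO(4))
        /* INS_TW_REBUILT == 0x7fe00008 == INS_TW */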
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 38bec1dc9928..9c1ff330c805 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -268,4 +268,17 @@ static inline int is_vrma_hpte(unsigned long hpte_v)
268 (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); 268 (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
269} 269}
270 270
271#ifdef CONFIG_KVM_BOOK3S_64_HV
272/*
273 * Note modification of an HPTE; set the HPTE modified bit
274 * if anyone is interested.
275 */
276static inline void note_hpte_modification(struct kvm *kvm,
277 struct revmap_entry *rev)
278{
279 if (atomic_read(&kvm->arch.hpte_mod_interest))
280 rev->guest_rpte |= HPTE_GR_MODIFIED;
281}
282#endif /* CONFIG_KVM_BOOK3S_64_HV */
283
271#endif /* __ASM_KVM_BOOK3S_64_H__ */ 284#endif /* __ASM_KVM_BOOK3S_64_H__ */
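note_hpte_modification() is intended to be called whenever the guest view of an HPTE (rev->guest_rpte) is updated, so the HTAB migration reader can pick the entry up. The callers in book3s_64_mmu_hv.c below open-code the pattern; a hedged sketch of the convention, with a helper name that is illustrative only:

        /* Illustrative only: update the guest view and flag the entry once */
        static inline void update_guest_rpte(struct kvm *kvm,
                                             struct revmap_entry *rev,
                                             unsigned long new_rpte)
        {
                if (new_rpte != rev->guest_rpte) {
                        rev->guest_rpte = new_rpte;
                        note_hpte_modification(kvm, rev);
                }
        }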
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index cdc3d2717cc6..9039d3c97eec 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -20,6 +20,11 @@
20#ifndef __ASM_KVM_BOOK3S_ASM_H__ 20#ifndef __ASM_KVM_BOOK3S_ASM_H__
21#define __ASM_KVM_BOOK3S_ASM_H__ 21#define __ASM_KVM_BOOK3S_ASM_H__
22 22
23/* XICS ICP register offsets */
24#define XICS_XIRR 4
25#define XICS_MFRR 0xc
26#define XICS_IPI 2 /* interrupt source # for IPIs */
27
23#ifdef __ASSEMBLY__ 28#ifdef __ASSEMBLY__
24 29
25#ifdef CONFIG_KVM_BOOK3S_HANDLER 30#ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -81,10 +86,11 @@ struct kvmppc_host_state {
81#ifdef CONFIG_KVM_BOOK3S_64_HV 86#ifdef CONFIG_KVM_BOOK3S_64_HV
82 u8 hwthread_req; 87 u8 hwthread_req;
83 u8 hwthread_state; 88 u8 hwthread_state;
84 89 u8 host_ipi;
85 struct kvm_vcpu *kvm_vcpu; 90 struct kvm_vcpu *kvm_vcpu;
86 struct kvmppc_vcore *kvm_vcore; 91 struct kvmppc_vcore *kvm_vcore;
87 unsigned long xics_phys; 92 unsigned long xics_phys;
93 u32 saved_xirr;
88 u64 dabr; 94 u64 dabr;
89 u64 host_mmcr[3]; 95 u64 host_mmcr[3];
90 u32 host_pmc[8]; 96 u32 host_pmc[8];
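The XICS_* values describe the ICP register layout relative to the base recorded in kvm_hstate.xics_phys. The low-level handlers access those registers through the physical address with cache-inhibited loads and stores; the C sketch below is illustrative only and assumes an ioremap()ed mapping (icp and prio are hypothetical parameters):

        /* Hedged sketch: how the offsets above map onto ICP accesses */
        static inline u32 icp_fetch_xirr(void __iomem *icp)
        {
                /* a 4-byte load from XIRR accepts the highest pending interrupt */
                return in_be32(icp + XICS_XIRR);
        }

        static inline void icp_raise_ipi(void __iomem *icp, u8 prio)
        {
                /* storing a priority to MFRR raises source XICS_IPI on that ICP */
                out_8(icp + XICS_MFRR, prio);
        }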
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index b7cd3356a532..d3c1eb34c986 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -26,6 +26,8 @@
26/* LPIDs we support with this build -- runtime limit may be lower */ 26/* LPIDs we support with this build -- runtime limit may be lower */
27#define KVMPPC_NR_LPIDS 64 27#define KVMPPC_NR_LPIDS 64
28 28
29#define KVMPPC_INST_EHPRIV 0x7c00021c
30
29static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) 31static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
30{ 32{
31 vcpu->arch.gpr[num] = val; 33 vcpu->arch.gpr[num] = val;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d1bb86074721..af326cde7cb6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,10 @@
44#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 44#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
45#endif 45#endif
46 46
47/* These values are internal and can be increased later */
48#define KVM_NR_IRQCHIPS 1
49#define KVM_IRQCHIP_NUM_PINS 256
50
47#if !defined(CONFIG_KVM_440) 51#if !defined(CONFIG_KVM_440)
48#include <linux/mmu_notifier.h> 52#include <linux/mmu_notifier.h>
49 53
@@ -188,6 +192,10 @@ struct kvmppc_linear_info {
188 int type; 192 int type;
189}; 193};
190 194
195/* XICS components, defined in book3s_xics.c */
196struct kvmppc_xics;
197struct kvmppc_icp;
198
191/* 199/*
192 * The reverse mapping array has one entry for each HPTE, 200 * The reverse mapping array has one entry for each HPTE,
193 * which stores the guest's view of the second word of the HPTE 201 * which stores the guest's view of the second word of the HPTE
@@ -255,6 +263,13 @@ struct kvm_arch {
255#endif /* CONFIG_KVM_BOOK3S_64_HV */ 263#endif /* CONFIG_KVM_BOOK3S_64_HV */
256#ifdef CONFIG_PPC_BOOK3S_64 264#ifdef CONFIG_PPC_BOOK3S_64
257 struct list_head spapr_tce_tables; 265 struct list_head spapr_tce_tables;
266 struct list_head rtas_tokens;
267#endif
268#ifdef CONFIG_KVM_MPIC
269 struct openpic *mpic;
270#endif
271#ifdef CONFIG_KVM_XICS
272 struct kvmppc_xics *xics;
258#endif 273#endif
259}; 274};
260 275
@@ -301,11 +316,13 @@ struct kvmppc_vcore {
301 * that a guest can register. 316 * that a guest can register.
302 */ 317 */
303struct kvmppc_vpa { 318struct kvmppc_vpa {
319 unsigned long gpa; /* Current guest phys addr */
304 void *pinned_addr; /* Address in kernel linear mapping */ 320 void *pinned_addr; /* Address in kernel linear mapping */
305 void *pinned_end; /* End of region */ 321 void *pinned_end; /* End of region */
306 unsigned long next_gpa; /* Guest phys addr for update */ 322 unsigned long next_gpa; /* Guest phys addr for update */
307 unsigned long len; /* Number of bytes required */ 323 unsigned long len; /* Number of bytes required */
308 u8 update_pending; /* 1 => update pinned_addr from next_gpa */ 324 u8 update_pending; /* 1 => update pinned_addr from next_gpa */
325 bool dirty; /* true => area has been modified by kernel */
309}; 326};
310 327
311struct kvmppc_pte { 328struct kvmppc_pte {
@@ -359,6 +376,11 @@ struct kvmppc_slb {
359#define KVMPPC_BOOKE_MAX_IAC 4 376#define KVMPPC_BOOKE_MAX_IAC 4
360#define KVMPPC_BOOKE_MAX_DAC 2 377#define KVMPPC_BOOKE_MAX_DAC 2
361 378
379/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
380#define KVMPPC_EPR_NONE 0 /* EPR not supported */
381#define KVMPPC_EPR_USER 1 /* exit to userspace to fill EPR */
382#define KVMPPC_EPR_KERNEL 2 /* in-kernel irqchip */
383
362struct kvmppc_booke_debug_reg { 384struct kvmppc_booke_debug_reg {
363 u32 dbcr0; 385 u32 dbcr0;
364 u32 dbcr1; 386 u32 dbcr1;
@@ -370,6 +392,12 @@ struct kvmppc_booke_debug_reg {
370 u64 dac[KVMPPC_BOOKE_MAX_DAC]; 392 u64 dac[KVMPPC_BOOKE_MAX_DAC];
371}; 393};
372 394
395#define KVMPPC_IRQ_DEFAULT 0
396#define KVMPPC_IRQ_MPIC 1
397#define KVMPPC_IRQ_XICS 2
398
399struct openpic;
400
373struct kvm_vcpu_arch { 401struct kvm_vcpu_arch {
374 ulong host_stack; 402 ulong host_stack;
375 u32 host_pid; 403 u32 host_pid;
@@ -502,8 +530,11 @@ struct kvm_vcpu_arch {
502 spinlock_t wdt_lock; 530 spinlock_t wdt_lock;
503 struct timer_list wdt_timer; 531 struct timer_list wdt_timer;
504 u32 tlbcfg[4]; 532 u32 tlbcfg[4];
533 u32 tlbps[4];
505 u32 mmucfg; 534 u32 mmucfg;
535 u32 eptcfg;
506 u32 epr; 536 u32 epr;
537 u32 crit_save;
507 struct kvmppc_booke_debug_reg dbg_reg; 538 struct kvmppc_booke_debug_reg dbg_reg;
508#endif 539#endif
509 gpa_t paddr_accessed; 540 gpa_t paddr_accessed;
@@ -521,7 +552,7 @@ struct kvm_vcpu_arch {
521 u8 sane; 552 u8 sane;
522 u8 cpu_type; 553 u8 cpu_type;
523 u8 hcall_needed; 554 u8 hcall_needed;
524 u8 epr_enabled; 555 u8 epr_flags; /* KVMPPC_EPR_xxx */
525 u8 epr_needed; 556 u8 epr_needed;
526 557
527 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 558 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -548,6 +579,13 @@ struct kvm_vcpu_arch {
548 unsigned long magic_page_pa; /* phys addr to map the magic page to */ 579 unsigned long magic_page_pa; /* phys addr to map the magic page to */
549 unsigned long magic_page_ea; /* effect. addr to map the magic page to */ 580 unsigned long magic_page_ea; /* effect. addr to map the magic page to */
550 581
582 int irq_type; /* one of KVM_IRQ_* */
583 int irq_cpu_id;
584 struct openpic *mpic; /* KVM_IRQ_MPIC */
585#ifdef CONFIG_KVM_XICS
586 struct kvmppc_icp *icp; /* XICS presentation controller */
587#endif
588
551#ifdef CONFIG_KVM_BOOK3S_64_HV 589#ifdef CONFIG_KVM_BOOK3S_64_HV
552 struct kvm_vcpu_arch_shared shregs; 590 struct kvm_vcpu_arch_shared shregs;
553 591
@@ -588,5 +626,6 @@ struct kvm_vcpu_arch {
588#define KVM_MMIO_REG_FQPR 0x0060 626#define KVM_MMIO_REG_FQPR 0x0060
589 627
590#define __KVM_HAVE_ARCH_WQP 628#define __KVM_HAVE_ARCH_WQP
629#define __KVM_HAVE_CREATE_DEVICE
591 630
592#endif /* __POWERPC_KVM_HOST_H__ */ 631#endif /* __POWERPC_KVM_HOST_H__ */
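The new irq_type field records which in-kernel interrupt controller, if any, a vcpu has been connected to, and controller-specific paths are gated on it. The XICS form of the check appears verbatim in kvm_ppc.h below; an MPIC-flavoured equivalent would look the same (sketch only, not an in-tree helper):

        /* Illustrative only; mirrors kvmppc_xics_enabled() for the MPIC case */
        static inline bool kvmppc_mpic_in_use(struct kvm_vcpu *vcpu)
        {
                return vcpu->arch.irq_type == KVMPPC_IRQ_MPIC;
        }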
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 44a657adf416..a5287fe03d77 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,7 +44,7 @@ enum emulation_result {
44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */ 44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */
45 EMULATE_FAIL, /* can't emulate this instruction */ 45 EMULATE_FAIL, /* can't emulate this instruction */
46 EMULATE_AGAIN, /* something went wrong. go again */ 46 EMULATE_AGAIN, /* something went wrong. go again */
47 EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */ 47 EMULATE_EXIT_USER, /* emulation requires exit to user-space */
48}; 48};
49 49
50extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 50extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -104,8 +104,7 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
104extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); 104extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
105extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, 105extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
106 struct kvm_interrupt *irq); 106 struct kvm_interrupt *irq);
107extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 107extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
108 struct kvm_interrupt *irq);
109extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); 108extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
110 109
111extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 110extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -131,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
131extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, 130extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
132 struct kvm_memory_slot *memslot, unsigned long porder); 131 struct kvm_memory_slot *memslot, unsigned long porder);
133extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); 132extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
133
134extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, 134extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
135 struct kvm_create_spapr_tce *args); 135 struct kvm_create_spapr_tce *args);
136extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, 136extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -152,7 +152,7 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
152 struct kvm_userspace_memory_region *mem); 152 struct kvm_userspace_memory_region *mem);
153extern void kvmppc_core_commit_memory_region(struct kvm *kvm, 153extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
154 struct kvm_userspace_memory_region *mem, 154 struct kvm_userspace_memory_region *mem,
155 struct kvm_memory_slot old); 155 const struct kvm_memory_slot *old);
156extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 156extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
157 struct kvm_ppc_smmu_info *info); 157 struct kvm_ppc_smmu_info *info);
158extern void kvmppc_core_flush_memslot(struct kvm *kvm, 158extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -165,6 +165,18 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
165 165
166extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); 166extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
167 167
168int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
169
170extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
171extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
172extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
173extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
174 u32 priority);
175extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
176 u32 *priority);
177extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
178extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
179
168/* 180/*
169 * Cuts out inst bits with ordering according to spec. 181 * Cuts out inst bits with ordering according to spec.
170 * That means the leftmost bit is zero. All given bits are included. 182 * That means the leftmost bit is zero. All given bits are included.
@@ -246,12 +258,29 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
246 258
247void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 259void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
248 260
261struct openpic;
262
249#ifdef CONFIG_KVM_BOOK3S_64_HV 263#ifdef CONFIG_KVM_BOOK3S_64_HV
250static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) 264static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
251{ 265{
252 paca[cpu].kvm_hstate.xics_phys = addr; 266 paca[cpu].kvm_hstate.xics_phys = addr;
253} 267}
254 268
269static inline u32 kvmppc_get_xics_latch(void)
270{
271 u32 xirr = get_paca()->kvm_hstate.saved_xirr;
272
273 get_paca()->kvm_hstate.saved_xirr = 0;
274
275 return xirr;
276}
277
278static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
279{
280 paca[cpu].kvm_hstate.host_ipi = host_ipi;
281}
282
283extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
255extern void kvm_linear_init(void); 284extern void kvm_linear_init(void);
256 285
257#else 286#else
@@ -260,6 +289,46 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
260 289
261static inline void kvm_linear_init(void) 290static inline void kvm_linear_init(void)
262{} 291{}
292
293static inline u32 kvmppc_get_xics_latch(void)
294{
295 return 0;
296}
297
298static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
299{}
300
301static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
302{
303 kvm_vcpu_kick(vcpu);
304}
305#endif
306
307#ifdef CONFIG_KVM_XICS
308static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
309{
310 return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
311}
312extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
313extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
314extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
315extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
316extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
317extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
318extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
319 struct kvm_vcpu *vcpu, u32 cpu);
320#else
321static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
322 { return 0; }
323static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
324static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
325 unsigned long server)
326 { return -EINVAL; }
327static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
328 struct kvm_irq_level *args)
329 { return -ENOTTY; }
330static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
331 { return 0; }
263#endif 332#endif
264 333
265static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr) 334static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
@@ -271,6 +340,32 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
271#endif 340#endif
272} 341}
273 342
343#ifdef CONFIG_KVM_MPIC
344
345void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
346int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
347 u32 cpu);
348void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
349
350#else
351
352static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
353{
354}
355
356static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
357 struct kvm_vcpu *vcpu, u32 cpu)
358{
359 return -EINVAL;
360}
361
362static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
363 struct kvm_vcpu *vcpu)
364{
365}
366
367#endif /* CONFIG_KVM_MPIC */
368
274int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, 369int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
275 struct kvm_config_tlb *cfg); 370 struct kvm_config_tlb *cfg);
276int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, 371int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
@@ -283,8 +378,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids);
283 378
284static inline void kvmppc_mmu_flush_icache(pfn_t pfn) 379static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
285{ 380{
286 /* Clear i-cache for new pages */
287 struct page *page; 381 struct page *page;
382 /*
383 * We can only access pages that the kernel maps
384 * as memory. Bail out for unmapped ones.
385 */
386 if (!pfn_valid(pfn))
387 return;
388
389 /* Clear i-cache for new pages */
288 page = pfn_to_page(pfn); 390 page = pfn_to_page(pfn);
289 if (!test_bit(PG_arch_1, &page->flags)) { 391 if (!test_bit(PG_arch_1, &page->flags)) {
290 flush_dcache_icache_page(page); 392 flush_dcache_icache_page(page);
@@ -324,4 +426,6 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
324 return ea; 426 return ea;
325} 427}
326 428
429extern void xics_wake_cpu(int cpu);
430
327#endif /* __POWERPC_KVM_PPC_H__ */ 431#endif /* __POWERPC_KVM_PPC_H__ */
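kvmppc_set_host_ipi() lets the host mark that an IPI aimed at a given CPU is meant for the host itself, so the guest-exit interrupt path (which now latches XIRR into saved_xirr) does not swallow it. The expected discipline is to set the flag before raising the IPI and clear it once the host has handled it; a hedged sketch of the sending side (function name hypothetical):

        /* Illustrative sender: mark the IPI as host-owned before poking the CPU */
        static void send_host_ipi(int cpu)
        {
                kvmppc_set_host_ipi(cpu, 1);    /* seen by the exit path */
                smp_mb();                       /* flag visible before the IPI */
                xics_wake_cpu(cpu);             /* or the platform IPI primitive */
        }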
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 3d17427e4fd7..a6136515c7f2 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -300,6 +300,7 @@
300#define LPCR_PECE1 0x00002000 /* decrementer can cause exit */ 300#define LPCR_PECE1 0x00002000 /* decrementer can cause exit */
301#define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */ 301#define LPCR_PECE2 0x00001000 /* machine check etc can cause exit */
302#define LPCR_MER 0x00000800 /* Mediated External Exception */ 302#define LPCR_MER 0x00000800 /* Mediated External Exception */
303#define LPCR_MER_SH 11
303#define LPCR_LPES 0x0000000c 304#define LPCR_LPES 0x0000000c
304#define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */ 305#define LPCR_LPES0 0x00000008 /* LPAR Env selector 0 */
305#define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */ 306#define LPCR_LPES1 0x00000004 /* LPAR Env selector 1 */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 16064d00adb9..0fb1a6e9ff90 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -25,6 +25,8 @@
25/* Select powerpc specific features in <linux/kvm.h> */ 25/* Select powerpc specific features in <linux/kvm.h> */
26#define __KVM_HAVE_SPAPR_TCE 26#define __KVM_HAVE_SPAPR_TCE
27#define __KVM_HAVE_PPC_SMT 27#define __KVM_HAVE_PPC_SMT
28#define __KVM_HAVE_IRQCHIP
29#define __KVM_HAVE_IRQ_LINE
28 30
29struct kvm_regs { 31struct kvm_regs {
30 __u64 pc; 32 __u64 pc;
@@ -272,8 +274,31 @@ struct kvm_debug_exit_arch {
272 274
273/* for KVM_SET_GUEST_DEBUG */ 275/* for KVM_SET_GUEST_DEBUG */
274struct kvm_guest_debug_arch { 276struct kvm_guest_debug_arch {
277 struct {
278 /* H/W breakpoint/watchpoint address */
279 __u64 addr;
280 /*
281 * Type denotes h/w breakpoint, read watchpoint, write
282 * watchpoint or watchpoint (both read and write).
283 */
284#define KVMPPC_DEBUG_NONE 0x0
285#define KVMPPC_DEBUG_BREAKPOINT (1UL << 1)
286#define KVMPPC_DEBUG_WATCH_WRITE (1UL << 2)
287#define KVMPPC_DEBUG_WATCH_READ (1UL << 3)
288 __u32 type;
289 __u32 reserved;
290 } bp[16];
275}; 291};
276 292
293/* Debug related defines */
294/*
295 * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
296 * and upper 16 bits are architecture specific. Architecture specific defines
297 * that ioctl is for setting hardware breakpoint or software breakpoint.
298 */
299#define KVM_GUESTDBG_USE_SW_BP 0x00010000
300#define KVM_GUESTDBG_USE_HW_BP 0x00020000
301
277/* definition of registers in kvm_run */ 302/* definition of registers in kvm_run */
278struct kvm_sync_regs { 303struct kvm_sync_regs {
279}; 304};
@@ -299,6 +324,12 @@ struct kvm_allocate_rma {
299 __u64 rma_size; 324 __u64 rma_size;
300}; 325};
301 326
327/* for KVM_CAP_PPC_RTAS */
328struct kvm_rtas_token_args {
329 char name[120];
330 __u64 token; /* Use a token of 0 to undefine a mapping */
331};
332
302struct kvm_book3e_206_tlb_entry { 333struct kvm_book3e_206_tlb_entry {
303 __u32 mas8; 334 __u32 mas8;
304 __u32 mas1; 335 __u32 mas1;
@@ -359,6 +390,26 @@ struct kvm_get_htab_header {
359 __u16 n_invalid; 390 __u16 n_invalid;
360}; 391};
361 392
393/* Per-vcpu XICS interrupt controller state */
394#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
395
396#define KVM_REG_PPC_ICP_CPPR_SHIFT 56 /* current proc priority */
397#define KVM_REG_PPC_ICP_CPPR_MASK 0xff
398#define KVM_REG_PPC_ICP_XISR_SHIFT 32 /* interrupt status field */
399#define KVM_REG_PPC_ICP_XISR_MASK 0xffffff
400#define KVM_REG_PPC_ICP_MFRR_SHIFT 24 /* pending IPI priority */
401#define KVM_REG_PPC_ICP_MFRR_MASK 0xff
402#define KVM_REG_PPC_ICP_PPRI_SHIFT 16 /* pending irq priority */
403#define KVM_REG_PPC_ICP_PPRI_MASK 0xff
404
405/* Device control API: PPC-specific devices */
406#define KVM_DEV_MPIC_GRP_MISC 1
407#define KVM_DEV_MPIC_BASE_ADDR 0 /* 64-bit */
408
409#define KVM_DEV_MPIC_GRP_REGISTER 2 /* 32-bit */
410#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE 3 /* 32-bit */
411
412/* One-Reg API: PPC-specific registers */
362#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) 413#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
363#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) 414#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
364#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) 415#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
@@ -417,4 +468,47 @@ struct kvm_get_htab_header {
417#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) 468#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
418#define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) 469#define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
419 470
471/* Timer Status Register OR/CLEAR interface */
472#define KVM_REG_PPC_OR_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87)
473#define KVM_REG_PPC_CLEAR_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
474#define KVM_REG_PPC_TCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
475#define KVM_REG_PPC_TSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
476
477/* Debugging: Special instruction for software breakpoint */
478#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
479
480/* MMU registers */
481#define KVM_REG_PPC_MAS0 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
482#define KVM_REG_PPC_MAS1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
483#define KVM_REG_PPC_MAS2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
484#define KVM_REG_PPC_MAS7_3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
485#define KVM_REG_PPC_MAS4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
486#define KVM_REG_PPC_MAS6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
487#define KVM_REG_PPC_MMUCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
488/*
489 * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
490 * KVM_CAP_SW_TLB ioctl
491 */
492#define KVM_REG_PPC_TLB0CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
493#define KVM_REG_PPC_TLB1CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
494#define KVM_REG_PPC_TLB2CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
495#define KVM_REG_PPC_TLB3CFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
496#define KVM_REG_PPC_TLB0PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
497#define KVM_REG_PPC_TLB1PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
498#define KVM_REG_PPC_TLB2PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
499#define KVM_REG_PPC_TLB3PS (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
500#define KVM_REG_PPC_EPTCFG (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
501
502/* PPC64 eXternal Interrupt Controller Specification */
503#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */
504
505/* Layout of 64-bit source attribute values */
506#define KVM_XICS_DESTINATION_SHIFT 0
507#define KVM_XICS_DESTINATION_MASK 0xffffffffULL
508#define KVM_XICS_PRIORITY_SHIFT 32
509#define KVM_XICS_PRIORITY_MASK 0xff
510#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40)
511#define KVM_XICS_MASKED (1ULL << 41)
512#define KVM_XICS_PENDING (1ULL << 42)
513
420#endif /* __LINUX_KVM_POWERPC_H */ 514#endif /* __LINUX_KVM_POWERPC_H */
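KVM_REG_PPC_ICP_STATE packs the whole presentation-controller state into one 64-bit value using the shifts and masks defined above. A minimal sketch of the packing, e.g. for a save/restore tool; the helper name and argument types are illustrative:

        /* Hedged sketch: build a KVM_REG_PPC_ICP_STATE value from its fields */
        static inline __u64 icp_state_pack(__u8 cppr, __u32 xisr,
                                           __u8 mfrr, __u8 ppri)
        {
                return ((__u64)(cppr & KVM_REG_PPC_ICP_CPPR_MASK)
                                        << KVM_REG_PPC_ICP_CPPR_SHIFT) |
                       ((__u64)(xisr & KVM_REG_PPC_ICP_XISR_MASK)
                                        << KVM_REG_PPC_ICP_XISR_SHIFT) |
                       ((__u64)(mfrr & KVM_REG_PPC_ICP_MFRR_MASK)
                                        << KVM_REG_PPC_ICP_MFRR_SHIFT) |
                       ((__u64)(ppri & KVM_REG_PPC_ICP_PPRI_MASK)
                                        << KVM_REG_PPC_ICP_PPRI_SHIFT);
        }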
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 172233eab799..b51a97cfedf8 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -480,6 +480,7 @@ int main(void)
480 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); 480 DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
481 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); 481 DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
482 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); 482 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
483 DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
483#endif 484#endif
484#ifdef CONFIG_PPC_BOOK3S 485#ifdef CONFIG_PPC_BOOK3S
485 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); 486 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -576,6 +577,8 @@ int main(void)
576 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); 577 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
577 HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); 578 HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
578 HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); 579 HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
580 HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
581 HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
579 HSTATE_FIELD(HSTATE_MMCR, host_mmcr); 582 HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
580 HSTATE_FIELD(HSTATE_PMC, host_pmc); 583 HSTATE_FIELD(HSTATE_PMC, host_pmc);
581 HSTATE_FIELD(HSTATE_PURR, host_purr); 584 HSTATE_FIELD(HSTATE_PURR, host_purr);
@@ -599,6 +602,7 @@ int main(void)
599 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); 602 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
600 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); 603 DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
601 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); 604 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
605 DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
602#endif /* CONFIG_PPC_BOOK3S */ 606#endif /* CONFIG_PPC_BOOK3S */
603#endif /* CONFIG_KVM */ 607#endif /* CONFIG_KVM */
604 608
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 3d7fd21c65f9..2f5c6b6d6877 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -124,6 +124,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
124 return kvmppc_set_sregs_ivor(vcpu, sregs); 124 return kvmppc_set_sregs_ivor(vcpu, sregs);
125} 125}
126 126
127int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
128 union kvmppc_one_reg *val)
129{
130 return -EINVAL;
131}
132
133int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
134 union kvmppc_one_reg *val)
135{
136 return -EINVAL;
137}
138
127struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 139struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
128{ 140{
129 struct kvmppc_vcpu_44x *vcpu_44x; 141 struct kvmppc_vcpu_44x *vcpu_44x;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec72e43..eb643f862579 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -136,21 +136,41 @@ config KVM_E500V2
136 If unsure, say N. 136 If unsure, say N.
137 137
138config KVM_E500MC 138config KVM_E500MC
139 bool "KVM support for PowerPC E500MC/E5500 processors" 139 bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
140 depends on PPC_E500MC 140 depends on PPC_E500MC
141 select KVM 141 select KVM
142 select KVM_MMIO 142 select KVM_MMIO
143 select KVM_BOOKE_HV 143 select KVM_BOOKE_HV
144 select MMU_NOTIFIER 144 select MMU_NOTIFIER
145 ---help--- 145 ---help---
146 Support running unmodified E500MC/E5500 (32-bit) guest kernels in 146 Support running unmodified E500MC/E5500/E6500 guest kernels in
147 virtual machines on E500MC/E5500 host processors. 147 virtual machines on E500MC/E5500/E6500 host processors.
148 148
149 This module provides access to the hardware capabilities through 149 This module provides access to the hardware capabilities through
150 a character device node named /dev/kvm. 150 a character device node named /dev/kvm.
151 151
152 If unsure, say N. 152 If unsure, say N.
153 153
154config KVM_MPIC
155 bool "KVM in-kernel MPIC emulation"
156 depends on KVM && E500
157 select HAVE_KVM_IRQCHIP
158 select HAVE_KVM_IRQ_ROUTING
159 select HAVE_KVM_MSI
160 help
161 Enable support for emulating MPIC devices inside the
162 host kernel, rather than relying on userspace to emulate.
163 Currently, support is limited to certain versions of
164 Freescale's MPIC implementation.
165
166config KVM_XICS
167 bool "KVM in-kernel XICS emulation"
168 depends on KVM_BOOK3S_64 && !KVM_MPIC
169 ---help---
170 Include support for the XICS (eXternal Interrupt Controller
171 Specification) interrupt controller architecture used on
172 IBM POWER (pSeries) servers.
173
154source drivers/vhost/Kconfig 174source drivers/vhost/Kconfig
155 175
156endif # VIRTUALIZATION 176endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index b772eded8c26..422de3f4d46c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -72,12 +72,18 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
72 book3s_hv.o \ 72 book3s_hv.o \
73 book3s_hv_interrupts.o \ 73 book3s_hv_interrupts.o \
74 book3s_64_mmu_hv.o 74 book3s_64_mmu_hv.o
75kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
76 book3s_hv_rm_xics.o
75kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 77kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
76 book3s_hv_rmhandlers.o \ 78 book3s_hv_rmhandlers.o \
77 book3s_hv_rm_mmu.o \ 79 book3s_hv_rm_mmu.o \
78 book3s_64_vio_hv.o \ 80 book3s_64_vio_hv.o \
79 book3s_hv_ras.o \ 81 book3s_hv_ras.o \
80 book3s_hv_builtin.o 82 book3s_hv_builtin.o \
83 $(kvm-book3s_64-builtin-xics-objs-y)
84
85kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
86 book3s_xics.o
81 87
82kvm-book3s_64-module-objs := \ 88kvm-book3s_64-module-objs := \
83 ../../../virt/kvm/kvm_main.o \ 89 ../../../virt/kvm/kvm_main.o \
@@ -86,6 +92,7 @@ kvm-book3s_64-module-objs := \
86 emulate.o \ 92 emulate.o \
87 book3s.o \ 93 book3s.o \
88 book3s_64_vio.o \ 94 book3s_64_vio.o \
95 book3s_rtas.o \
89 $(kvm-book3s_64-objs-y) 96 $(kvm-book3s_64-objs-y)
90 97
91kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) 98kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
@@ -103,6 +110,9 @@ kvm-book3s_32-objs := \
103 book3s_32_mmu.o 110 book3s_32_mmu.o
104kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) 111kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
105 112
113kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
114kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
115
106kvm-objs := $(kvm-objs-m) $(kvm-objs-y) 116kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
107 117
108obj-$(CONFIG_KVM_440) += kvm.o 118obj-$(CONFIG_KVM_440) += kvm.o
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a4b645285240..700df6f1d32c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
104 return prio; 104 return prio;
105} 105}
106 106
107static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, 107void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
108 unsigned int vec) 108 unsigned int vec)
109{ 109{
110 unsigned long old_pending = vcpu->arch.pending_exceptions; 110 unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -160,8 +160,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
160 kvmppc_book3s_queue_irqprio(vcpu, vec); 160 kvmppc_book3s_queue_irqprio(vcpu, vec);
161} 161}
162 162
163void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 163void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
164 struct kvm_interrupt *irq)
165{ 164{
166 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); 165 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
167 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); 166 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
@@ -530,6 +529,21 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
530 val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); 529 val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
531 break; 530 break;
532#endif /* CONFIG_ALTIVEC */ 531#endif /* CONFIG_ALTIVEC */
532 case KVM_REG_PPC_DEBUG_INST: {
533 u32 opcode = INS_TW;
534 r = copy_to_user((u32 __user *)(long)reg->addr,
535 &opcode, sizeof(u32));
536 break;
537 }
538#ifdef CONFIG_KVM_XICS
539 case KVM_REG_PPC_ICP_STATE:
540 if (!vcpu->arch.icp) {
541 r = -ENXIO;
542 break;
543 }
544 val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
545 break;
546#endif /* CONFIG_KVM_XICS */
533 default: 547 default:
534 r = -EINVAL; 548 r = -EINVAL;
535 break; 549 break;
@@ -592,6 +606,16 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
592 vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); 606 vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
593 break; 607 break;
594#endif /* CONFIG_ALTIVEC */ 608#endif /* CONFIG_ALTIVEC */
609#ifdef CONFIG_KVM_XICS
610 case KVM_REG_PPC_ICP_STATE:
611 if (!vcpu->arch.icp) {
612 r = -ENXIO;
613 break;
614 }
615 r = kvmppc_xics_set_icp(vcpu,
616 set_reg_val(reg->id, val));
617 break;
618#endif /* CONFIG_KVM_XICS */
595 default: 619 default:
596 r = -EINVAL; 620 r = -EINVAL;
597 break; 621 break;
@@ -607,6 +631,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
607 return 0; 631 return 0;
608} 632}
609 633
634int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
635 struct kvm_guest_debug *dbg)
636{
637 return -EINVAL;
638}
639
610void kvmppc_decrementer_func(unsigned long data) 640void kvmppc_decrementer_func(unsigned long data)
611{ 641{
612 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 642 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
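The new KVM_REG_PPC_DEBUG_INST case lets userspace ask which opcode to plant as a software breakpoint (INS_TW on Book3S after this change). A hedged userspace sketch of the read side, assuming vcpu_fd is an open vcpu descriptor:

        #include <linux/kvm.h>
        #include <sys/ioctl.h>
        #include <stdint.h>

        /* Illustrative only: fetch the advertised soft-breakpoint instruction */
        static uint32_t get_debug_inst(int vcpu_fd)
        {
                uint32_t opcode = 0;
                struct kvm_one_reg reg = {
                        .id   = KVM_REG_PPC_DEBUG_INST,
                        .addr = (uintptr_t)&opcode,
                };

                if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
                        return 0;       /* no software-breakpoint support */
                return opcode;
        }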
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index da98e26f6e45..5880dfb31074 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -893,7 +893,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
893 /* Harvest R and C */ 893 /* Harvest R and C */
894 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); 894 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
895 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; 895 *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
896 rev[i].guest_rpte = ptel | rcbits; 896 if (rcbits & ~rev[i].guest_rpte) {
897 rev[i].guest_rpte = ptel | rcbits;
898 note_hpte_modification(kvm, &rev[i]);
899 }
897 } 900 }
898 unlock_rmap(rmapp); 901 unlock_rmap(rmapp);
899 hptep[0] &= ~HPTE_V_HVLOCK; 902 hptep[0] &= ~HPTE_V_HVLOCK;
@@ -976,7 +979,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
976 /* Now check and modify the HPTE */ 979 /* Now check and modify the HPTE */
977 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { 980 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
978 kvmppc_clear_ref_hpte(kvm, hptep, i); 981 kvmppc_clear_ref_hpte(kvm, hptep, i);
979 rev[i].guest_rpte |= HPTE_R_R; 982 if (!(rev[i].guest_rpte & HPTE_R_R)) {
983 rev[i].guest_rpte |= HPTE_R_R;
984 note_hpte_modification(kvm, &rev[i]);
985 }
980 ret = 1; 986 ret = 1;
981 } 987 }
982 hptep[0] &= ~HPTE_V_HVLOCK; 988 hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1080,7 +1086,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1080 hptep[1] &= ~HPTE_R_C; 1086 hptep[1] &= ~HPTE_R_C;
1081 eieio(); 1087 eieio();
1082 hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; 1088 hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
1083 rev[i].guest_rpte |= HPTE_R_C; 1089 if (!(rev[i].guest_rpte & HPTE_R_C)) {
1090 rev[i].guest_rpte |= HPTE_R_C;
1091 note_hpte_modification(kvm, &rev[i]);
1092 }
1084 ret = 1; 1093 ret = 1;
1085 } 1094 }
1086 hptep[0] &= ~HPTE_V_HVLOCK; 1095 hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1090,11 +1099,30 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
1090 return ret; 1099 return ret;
1091} 1100}
1092 1101
1102static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1103 struct kvm_memory_slot *memslot,
1104 unsigned long *map)
1105{
1106 unsigned long gfn;
1107
1108 if (!vpa->dirty || !vpa->pinned_addr)
1109 return;
1110 gfn = vpa->gpa >> PAGE_SHIFT;
1111 if (gfn < memslot->base_gfn ||
1112 gfn >= memslot->base_gfn + memslot->npages)
1113 return;
1114
1115 vpa->dirty = false;
1116 if (map)
1117 __set_bit_le(gfn - memslot->base_gfn, map);
1118}
1119
1093long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, 1120long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
1094 unsigned long *map) 1121 unsigned long *map)
1095{ 1122{
1096 unsigned long i; 1123 unsigned long i;
1097 unsigned long *rmapp; 1124 unsigned long *rmapp;
1125 struct kvm_vcpu *vcpu;
1098 1126
1099 preempt_disable(); 1127 preempt_disable();
1100 rmapp = memslot->arch.rmap; 1128 rmapp = memslot->arch.rmap;
@@ -1103,6 +1131,15 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
1103 __set_bit_le(i, map); 1131 __set_bit_le(i, map);
1104 ++rmapp; 1132 ++rmapp;
1105 } 1133 }
1134
1135 /* Harvest dirty bits from VPA and DTL updates */
1136 /* Note: we never modify the SLB shadow buffer areas */
1137 kvm_for_each_vcpu(i, vcpu, kvm) {
1138 spin_lock(&vcpu->arch.vpa_update_lock);
1139 harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
1140 harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
1141 spin_unlock(&vcpu->arch.vpa_update_lock);
1142 }
1106 preempt_enable(); 1143 preempt_enable();
1107 return 0; 1144 return 0;
1108} 1145}
@@ -1114,7 +1151,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1114 unsigned long gfn = gpa >> PAGE_SHIFT; 1151 unsigned long gfn = gpa >> PAGE_SHIFT;
1115 struct page *page, *pages[1]; 1152 struct page *page, *pages[1];
1116 int npages; 1153 int npages;
1117 unsigned long hva, psize, offset; 1154 unsigned long hva, offset;
1118 unsigned long pa; 1155 unsigned long pa;
1119 unsigned long *physp; 1156 unsigned long *physp;
1120 int srcu_idx; 1157 int srcu_idx;
@@ -1146,14 +1183,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1146 } 1183 }
1147 srcu_read_unlock(&kvm->srcu, srcu_idx); 1184 srcu_read_unlock(&kvm->srcu, srcu_idx);
1148 1185
1149 psize = PAGE_SIZE; 1186 offset = gpa & (PAGE_SIZE - 1);
1150 if (PageHuge(page)) {
1151 page = compound_head(page);
1152 psize <<= compound_order(page);
1153 }
1154 offset = gpa & (psize - 1);
1155 if (nb_ret) 1187 if (nb_ret)
1156 *nb_ret = psize - offset; 1188 *nb_ret = PAGE_SIZE - offset;
1157 return page_address(page) + offset; 1189 return page_address(page) + offset;
1158 1190
1159 err: 1191 err:
@@ -1161,11 +1193,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1161 return NULL; 1193 return NULL;
1162} 1194}
1163 1195
1164void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) 1196void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1197 bool dirty)
1165{ 1198{
1166 struct page *page = virt_to_page(va); 1199 struct page *page = virt_to_page(va);
1200 struct kvm_memory_slot *memslot;
1201 unsigned long gfn;
1202 unsigned long *rmap;
1203 int srcu_idx;
1167 1204
1168 put_page(page); 1205 put_page(page);
1206
1207 if (!dirty || !kvm->arch.using_mmu_notifiers)
1208 return;
1209
1210 /* We need to mark this page dirty in the rmap chain */
1211 gfn = gpa >> PAGE_SHIFT;
1212 srcu_idx = srcu_read_lock(&kvm->srcu);
1213 memslot = gfn_to_memslot(kvm, gfn);
1214 if (memslot) {
1215 rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
1216 lock_rmap(rmap);
1217 *rmap |= KVMPPC_RMAP_CHANGED;
1218 unlock_rmap(rmap);
1219 }
1220 srcu_read_unlock(&kvm->srcu, srcu_idx);
1169} 1221}
1170 1222
1171/* 1223/*
@@ -1193,16 +1245,36 @@ struct kvm_htab_ctx {
1193 1245
1194#define HPTE_SIZE (2 * sizeof(unsigned long)) 1246#define HPTE_SIZE (2 * sizeof(unsigned long))
1195 1247
1248/*
1249 * Returns 1 if this HPT entry has been modified or has pending
1250 * R/C bit changes.
1251 */
1252static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
1253{
1254 unsigned long rcbits_unset;
1255
1256 if (revp->guest_rpte & HPTE_GR_MODIFIED)
1257 return 1;
1258
1259 /* Also need to consider changes in reference and changed bits */
1260 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1261 if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
1262 return 1;
1263
1264 return 0;
1265}
1266
1196static long record_hpte(unsigned long flags, unsigned long *hptp, 1267static long record_hpte(unsigned long flags, unsigned long *hptp,
1197 unsigned long *hpte, struct revmap_entry *revp, 1268 unsigned long *hpte, struct revmap_entry *revp,
1198 int want_valid, int first_pass) 1269 int want_valid, int first_pass)
1199{ 1270{
1200 unsigned long v, r; 1271 unsigned long v, r;
1272 unsigned long rcbits_unset;
1201 int ok = 1; 1273 int ok = 1;
1202 int valid, dirty; 1274 int valid, dirty;
1203 1275
1204 /* Unmodified entries are uninteresting except on the first pass */ 1276 /* Unmodified entries are uninteresting except on the first pass */
1205 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1277 dirty = hpte_dirty(revp, hptp);
1206 if (!first_pass && !dirty) 1278 if (!first_pass && !dirty)
1207 return 0; 1279 return 0;
1208 1280
@@ -1223,16 +1295,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp,
1223 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1295 while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1224 cpu_relax(); 1296 cpu_relax();
1225 v = hptp[0]; 1297 v = hptp[0];
1298
1299 /* re-evaluate valid and dirty from synchronized HPTE value */
1300 valid = !!(v & HPTE_V_VALID);
1301 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1302
1303 /* Harvest R and C into guest view if necessary */
1304 rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1305 if (valid && (rcbits_unset & hptp[1])) {
1306 revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
1307 HPTE_GR_MODIFIED;
1308 dirty = 1;
1309 }
1310
1226 if (v & HPTE_V_ABSENT) { 1311 if (v & HPTE_V_ABSENT) {
1227 v &= ~HPTE_V_ABSENT; 1312 v &= ~HPTE_V_ABSENT;
1228 v |= HPTE_V_VALID; 1313 v |= HPTE_V_VALID;
1314 valid = 1;
1229 } 1315 }
1230 /* re-evaluate valid and dirty from synchronized HPTE value */
1231 valid = !!(v & HPTE_V_VALID);
1232 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1316 if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1233 valid = 0; 1317 valid = 0;
1234 r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); 1318
1235 dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1319 r = revp->guest_rpte;
1236 /* only clear modified if this is the right sort of entry */ 1320 /* only clear modified if this is the right sort of entry */
1237 if (valid == want_valid && dirty) { 1321 if (valid == want_valid && dirty) {
1238 r &= ~HPTE_GR_MODIFIED; 1322 r &= ~HPTE_GR_MODIFIED;
@@ -1288,7 +1372,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1288 /* Skip uninteresting entries, i.e. clean on not-first pass */ 1372 /* Skip uninteresting entries, i.e. clean on not-first pass */
1289 if (!first_pass) { 1373 if (!first_pass) {
1290 while (i < kvm->arch.hpt_npte && 1374 while (i < kvm->arch.hpt_npte &&
1291 !(revp->guest_rpte & HPTE_GR_MODIFIED)) { 1375 !hpte_dirty(revp, hptp)) {
1292 ++i; 1376 ++i;
1293 hptp += 2; 1377 hptp += 2;
1294 ++revp; 1378 ++revp;
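On the consuming side, the dirty bits harvested above, including those from VPA and DTL updates, end up in the bitmap returned by the standard KVM_GET_DIRTY_LOG ioctl. A minimal userspace sketch; vm_fd, slot and the bitmap sizing are assumptions for illustration:

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        /* Illustrative only: pull one memslot's dirty bitmap from the kernel */
        static int fetch_dirty_log(int vm_fd, __u32 slot, void *bitmap)
        {
                struct kvm_dirty_log log = {
                        .slot = slot,
                        .dirty_bitmap = bitmap, /* one bit per page in the slot */
                };

                return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        }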
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 836c56975e21..1f6344c4408d 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -194,7 +194,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
194 run->papr_hcall.args[i] = gpr; 194 run->papr_hcall.args[i] = gpr;
195 } 195 }
196 196
197 emulated = EMULATE_DO_PAPR; 197 run->exit_reason = KVM_EXIT_PAPR_HCALL;
198 vcpu->arch.hcall_needed = 1;
199 emulated = EMULATE_EXIT_USER;
198 break; 200 break;
199 } 201 }
200#endif 202#endif
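With EMULATE_EXIT_USER, a PAPR hypercall the kernel does not handle now surfaces to userspace as a KVM_EXIT_PAPR_HCALL exit. A hedged sketch of the corresponding run-loop case; run is the mmap()ed kvm_run structure and handle_hcall() is a hypothetical userspace dispatcher:

        #include <linux/kvm.h>

        /* handle_hcall() is a hypothetical userspace PAPR hcall dispatcher */
        extern __u64 handle_hcall(__u64 nr, __u64 *args);

        static void handle_exit(struct kvm_run *run)
        {
                switch (run->exit_reason) {
                case KVM_EXIT_PAPR_HCALL:
                        /* nr and args[] were filled by the emulation path above;
                         * .ret is copied back into the guest registers on the
                         * next KVM_RUN because hcall_needed was set */
                        run->papr_hcall.ret = handle_hcall(run->papr_hcall.nr,
                                                           run->papr_hcall.args);
                        break;
                default:
                        break;
                }
        }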
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f5416934932b..9de24f8e03c7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -66,6 +66,31 @@
66static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 66static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
67static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 67static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
68 68
69void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
70{
71 int me;
72 int cpu = vcpu->cpu;
73 wait_queue_head_t *wqp;
74
75 wqp = kvm_arch_vcpu_wq(vcpu);
76 if (waitqueue_active(wqp)) {
77 wake_up_interruptible(wqp);
78 ++vcpu->stat.halt_wakeup;
79 }
80
81 me = get_cpu();
82
83 /* CPU points to the first thread of the core */
84 if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
85 int real_cpu = cpu + vcpu->arch.ptid;
86 if (paca[real_cpu].kvm_hstate.xics_phys)
87 xics_wake_cpu(real_cpu);
88 else if (cpu_online(cpu))
89 smp_send_reschedule(cpu);
90 }
91 put_cpu();
92}
93
69/* 94/*
70 * We use the vcpu_load/put functions to measure stolen time. 95 * We use the vcpu_load/put functions to measure stolen time.
71 * Stolen time is counted as time when either the vcpu is able to 96 * Stolen time is counted as time when either the vcpu is able to
@@ -259,7 +284,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
259 len = ((struct reg_vpa *)va)->length.hword; 284 len = ((struct reg_vpa *)va)->length.hword;
260 else 285 else
261 len = ((struct reg_vpa *)va)->length.word; 286 len = ((struct reg_vpa *)va)->length.word;
262 kvmppc_unpin_guest_page(kvm, va); 287 kvmppc_unpin_guest_page(kvm, va, vpa, false);
263 288
264 /* Check length */ 289 /* Check length */
265 if (len > nb || len < sizeof(struct reg_vpa)) 290 if (len > nb || len < sizeof(struct reg_vpa))
@@ -359,13 +384,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
359 va = NULL; 384 va = NULL;
360 nb = 0; 385 nb = 0;
361 if (gpa) 386 if (gpa)
362 va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb); 387 va = kvmppc_pin_guest_page(kvm, gpa, &nb);
363 spin_lock(&vcpu->arch.vpa_update_lock); 388 spin_lock(&vcpu->arch.vpa_update_lock);
364 if (gpa == vpap->next_gpa) 389 if (gpa == vpap->next_gpa)
365 break; 390 break;
366 /* sigh... unpin that one and try again */ 391 /* sigh... unpin that one and try again */
367 if (va) 392 if (va)
368 kvmppc_unpin_guest_page(kvm, va); 393 kvmppc_unpin_guest_page(kvm, va, gpa, false);
369 } 394 }
370 395
371 vpap->update_pending = 0; 396 vpap->update_pending = 0;
@@ -375,12 +400,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
375 * has changed the mappings underlying guest memory, 400 * has changed the mappings underlying guest memory,
376 * so unregister the region. 401 * so unregister the region.
377 */ 402 */
378 kvmppc_unpin_guest_page(kvm, va); 403 kvmppc_unpin_guest_page(kvm, va, gpa, false);
379 va = NULL; 404 va = NULL;
380 } 405 }
381 if (vpap->pinned_addr) 406 if (vpap->pinned_addr)
382 kvmppc_unpin_guest_page(kvm, vpap->pinned_addr); 407 kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
408 vpap->dirty);
409 vpap->gpa = gpa;
383 vpap->pinned_addr = va; 410 vpap->pinned_addr = va;
411 vpap->dirty = false;
384 if (va) 412 if (va)
385 vpap->pinned_end = va + vpap->len; 413 vpap->pinned_end = va + vpap->len;
386} 414}
@@ -472,6 +500,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
472 /* order writing *dt vs. writing vpa->dtl_idx */ 500 /* order writing *dt vs. writing vpa->dtl_idx */
473 smp_wmb(); 501 smp_wmb();
474 vpa->dtl_idx = ++vcpu->arch.dtl_index; 502 vpa->dtl_idx = ++vcpu->arch.dtl_index;
503 vcpu->arch.dtl.dirty = true;
475} 504}
476 505
477int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) 506int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -479,7 +508,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
479 unsigned long req = kvmppc_get_gpr(vcpu, 3); 508 unsigned long req = kvmppc_get_gpr(vcpu, 3);
480 unsigned long target, ret = H_SUCCESS; 509 unsigned long target, ret = H_SUCCESS;
481 struct kvm_vcpu *tvcpu; 510 struct kvm_vcpu *tvcpu;
482 int idx; 511 int idx, rc;
483 512
484 switch (req) { 513 switch (req) {
485 case H_ENTER: 514 case H_ENTER:
@@ -515,6 +544,28 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
515 kvmppc_get_gpr(vcpu, 5), 544 kvmppc_get_gpr(vcpu, 5),
516 kvmppc_get_gpr(vcpu, 6)); 545 kvmppc_get_gpr(vcpu, 6));
517 break; 546 break;
547 case H_RTAS:
548 if (list_empty(&vcpu->kvm->arch.rtas_tokens))
549 return RESUME_HOST;
550
551 rc = kvmppc_rtas_hcall(vcpu);
552
553 if (rc == -ENOENT)
554 return RESUME_HOST;
555 else if (rc == 0)
556 break;
557
558 /* Send the error out to userspace via KVM_RUN */
559 return rc;
560
561 case H_XIRR:
562 case H_CPPR:
563 case H_EOI:
564 case H_IPI:
565 if (kvmppc_xics_enabled(vcpu)) {
566 ret = kvmppc_xics_hcall(vcpu, req);
567 break;
568 } /* fallthrough */
518 default: 569 default:
519 return RESUME_HOST; 570 return RESUME_HOST;
520 } 571 }
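
A rough C sketch of the routing this hunk introduces (names and return values are simplified; the real code also forwards kvmppc_rtas_hcall() errors and -ENOENT back out as RESUME_HOST):

#include <stdbool.h>

enum { RESUME_GUEST, RESUME_HOST };

enum hcall { HC_CEDE, HC_RTAS, HC_XIRR, HC_CPPR, HC_EOI, HC_IPI, HC_OTHER };

/* Which requests now stay in the kernel and which still bounce to userspace. */
int route_hcall(enum hcall req, bool have_rtas_tokens, bool xics_enabled)
{
        switch (req) {
        case HC_RTAS:
                return have_rtas_tokens ? RESUME_GUEST : RESUME_HOST;
        case HC_XIRR:
        case HC_CPPR:
        case HC_EOI:
        case HC_IPI:
                if (xics_enabled)
                        return RESUME_GUEST;
                /* fall through: no in-kernel XICS, let userspace decide */
        default:
                return RESUME_HOST;
        }
}
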
@@ -913,15 +964,19 @@ out:
913 return ERR_PTR(err); 964 return ERR_PTR(err);
914} 965}
915 966
967static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
968{
969 if (vpa->pinned_addr)
970 kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
971 vpa->dirty);
972}
973
916void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) 974void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
917{ 975{
918 spin_lock(&vcpu->arch.vpa_update_lock); 976 spin_lock(&vcpu->arch.vpa_update_lock);
919 if (vcpu->arch.dtl.pinned_addr) 977 unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
920 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr); 978 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
921 if (vcpu->arch.slb_shadow.pinned_addr) 979 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
922 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
923 if (vcpu->arch.vpa.pinned_addr)
924 kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
925 spin_unlock(&vcpu->arch.vpa_update_lock); 980 spin_unlock(&vcpu->arch.vpa_update_lock);
926 kvm_vcpu_uninit(vcpu); 981 kvm_vcpu_uninit(vcpu);
927 kmem_cache_free(kvm_vcpu_cache, vcpu); 982 kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -955,7 +1010,6 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
955} 1010}
956 1011
957extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 1012extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
958extern void xics_wake_cpu(int cpu);
959 1013
960static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 1014static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
961 struct kvm_vcpu *vcpu) 1015 struct kvm_vcpu *vcpu)
@@ -1330,9 +1384,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1330 break; 1384 break;
1331 vc->runner = vcpu; 1385 vc->runner = vcpu;
1332 n_ceded = 0; 1386 n_ceded = 0;
1333 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) 1387 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
1334 if (!v->arch.pending_exceptions) 1388 if (!v->arch.pending_exceptions)
1335 n_ceded += v->arch.ceded; 1389 n_ceded += v->arch.ceded;
1390 else
1391 v->arch.ceded = 0;
1392 }
1336 if (n_ceded == vc->n_runnable) 1393 if (n_ceded == vc->n_runnable)
1337 kvmppc_vcore_blocked(vc); 1394 kvmppc_vcore_blocked(vc);
1338 else 1395 else
@@ -1645,12 +1702,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1645 1702
1646void kvmppc_core_commit_memory_region(struct kvm *kvm, 1703void kvmppc_core_commit_memory_region(struct kvm *kvm,
1647 struct kvm_userspace_memory_region *mem, 1704 struct kvm_userspace_memory_region *mem,
1648 struct kvm_memory_slot old) 1705 const struct kvm_memory_slot *old)
1649{ 1706{
1650 unsigned long npages = mem->memory_size >> PAGE_SHIFT; 1707 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
1651 struct kvm_memory_slot *memslot; 1708 struct kvm_memory_slot *memslot;
1652 1709
1653 if (npages && old.npages) { 1710 if (npages && old->npages) {
1654 /* 1711 /*
1655 * If modifying a memslot, reset all the rmap dirty bits. 1712 * If modifying a memslot, reset all the rmap dirty bits.
1656 * If this is a new memslot, we don't need to do anything 1713 * If this is a new memslot, we don't need to do anything
@@ -1827,6 +1884,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1827 cpumask_setall(&kvm->arch.need_tlb_flush); 1884 cpumask_setall(&kvm->arch.need_tlb_flush);
1828 1885
1829 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1886 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1887 INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
1830 1888
1831 kvm->arch.rma = NULL; 1889 kvm->arch.rma = NULL;
1832 1890
@@ -1872,6 +1930,8 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
1872 kvm->arch.rma = NULL; 1930 kvm->arch.rma = NULL;
1873 } 1931 }
1874 1932
1933 kvmppc_rtas_tokens_free(kvm);
1934
1875 kvmppc_free_hpt(kvm); 1935 kvmppc_free_hpt(kvm);
1876 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); 1936 WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
1877} 1937}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19c93bae1aea..6dcbb49105a4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -97,17 +97,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
97} 97}
98EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 98EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
99 99
100/*
101 * Note modification of an HPTE; set the HPTE modified bit
102 * if anyone is interested.
103 */
104static inline void note_hpte_modification(struct kvm *kvm,
105 struct revmap_entry *rev)
106{
107 if (atomic_read(&kvm->arch.hpte_mod_interest))
108 rev->guest_rpte |= HPTE_GR_MODIFIED;
109}
110
111/* Remove this HPTE from the chain for a real page */ 100/* Remove this HPTE from the chain for a real page */
112static void remove_revmap_chain(struct kvm *kvm, long pte_index, 101static void remove_revmap_chain(struct kvm *kvm, long pte_index,
113 struct revmap_entry *rev, 102 struct revmap_entry *rev,
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 000000000000..b4b0082f761c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,406 @@
1/*
2 * Copyright 2012 Michael Ellerman, IBM Corporation.
3 * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License, version 2, as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/kvm_host.h>
12#include <linux/err.h>
13
14#include <asm/kvm_book3s.h>
15#include <asm/kvm_ppc.h>
16#include <asm/hvcall.h>
17#include <asm/xics.h>
18#include <asm/debug.h>
19#include <asm/synch.h>
20#include <asm/ppc-opcode.h>
21
22#include "book3s_xics.h"
23
24#define DEBUG_PASSUP
25
26static inline void rm_writeb(unsigned long paddr, u8 val)
27{
28 __asm__ __volatile__("sync; stbcix %0,0,%1"
29 : : "r" (val), "r" (paddr) : "memory");
30}
31
32static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
33 struct kvm_vcpu *this_vcpu)
34{
35 struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
36 unsigned long xics_phys;
37 int cpu;
38
39 /* Mark the target VCPU as having an interrupt pending */
40 vcpu->stat.queue_intr++;
41 set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
42
43 /* Kick self ? Just set MER and return */
44 if (vcpu == this_vcpu) {
45 mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
46 return;
47 }
48
49 /* Check if the core is loaded, if not, too hard */
50 cpu = vcpu->cpu;
51 if (cpu < 0 || cpu >= nr_cpu_ids) {
52 this_icp->rm_action |= XICS_RM_KICK_VCPU;
53 this_icp->rm_kick_target = vcpu;
54 return;
55 }
56	/* In SMT, cpu will always point to thread 0; we adjust it */
57 cpu += vcpu->arch.ptid;
58
59 /* Not too hard, then poke the target */
60 xics_phys = paca[cpu].kvm_hstate.xics_phys;
61 rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
62}
63
64static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
65{
66 /* Note: Only called on self ! */
67 clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
68 &vcpu->arch.pending_exceptions);
69 mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
70}
71
72static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
73 union kvmppc_icp_state old,
74 union kvmppc_icp_state new)
75{
76 struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
77 bool success;
78
79 /* Calculate new output value */
80 new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
81
82 /* Attempt atomic update */
83 success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
84 if (!success)
85 goto bail;
86
87 /*
88 * Check for output state update
89 *
90 * Note that this is racy since another processor could be updating
91 * the state already. This is why we never clear the interrupt output
92 * here, we only ever set it. The clear only happens prior to doing
93 * an update and only by the processor itself. Currently we do it
94	 * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
95 *
96 * We also do not try to figure out whether the EE state has changed,
97 * we unconditionally set it if the new state calls for it. The reason
98 * for that is that we opportunistically remove the pending interrupt
99 * flag when raising CPPR, so we need to set it back here if an
100 * interrupt is still pending.
101 */
102 if (new.out_ee)
103 icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
104
105 /* Expose the state change for debug purposes */
106 this_vcpu->arch.icp->rm_dbgstate = new;
107 this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
108
109 bail:
110 return success;
111}
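
The helper above is the core of the ICP emulation: all of the presented state fits in one 64-bit word, so every transition is a copy/modify/cmpxchg loop. A user-space model of that pattern with C11 atomics follows; the field layout is illustrative, not the exact union kvmppc_icp_state from book3s_xics.h.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

union icp_state {
        uint64_t raw;
        struct {
                uint64_t xisr        : 24;  /* pending source, 0 = none */
                uint64_t pending_pri : 8;
                uint64_t cppr        : 8;   /* current processor priority */
                uint64_t mfrr        : 8;
                uint64_t out_ee      : 1;   /* "assert external interrupt" output */
                uint64_t need_resend : 1;
        };
};

static _Atomic uint64_t icp_raw;

static bool icp_try_update(union icp_state old, union icp_state new)
{
        /* recompute the output line from the would-be new state */
        new.out_ee = (new.xisr && new.pending_pri < new.cppr);
        /* one atomic transaction; the caller retries with fresh state on failure */
        return atomic_compare_exchange_strong(&icp_raw, &old.raw, new.raw);
}

/* the caller pattern used throughout the code above */
static void set_cppr(uint8_t cppr)
{
        union icp_state old, new;

        do {
                old.raw = new.raw = atomic_load(&icp_raw);
                new.cppr = cppr;
        } while (!icp_try_update(old, new));
}

int main(void)
{
        set_cppr(0xff);          /* fully open CPPR */
        return 0;
}
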
112
113static inline int check_too_hard(struct kvmppc_xics *xics,
114 struct kvmppc_icp *icp)
115{
116 return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
117}
118
119static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
120 u8 new_cppr)
121{
122 union kvmppc_icp_state old_state, new_state;
123 bool resend;
124
125 /*
126 * This handles several related states in one operation:
127 *
128 * ICP State: Down_CPPR
129 *
130 * Load CPPR with new value and if the XISR is 0
131 * then check for resends:
132 *
133 * ICP State: Resend
134 *
135 * If MFRR is more favored than CPPR, check for IPIs
136 * and notify ICS of a potential resend. This is done
137 * asynchronously (when used in real mode, we will have
138 * to exit here).
139 *
140 * We do not handle the complete Check_IPI as documented
141 * here. In the PAPR, this state will be used for both
142 * Set_MFRR and Down_CPPR. However, we know that we aren't
143 * changing the MFRR state here so we don't need to handle
144 * the case of an MFRR causing a reject of a pending irq,
145 * this will have been handled when the MFRR was set in the
146 * first place.
147 *
148 * Thus we don't have to handle rejects, only resends.
149 *
150 * When implementing real mode for HV KVM, resend will lead to
151 * a H_TOO_HARD return and the whole transaction will be handled
152 * in virtual mode.
153 */
154 do {
155 old_state = new_state = ACCESS_ONCE(icp->state);
156
157 /* Down_CPPR */
158 new_state.cppr = new_cppr;
159
160 /*
161 * Cut down Resend / Check_IPI / IPI
162 *
163 * The logic is that we cannot have a pending interrupt
164 * trumped by an IPI at this point (see above), so we
165 * know that either the pending interrupt is already an
166 * IPI (in which case we don't care to override it) or
167		 * it's either more favored than us or non-existent
168 */
169 if (new_state.mfrr < new_cppr &&
170 new_state.mfrr <= new_state.pending_pri) {
171 new_state.pending_pri = new_state.mfrr;
172 new_state.xisr = XICS_IPI;
173 }
174
175 /* Latch/clear resend bit */
176 resend = new_state.need_resend;
177 new_state.need_resend = 0;
178
179 } while (!icp_rm_try_update(icp, old_state, new_state));
180
181 /*
182 * Now handle resend checks. Those are asynchronous to the ICP
183 * state update in HW (ie bus transactions) so we can handle them
184 * separately here as well.
185 */
186 if (resend)
187 icp->rm_action |= XICS_RM_CHECK_RESEND;
188}
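
The state update inside the loop above, written out as a plain C sketch (the cmpxchg retry and the queuing of XICS_RM_CHECK_RESEND are omitted):

#include <stdbool.h>
#include <stdint.h>

#define XICS_IPI 2               /* interrupt source number used for IPIs */

struct icp_sketch {
        uint8_t  cppr, mfrr, pending_pri;
        uint32_t xisr;
        bool     need_resend;
};

/* Lower the CPPR; if the MFRR is now deliverable and at least as favored as
 * whatever is pending, the pending interrupt becomes an IPI.  Returns the
 * latched resend flag so the caller can queue a resend check. */
static bool down_cppr(struct icp_sketch *s, uint8_t new_cppr)
{
        bool resend;

        s->cppr = new_cppr;
        if (s->mfrr < new_cppr && s->mfrr <= s->pending_pri) {
                s->pending_pri = s->mfrr;
                s->xisr = XICS_IPI;
        }
        resend = s->need_resend;
        s->need_resend = false;
        return resend;
}

int main(void)
{
        struct icp_sketch s = { .cppr = 0, .mfrr = 0x05,
                                .pending_pri = 0xff, .xisr = 0 };
        return down_cppr(&s, 0xff) ? 1 : 0;   /* opens CPPR; the MFRR turns into a pending IPI */
}
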
189
190
191unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
192{
193 union kvmppc_icp_state old_state, new_state;
194 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
195 struct kvmppc_icp *icp = vcpu->arch.icp;
196 u32 xirr;
197
198 if (!xics || !xics->real_mode)
199 return H_TOO_HARD;
200
201 /* First clear the interrupt */
202 icp_rm_clr_vcpu_irq(icp->vcpu);
203
204 /*
205 * ICP State: Accept_Interrupt
206 *
207 * Return the pending interrupt (if any) along with the
208 * current CPPR, then clear the XISR & set CPPR to the
209 * pending priority
210 */
211 do {
212 old_state = new_state = ACCESS_ONCE(icp->state);
213
214 xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
215 if (!old_state.xisr)
216 break;
217 new_state.cppr = new_state.pending_pri;
218 new_state.pending_pri = 0xff;
219 new_state.xisr = 0;
220
221 } while (!icp_rm_try_update(icp, old_state, new_state));
222
223 /* Return the result in GPR4 */
224 vcpu->arch.gpr[4] = xirr;
225
226 return check_too_hard(xics, icp);
227}
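
The XIRR value returned in GPR4 packs the 24-bit source number with the previous CPPR in the top byte; a small illustrative helper:

#include <stdint.h>
#include <assert.h>

static inline uint32_t xirr_pack(uint32_t xisr, uint8_t cppr)
{
        return (xisr & 0x00ffffffu) | ((uint32_t)cppr << 24);
}

static inline uint32_t xirr_source(uint32_t xirr) { return xirr & 0x00ffffffu; }
static inline uint8_t  xirr_cppr(uint32_t xirr)   { return xirr >> 24; }

int main(void)
{
        uint32_t xirr = xirr_pack(0x1234, 0x05);

        assert(xirr_source(xirr) == 0x1234);   /* interrupt source number */
        assert(xirr_cppr(xirr) == 0x05);       /* CPPR at accept time */
        return 0;
}
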
228
229int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
230 unsigned long mfrr)
231{
232 union kvmppc_icp_state old_state, new_state;
233 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
234 struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
235 u32 reject;
236 bool resend;
237 bool local;
238
239 if (!xics || !xics->real_mode)
240 return H_TOO_HARD;
241
242 local = this_icp->server_num == server;
243 if (local)
244 icp = this_icp;
245 else
246 icp = kvmppc_xics_find_server(vcpu->kvm, server);
247 if (!icp)
248 return H_PARAMETER;
249
250 /*
251 * ICP state: Set_MFRR
252 *
253 * If the CPPR is more favored than the new MFRR, then
254 * nothing needs to be done as there can be no XISR to
255 * reject.
256 *
257 * If the CPPR is less favored, then we might be replacing
258 * an interrupt, and thus need to possibly reject it as in
259 *
260 * ICP state: Check_IPI
261 */
262 do {
263 old_state = new_state = ACCESS_ONCE(icp->state);
264
265 /* Set_MFRR */
266 new_state.mfrr = mfrr;
267
268 /* Check_IPI */
269 reject = 0;
270 resend = false;
271 if (mfrr < new_state.cppr) {
272 /* Reject a pending interrupt if not an IPI */
273 if (mfrr <= new_state.pending_pri)
274 reject = new_state.xisr;
275 new_state.pending_pri = mfrr;
276 new_state.xisr = XICS_IPI;
277 }
278
279 if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
280 resend = new_state.need_resend;
281 new_state.need_resend = 0;
282 }
283 } while (!icp_rm_try_update(icp, old_state, new_state));
284
285 /* Pass rejects to virtual mode */
286 if (reject && reject != XICS_IPI) {
287 this_icp->rm_action |= XICS_RM_REJECT;
288 this_icp->rm_reject = reject;
289 }
290
291 /* Pass resends to virtual mode */
292 if (resend)
293 this_icp->rm_action |= XICS_RM_CHECK_RESEND;
294
295 return check_too_hard(xics, this_icp);
296}
297
298int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
299{
300 union kvmppc_icp_state old_state, new_state;
301 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
302 struct kvmppc_icp *icp = vcpu->arch.icp;
303 u32 reject;
304
305 if (!xics || !xics->real_mode)
306 return H_TOO_HARD;
307
308 /*
309 * ICP State: Set_CPPR
310 *
311 * We can safely compare the new value with the current
312 * value outside of the transaction as the CPPR is only
313 * ever changed by the processor on itself
314 */
315 if (cppr > icp->state.cppr) {
316 icp_rm_down_cppr(xics, icp, cppr);
317 goto bail;
318 } else if (cppr == icp->state.cppr)
319 return H_SUCCESS;
320
321 /*
322 * ICP State: Up_CPPR
323 *
324 * The processor is raising its priority, this can result
325 * in a rejection of a pending interrupt:
326 *
327 * ICP State: Reject_Current
328 *
329 * We can remove EE from the current processor, the update
330 * transaction will set it again if needed
331 */
332 icp_rm_clr_vcpu_irq(icp->vcpu);
333
334 do {
335 old_state = new_state = ACCESS_ONCE(icp->state);
336
337 reject = 0;
338 new_state.cppr = cppr;
339
340 if (cppr <= new_state.pending_pri) {
341 reject = new_state.xisr;
342 new_state.xisr = 0;
343 new_state.pending_pri = 0xff;
344 }
345
346 } while (!icp_rm_try_update(icp, old_state, new_state));
347
348 /* Pass rejects to virtual mode */
349 if (reject && reject != XICS_IPI) {
350 icp->rm_action |= XICS_RM_REJECT;
351 icp->rm_reject = reject;
352 }
353 bail:
354 return check_too_hard(xics, icp);
355}
356
357int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
358{
359 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
360 struct kvmppc_icp *icp = vcpu->arch.icp;
361 struct kvmppc_ics *ics;
362 struct ics_irq_state *state;
363 u32 irq = xirr & 0x00ffffff;
364 u16 src;
365
366 if (!xics || !xics->real_mode)
367 return H_TOO_HARD;
368
369 /*
370 * ICP State: EOI
371 *
372 * Note: If EOI is incorrectly used by SW to lower the CPPR
373 * value (ie more favored), we do not check for rejection of
374	 * a pending interrupt; this is a SW error and PAPR specifies
375 * that we don't have to deal with it.
376 *
377 * The sending of an EOI to the ICS is handled after the
378 * CPPR update
379 *
380 * ICP State: Down_CPPR which we handle
381 * in a separate function as it's shared with H_CPPR.
382 */
383 icp_rm_down_cppr(xics, icp, xirr >> 24);
384
385 /* IPIs have no EOI */
386 if (irq == XICS_IPI)
387 goto bail;
388 /*
389 * EOI handling: If the interrupt is still asserted, we need to
390 * resend it. We can take a lockless "peek" at the ICS state here.
391 *
392 * "Message" interrupts will never have "asserted" set
393 */
394 ics = kvmppc_xics_find_ics(xics, irq, &src);
395 if (!ics)
396 goto bail;
397 state = &ics->irq_state[src];
398
399	/* Still asserted: resend it; we make it look like a reject */
400 if (state->asserted) {
401 icp->rm_action |= XICS_RM_REJECT;
402 icp->rm_reject = irq;
403 }
404 bail:
405 return check_too_hard(xics, icp);
406}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e33d11f1b977..b02f91e4c70d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -79,10 +79,6 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
79 * * 79 * *
80 *****************************************************************************/ 80 *****************************************************************************/
81 81
82#define XICS_XIRR 4
83#define XICS_QIRR 0xc
84#define XICS_IPI 2 /* interrupt source # for IPIs */
85
86/* 82/*
87 * We come in here when wakened from nap mode on a secondary hw thread. 83 * We come in here when wakened from nap mode on a secondary hw thread.
88 * Relocation is off and most register values are lost. 84 * Relocation is off and most register values are lost.
@@ -101,50 +97,51 @@ kvm_start_guest:
101 li r0,1 97 li r0,1
102 stb r0,PACA_NAPSTATELOST(r13) 98 stb r0,PACA_NAPSTATELOST(r13)
103 99
104 /* get vcpu pointer, NULL if we have no vcpu to run */ 100 /* were we napping due to cede? */
105 ld r4,HSTATE_KVM_VCPU(r13) 101 lbz r0,HSTATE_NAPPING(r13)
106 cmpdi cr1,r4,0 102 cmpwi r0,0
103 bne kvm_end_cede
104
105 /*
106 * We weren't napping due to cede, so this must be a secondary
107 * thread being woken up to run a guest, or being woken up due
108 * to a stray IPI. (Or due to some machine check or hypervisor
109 * maintenance interrupt while the core is in KVM.)
110 */
107 111
108 /* Check the wake reason in SRR1 to see why we got here */ 112 /* Check the wake reason in SRR1 to see why we got here */
109 mfspr r3,SPRN_SRR1 113 mfspr r3,SPRN_SRR1
110 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ 114 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
111 cmpwi r3,4 /* was it an external interrupt? */ 115 cmpwi r3,4 /* was it an external interrupt? */
112 bne 27f 116 bne 27f /* if not */
113 117 ld r5,HSTATE_XICS_PHYS(r13)
114 /* 118 li r7,XICS_XIRR /* if it was an external interrupt, */
115 * External interrupt - for now assume it is an IPI, since we
116 * should never get any other interrupts sent to offline threads.
117 * Only do this for secondary threads.
118 */
119 beq cr1,25f
120 lwz r3,VCPU_PTID(r4)
121 cmpwi r3,0
122 beq 27f
12325: ld r5,HSTATE_XICS_PHYS(r13)
124 li r0,0xff
125 li r6,XICS_QIRR
126 li r7,XICS_XIRR
127 lwzcix r8,r5,r7 /* get and ack the interrupt */ 119 lwzcix r8,r5,r7 /* get and ack the interrupt */
128 sync 120 sync
129 clrldi. r9,r8,40 /* get interrupt source ID. */ 121 clrldi. r9,r8,40 /* get interrupt source ID. */
130 beq 27f /* none there? */ 122 beq 28f /* none there? */
131 cmpwi r9,XICS_IPI 123 cmpwi r9,XICS_IPI /* was it an IPI? */
132 bne 26f 124 bne 29f
125 li r0,0xff
126 li r6,XICS_MFRR
133 stbcix r0,r5,r6 /* clear IPI */ 127 stbcix r0,r5,r6 /* clear IPI */
13426: stwcix r8,r5,r7 /* EOI the interrupt */ 128 stwcix r8,r5,r7 /* EOI the interrupt */
135 129 sync /* order loading of vcpu after that */
13627: /* XXX should handle hypervisor maintenance interrupts etc. here */
137 130
138 /* reload vcpu pointer after clearing the IPI */ 131 /* get vcpu pointer, NULL if we have no vcpu to run */
139 ld r4,HSTATE_KVM_VCPU(r13) 132 ld r4,HSTATE_KVM_VCPU(r13)
140 cmpdi r4,0 133 cmpdi r4,0
141 /* if we have no vcpu to run, go back to sleep */ 134 /* if we have no vcpu to run, go back to sleep */
142 beq kvm_no_guest 135 beq kvm_no_guest
136 b kvmppc_hv_entry
143 137
144 /* were we napping due to cede? */ 13827: /* XXX should handle hypervisor maintenance interrupts etc. here */
145 lbz r0,HSTATE_NAPPING(r13) 139 b kvm_no_guest
146 cmpwi r0,0 14028: /* SRR1 said external but ICP said nope?? */
147 bne kvm_end_cede 141 b kvm_no_guest
14229: /* External non-IPI interrupt to offline secondary thread? help?? */
143 stw r8,HSTATE_SAVED_XIRR(r13)
144 b kvm_no_guest
148 145
149.global kvmppc_hv_entry 146.global kvmppc_hv_entry
150kvmppc_hv_entry: 147kvmppc_hv_entry:
@@ -260,6 +257,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
260 lwz r5, LPPACA_YIELDCOUNT(r3) 257 lwz r5, LPPACA_YIELDCOUNT(r3)
261 addi r5, r5, 1 258 addi r5, r5, 1
262 stw r5, LPPACA_YIELDCOUNT(r3) 259 stw r5, LPPACA_YIELDCOUNT(r3)
260 li r6, 1
261 stb r6, VCPU_VPA_DIRTY(r4)
26325: 26225:
264 /* Load up DAR and DSISR */ 263 /* Load up DAR and DSISR */
265 ld r5, VCPU_DAR(r4) 264 ld r5, VCPU_DAR(r4)
@@ -485,20 +484,20 @@ toc_tlbie_lock:
485 mtctr r6 484 mtctr r6
486 mtxer r7 485 mtxer r7
487 486
487 ld r10, VCPU_PC(r4)
488 ld r11, VCPU_MSR(r4)
488kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ 489kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
489 ld r6, VCPU_SRR0(r4) 490 ld r6, VCPU_SRR0(r4)
490 ld r7, VCPU_SRR1(r4) 491 ld r7, VCPU_SRR1(r4)
491 ld r10, VCPU_PC(r4)
492 ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */
493 492
493 /* r11 = vcpu->arch.msr & ~MSR_HV */
494 rldicl r11, r11, 63 - MSR_HV_LG, 1 494 rldicl r11, r11, 63 - MSR_HV_LG, 1
495 rotldi r11, r11, 1 + MSR_HV_LG 495 rotldi r11, r11, 1 + MSR_HV_LG
496 ori r11, r11, MSR_ME 496 ori r11, r11, MSR_ME
497 497
498 /* Check if we can deliver an external or decrementer interrupt now */ 498 /* Check if we can deliver an external or decrementer interrupt now */
499 ld r0,VCPU_PENDING_EXC(r4) 499 ld r0,VCPU_PENDING_EXC(r4)
500 li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) 500 lis r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
501 oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
502 and r0,r0,r8 501 and r0,r0,r8
503 cmpdi cr1,r0,0 502 cmpdi cr1,r0,0
504 andi. r0,r11,MSR_EE 503 andi. r0,r11,MSR_EE
@@ -526,10 +525,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
526 /* Move SRR0 and SRR1 into the respective regs */ 525 /* Move SRR0 and SRR1 into the respective regs */
5275: mtspr SPRN_SRR0, r6 5265: mtspr SPRN_SRR0, r6
528 mtspr SPRN_SRR1, r7 527 mtspr SPRN_SRR1, r7
529 li r0,0
530 stb r0,VCPU_CEDED(r4) /* cancel cede */
531 528
532fast_guest_return: 529fast_guest_return:
530 li r0,0
531 stb r0,VCPU_CEDED(r4) /* cancel cede */
533 mtspr SPRN_HSRR0,r10 532 mtspr SPRN_HSRR0,r10
534 mtspr SPRN_HSRR1,r11 533 mtspr SPRN_HSRR1,r11
535 534
@@ -676,17 +675,99 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
676 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 675 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
677 beq hcall_try_real_mode 676 beq hcall_try_real_mode
678 677
679 /* Check for mediated interrupts (could be done earlier really ...) */ 678 /* Only handle external interrupts here on arch 206 and later */
680BEGIN_FTR_SECTION 679BEGIN_FTR_SECTION
681 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL 680 b ext_interrupt_to_host
682 bne+ 1f 681END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
683 andi. r0,r11,MSR_EE 682
684 beq 1f 683 /* External interrupt ? */
685 mfspr r5,SPRN_LPCR 684 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
686 andi. r0,r5,LPCR_MER 685 bne+ ext_interrupt_to_host
687 bne bounce_ext_interrupt 686
6881: 687 /* External interrupt, first check for host_ipi. If this is
689END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 688 * set, we know the host wants us out so let's do it now
689 */
690do_ext_interrupt:
691 lbz r0, HSTATE_HOST_IPI(r13)
692 cmpwi r0, 0
693 bne ext_interrupt_to_host
694
695 /* Now read the interrupt from the ICP */
696 ld r5, HSTATE_XICS_PHYS(r13)
697 li r7, XICS_XIRR
698 cmpdi r5, 0
699 beq- ext_interrupt_to_host
700 lwzcix r3, r5, r7
701 rlwinm. r0, r3, 0, 0xffffff
702 sync
703 beq 3f /* if nothing pending in the ICP */
704
705 /* We found something in the ICP...
706 *
707 * If it's not an IPI, stash it in the PACA and return to
708 * the host, we don't (yet) handle directing real external
709 * interrupts directly to the guest
710 */
711 cmpwi r0, XICS_IPI
712 bne ext_stash_for_host
713
714 /* It's an IPI, clear the MFRR and EOI it */
715 li r0, 0xff
716 li r6, XICS_MFRR
717 stbcix r0, r5, r6 /* clear the IPI */
718 stwcix r3, r5, r7 /* EOI it */
719 sync
720
721 /* We need to re-check host IPI now in case it got set in the
722 * meantime. If it's clear, we bounce the interrupt to the
723 * guest
724 */
725 lbz r0, HSTATE_HOST_IPI(r13)
726 cmpwi r0, 0
727 bne- 1f
728
729	/* All right, looks like an IPI for the guest, we need to set MER */
7303:
731 /* Check if any CPU is heading out to the host, if so head out too */
732 ld r5, HSTATE_KVM_VCORE(r13)
733 lwz r0, VCORE_ENTRY_EXIT(r5)
734 cmpwi r0, 0x100
735 bge ext_interrupt_to_host
736
737 /* See if there is a pending interrupt for the guest */
738 mfspr r8, SPRN_LPCR
739 ld r0, VCPU_PENDING_EXC(r9)
740 /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
741 rldicl. r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
742 rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
743 beq 2f
744
745 /* And if the guest EE is set, we can deliver immediately, else
746 * we return to the guest with MER set
747 */
748 andi. r0, r11, MSR_EE
749 beq 2f
750 mtspr SPRN_SRR0, r10
751 mtspr SPRN_SRR1, r11
752 li r10, BOOK3S_INTERRUPT_EXTERNAL
753 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
754 rotldi r11, r11, 63
7552: mr r4, r9
756 mtspr SPRN_LPCR, r8
757 b fast_guest_return
758
759 /* We raced with the host, we need to resend that IPI, bummer */
7601: li r0, IPI_PRIORITY
761 stbcix r0, r5, r6 /* set the IPI */
762 sync
763 b ext_interrupt_to_host
764
765ext_stash_for_host:
766 /* It's not an IPI and it's for the host, stash it in the PACA
767 * before exit, it will be picked up by the host ICP driver
768 */
769 stw r3, HSTATE_SAVED_XIRR(r13)
770ext_interrupt_to_host:
690 771
691guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 772guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
692 /* Save DEC */ 773 /* Save DEC */
@@ -829,7 +910,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
829 beq 44f 910 beq 44f
830 ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 911 ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
831 li r0,IPI_PRIORITY 912 li r0,IPI_PRIORITY
832 li r7,XICS_QIRR 913 li r7,XICS_MFRR
833 stbcix r0,r7,r8 /* trigger the IPI */ 914 stbcix r0,r7,r8 /* trigger the IPI */
83444: srdi. r3,r3,1 91544: srdi. r3,r3,1
835 addi r6,r6,PACA_SIZE 916 addi r6,r6,PACA_SIZE
@@ -1018,6 +1099,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
1018 lwz r3, LPPACA_YIELDCOUNT(r8) 1099 lwz r3, LPPACA_YIELDCOUNT(r8)
1019 addi r3, r3, 1 1100 addi r3, r3, 1
1020 stw r3, LPPACA_YIELDCOUNT(r8) 1101 stw r3, LPPACA_YIELDCOUNT(r8)
1102 li r3, 1
1103 stb r3, VCPU_VPA_DIRTY(r9)
102125: 110425:
1022 /* Save PMU registers if requested */ 1105 /* Save PMU registers if requested */
1023 /* r8 and cr0.eq are live here */ 1106 /* r8 and cr0.eq are live here */
@@ -1350,11 +1433,19 @@ hcall_real_table:
1350 .long 0 /* 0x58 */ 1433 .long 0 /* 0x58 */
1351 .long 0 /* 0x5c */ 1434 .long 0 /* 0x5c */
1352 .long 0 /* 0x60 */ 1435 .long 0 /* 0x60 */
1353 .long 0 /* 0x64 */ 1436#ifdef CONFIG_KVM_XICS
1354 .long 0 /* 0x68 */ 1437 .long .kvmppc_rm_h_eoi - hcall_real_table
1355 .long 0 /* 0x6c */ 1438 .long .kvmppc_rm_h_cppr - hcall_real_table
1356 .long 0 /* 0x70 */ 1439 .long .kvmppc_rm_h_ipi - hcall_real_table
1357 .long 0 /* 0x74 */ 1440 .long 0 /* 0x70 - H_IPOLL */
1441 .long .kvmppc_rm_h_xirr - hcall_real_table
1442#else
1443 .long 0 /* 0x64 - H_EOI */
1444 .long 0 /* 0x68 - H_CPPR */
1445 .long 0 /* 0x6c - H_IPI */
1446 .long 0 /* 0x70 - H_IPOLL */
1447 .long 0 /* 0x74 - H_XIRR */
1448#endif
1358 .long 0 /* 0x78 */ 1449 .long 0 /* 0x78 */
1359 .long 0 /* 0x7c */ 1450 .long 0 /* 0x7c */
1360 .long 0 /* 0x80 */ 1451 .long 0 /* 0x80 */
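
hcall numbers are multiples of 4, so the table above is indexed by number/4, with a zero entry meaning "not handled in real mode". A hedged C sketch of that dispatch (the constants come from the comments above; table size, handler and helper names are illustrative):

#include <stddef.h>

#define H_EOI   0x64
#define H_CPPR  0x68
#define H_IPI   0x6c
#define H_IPOLL 0x70
#define H_XIRR  0x74

typedef long (*rm_hcall_fn)(void *vcpu);

static long rm_h_eoi(void *vcpu)  { (void)vcpu; return 0; }   /* placeholders for the   */
static long rm_h_cppr(void *vcpu) { (void)vcpu; return 0; }   /* real-mode handlers in  */
static long rm_h_ipi(void *vcpu)  { (void)vcpu; return 0; }   /* book3s_hv_rm_xics.c    */
static long rm_h_xirr(void *vcpu) { (void)vcpu; return 0; }

static rm_hcall_fn hcall_rm_table[0x80 / 4] = {
        [H_EOI  / 4] = rm_h_eoi,
        [H_CPPR / 4] = rm_h_cppr,
        [H_IPI  / 4] = rm_h_ipi,
        /* H_IPOLL (0x70) stays NULL, matching the zero entry above */
        [H_XIRR / 4] = rm_h_xirr,
};

long try_real_mode_hcall(unsigned long req, void *vcpu)
{
        size_t n = sizeof(hcall_rm_table) / sizeof(hcall_rm_table[0]);

        if ((req & 3) || req / 4 >= n || !hcall_rm_table[req / 4])
                return -1;      /* no real-mode handler: fall back to virtual mode */
        return hcall_rm_table[req / 4](vcpu);
}
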
@@ -1405,15 +1496,6 @@ ignore_hdec:
1405 mr r4,r9 1496 mr r4,r9
1406 b fast_guest_return 1497 b fast_guest_return
1407 1498
1408bounce_ext_interrupt:
1409 mr r4,r9
1410 mtspr SPRN_SRR0,r10
1411 mtspr SPRN_SRR1,r11
1412 li r10,BOOK3S_INTERRUPT_EXTERNAL
1413 li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1414 rotldi r11,r11,63
1415 b fast_guest_return
1416
1417_GLOBAL(kvmppc_h_set_dabr) 1499_GLOBAL(kvmppc_h_set_dabr)
1418 std r4,VCPU_DABR(r3) 1500 std r4,VCPU_DABR(r3)
1419 /* Work around P7 bug where DABR can get corrupted on mtspr */ 1501 /* Work around P7 bug where DABR can get corrupted on mtspr */
@@ -1519,6 +1601,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
1519 b . 1601 b .
1520 1602
1521kvm_end_cede: 1603kvm_end_cede:
1604 /* get vcpu pointer */
1605 ld r4, HSTATE_KVM_VCPU(r13)
1606
1522 /* Woken by external or decrementer interrupt */ 1607 /* Woken by external or decrementer interrupt */
1523 ld r1, HSTATE_HOST_R1(r13) 1608 ld r1, HSTATE_HOST_R1(r13)
1524 1609
@@ -1558,6 +1643,16 @@ kvm_end_cede:
1558 li r0,0 1643 li r0,0
1559 stb r0,HSTATE_NAPPING(r13) 1644 stb r0,HSTATE_NAPPING(r13)
1560 1645
1646 /* Check the wake reason in SRR1 to see why we got here */
1647 mfspr r3, SPRN_SRR1
1648 rlwinm r3, r3, 44-31, 0x7 /* extract wake reason field */
1649 cmpwi r3, 4 /* was it an external interrupt? */
1650 li r12, BOOK3S_INTERRUPT_EXTERNAL
1651 mr r9, r4
1652 ld r10, VCPU_PC(r9)
1653 ld r11, VCPU_MSR(r9)
1654 beq do_ext_interrupt /* if so */
1655
1561 /* see if any other thread is already exiting */ 1656 /* see if any other thread is already exiting */
1562 lwz r0,VCORE_ENTRY_EXIT(r5) 1657 lwz r0,VCORE_ENTRY_EXIT(r5)
1563 cmpwi r0,0x100 1658 cmpwi r0,0x100
@@ -1577,8 +1672,7 @@ kvm_cede_prodded:
1577 1672
1578 /* we've ceded but we want to give control to the host */ 1673 /* we've ceded but we want to give control to the host */
1579kvm_cede_exit: 1674kvm_cede_exit:
1580 li r3,H_TOO_HARD 1675 b hcall_real_fallback
1581 blr
1582 1676
1583 /* Try to handle a machine check in real mode */ 1677 /* Try to handle a machine check in real mode */
1584machine_check_realmode: 1678machine_check_realmode:
@@ -1626,7 +1720,7 @@ secondary_nap:
1626 beq 37f 1720 beq 37f
1627 sync 1721 sync
1628 li r0, 0xff 1722 li r0, 0xff
1629 li r6, XICS_QIRR 1723 li r6, XICS_MFRR
1630 stbcix r0, r5, r6 /* clear the IPI */ 1724 stbcix r0, r5, r6 /* clear the IPI */
1631 stwcix r3, r5, r7 /* EOI it */ 1725 stwcix r3, r5, r7 /* EOI it */
163237: sync 172637: sync
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index dbdc15aa8127..bdc40b8e77d9 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,9 +762,7 @@ program_interrupt:
762 run->exit_reason = KVM_EXIT_MMIO; 762 run->exit_reason = KVM_EXIT_MMIO;
763 r = RESUME_HOST_NV; 763 r = RESUME_HOST_NV;
764 break; 764 break;
765 case EMULATE_DO_PAPR: 765 case EMULATE_EXIT_USER:
766 run->exit_reason = KVM_EXIT_PAPR_HCALL;
767 vcpu->arch.hcall_needed = 1;
768 r = RESUME_HOST_NV; 766 r = RESUME_HOST_NV;
769 break; 767 break;
770 default: 768 default:
@@ -1283,7 +1281,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1283 1281
1284void kvmppc_core_commit_memory_region(struct kvm *kvm, 1282void kvmppc_core_commit_memory_region(struct kvm *kvm,
1285 struct kvm_userspace_memory_region *mem, 1283 struct kvm_userspace_memory_region *mem,
1286 struct kvm_memory_slot old) 1284 const struct kvm_memory_slot *old)
1287{ 1285{
1288} 1286}
1289 1287
@@ -1298,6 +1296,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
1298{ 1296{
1299#ifdef CONFIG_PPC64 1297#ifdef CONFIG_PPC64
1300 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1298 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
1299 INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
1301#endif 1300#endif
1302 1301
1303 if (firmware_has_feature(FW_FEATURE_SET_MODE)) { 1302 if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ee02b30878ed..b24309c6c2d5 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
227 return EMULATE_DONE; 227 return EMULATE_DONE;
228} 228}
229 229
230static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
231{
232 long rc = kvmppc_xics_hcall(vcpu, cmd);
233 kvmppc_set_gpr(vcpu, 3, rc);
234 return EMULATE_DONE;
235}
236
230int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) 237int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
231{ 238{
232 switch (cmd) { 239 switch (cmd) {
@@ -246,6 +253,20 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
246 clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 253 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
247 vcpu->stat.halt_wakeup++; 254 vcpu->stat.halt_wakeup++;
248 return EMULATE_DONE; 255 return EMULATE_DONE;
256 case H_XIRR:
257 case H_CPPR:
258 case H_EOI:
259 case H_IPI:
260 if (kvmppc_xics_enabled(vcpu))
261 return kvmppc_h_pr_xics_hcall(vcpu, cmd);
262 break;
263 case H_RTAS:
264 if (list_empty(&vcpu->kvm->arch.rtas_tokens))
265 return RESUME_HOST;
266 if (kvmppc_rtas_hcall(vcpu))
267 break;
268 kvmppc_set_gpr(vcpu, 3, 0);
269 return EMULATE_DONE;
249 } 270 }
250 271
251 return EMULATE_FAIL; 272 return EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 000000000000..3219ba895246
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,274 @@
1/*
2 * Copyright 2012 Michael Ellerman, IBM Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2, as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/kernel.h>
10#include <linux/kvm_host.h>
11#include <linux/kvm.h>
12#include <linux/err.h>
13
14#include <asm/uaccess.h>
15#include <asm/kvm_book3s.h>
16#include <asm/kvm_ppc.h>
17#include <asm/hvcall.h>
18#include <asm/rtas.h>
19
20#ifdef CONFIG_KVM_XICS
21static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
22{
23 u32 irq, server, priority;
24 int rc;
25
26 if (args->nargs != 3 || args->nret != 1) {
27 rc = -3;
28 goto out;
29 }
30
31 irq = args->args[0];
32 server = args->args[1];
33 priority = args->args[2];
34
35 rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
36 if (rc)
37 rc = -3;
38out:
39 args->rets[0] = rc;
40}
41
42static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
43{
44 u32 irq, server, priority;
45 int rc;
46
47 if (args->nargs != 1 || args->nret != 3) {
48 rc = -3;
49 goto out;
50 }
51
52 irq = args->args[0];
53
54 server = priority = 0;
55 rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
56 if (rc) {
57 rc = -3;
58 goto out;
59 }
60
61 args->rets[1] = server;
62 args->rets[2] = priority;
63out:
64 args->rets[0] = rc;
65}
66
67static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
68{
69 u32 irq;
70 int rc;
71
72 if (args->nargs != 1 || args->nret != 1) {
73 rc = -3;
74 goto out;
75 }
76
77 irq = args->args[0];
78
79 rc = kvmppc_xics_int_off(vcpu->kvm, irq);
80 if (rc)
81 rc = -3;
82out:
83 args->rets[0] = rc;
84}
85
86static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
87{
88 u32 irq;
89 int rc;
90
91 if (args->nargs != 1 || args->nret != 1) {
92 rc = -3;
93 goto out;
94 }
95
96 irq = args->args[0];
97
98 rc = kvmppc_xics_int_on(vcpu->kvm, irq);
99 if (rc)
100 rc = -3;
101out:
102 args->rets[0] = rc;
103}
104#endif /* CONFIG_KVM_XICS */
105
106struct rtas_handler {
107 void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
108 char *name;
109};
110
111static struct rtas_handler rtas_handlers[] = {
112#ifdef CONFIG_KVM_XICS
113 { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
114 { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
115 { .name = "ibm,int-off", .handler = kvm_rtas_int_off },
116 { .name = "ibm,int-on", .handler = kvm_rtas_int_on },
117#endif
118};
119
120struct rtas_token_definition {
121 struct list_head list;
122 struct rtas_handler *handler;
123 u64 token;
124};
125
126static int rtas_name_matches(char *s1, char *s2)
127{
128 struct kvm_rtas_token_args args;
129 return !strncmp(s1, s2, sizeof(args.name));
130}
131
132static int rtas_token_undefine(struct kvm *kvm, char *name)
133{
134 struct rtas_token_definition *d, *tmp;
135
136 lockdep_assert_held(&kvm->lock);
137
138 list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
139 if (rtas_name_matches(d->handler->name, name)) {
140 list_del(&d->list);
141 kfree(d);
142 return 0;
143 }
144 }
145
146 /* It's not an error to undefine an undefined token */
147 return 0;
148}
149
150static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
151{
152 struct rtas_token_definition *d;
153 struct rtas_handler *h = NULL;
154 bool found;
155 int i;
156
157 lockdep_assert_held(&kvm->lock);
158
159 list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
160 if (d->token == token)
161 return -EEXIST;
162 }
163
164 found = false;
165 for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
166 h = &rtas_handlers[i];
167 if (rtas_name_matches(h->name, name)) {
168 found = true;
169 break;
170 }
171 }
172
173 if (!found)
174 return -ENOENT;
175
176 d = kzalloc(sizeof(*d), GFP_KERNEL);
177 if (!d)
178 return -ENOMEM;
179
180 d->handler = h;
181 d->token = token;
182
183 list_add_tail(&d->list, &kvm->arch.rtas_tokens);
184
185 return 0;
186}
187
188int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
189{
190 struct kvm_rtas_token_args args;
191 int rc;
192
193 if (copy_from_user(&args, argp, sizeof(args)))
194 return -EFAULT;
195
196 mutex_lock(&kvm->lock);
197
198 if (args.token)
199 rc = rtas_token_define(kvm, args.name, args.token);
200 else
201 rc = rtas_token_undefine(kvm, args.name);
202
203 mutex_unlock(&kvm->lock);
204
205 return rc;
206}
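
Userspace binds guest RTAS token values to the handlers above through this ioctl. A hedged usage sketch, assuming the KVM_PPC_RTAS_DEFINE_TOKEN ioctl and struct kvm_rtas_token_args added by this series are visible via <linux/kvm.h> on powerpc; the token value itself is whatever the VMM advertises to the guest:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>   /* assumes the powerpc uapi additions from this series */

/* Bind a guest-visible RTAS token to one of the in-kernel services;
 * a token of 0 removes the binding again (see rtas_token_undefine() above). */
int define_rtas_token(int vm_fd, const char *name, uint64_t token)
{
        struct kvm_rtas_token_args args;

        memset(&args, 0, sizeof(args));
        strncpy(args.name, name, sizeof(args.name) - 1);
        args.token = token;
        return ioctl(vm_fd, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
}

/* e.g.  define_rtas_token(vm_fd, "ibm,set-xive", 0x2000);
 *       define_rtas_token(vm_fd, "ibm,set-xive", 0);       -- undefine again */
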
207
208int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
209{
210 struct rtas_token_definition *d;
211 struct rtas_args args;
212 rtas_arg_t *orig_rets;
213 gpa_t args_phys;
214 int rc;
215
216 /* r4 contains the guest physical address of the RTAS args */
217 args_phys = kvmppc_get_gpr(vcpu, 4);
218
219 rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
220 if (rc)
221 goto fail;
222
223 /*
224 * args->rets is a pointer into args->args. Now that we've
225 * copied args we need to fix it up to point into our copy,
226 * not the guest args. We also need to save the original
227 * value so we can restore it on the way out.
228 */
229 orig_rets = args.rets;
230 args.rets = &args.args[args.nargs];
231
232 mutex_lock(&vcpu->kvm->lock);
233
234 rc = -ENOENT;
235 list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
236 if (d->token == args.token) {
237 d->handler->handler(vcpu, &args);
238 rc = 0;
239 break;
240 }
241 }
242
243 mutex_unlock(&vcpu->kvm->lock);
244
245 if (rc == 0) {
246 args.rets = orig_rets;
247 rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
248 if (rc)
249 goto fail;
250 }
251
252 return rc;
253
254fail:
255 /*
256 * We only get here if the guest has called RTAS with a bogus
257 * args pointer. That means we can't get to the args, and so we
258 * can't fail the RTAS call. So fail right out to userspace,
259 * which should kill the guest.
260 */
261 return rc;
262}
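
Because rtas_args carries a pointer (rets) into its own args[] array, the copy taken from guest memory must have that pointer rebuilt against the local copy and restored before the copy-back, exactly as done above. A self-contained sketch of that fix-up (field sizes illustrative):

#include <stdint.h>

typedef uint32_t rtas_arg_t;                /* __be32 in the kernel */

struct rtas_args_sketch {                   /* layout follows struct rtas_args; the   */
        uint32_t   token;                   /* 16-slot args[] is the usual RTAS limit */
        uint32_t   nargs;
        uint32_t   nret;
        rtas_arg_t args[16];
        rtas_arg_t *rets;                   /* points at &args[nargs] */
};

void handle_rtas_copy(struct rtas_args_sketch *local)
{
        rtas_arg_t *orig_rets = local->rets;        /* guest's value, opaque here */

        if (local->nargs >= sizeof(local->args) / sizeof(local->args[0]))
                return;                             /* keep the sketch in bounds */

        local->rets = &local->args[local->nargs];   /* re-point into the local copy */
        local->rets[0] = 0;                         /* a handler writes its status here */

        local->rets = orig_rets;                    /* restore before copying back to the guest */
}
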
263
264void kvmppc_rtas_tokens_free(struct kvm *kvm)
265{
266 struct rtas_token_definition *d, *tmp;
267
268 lockdep_assert_held(&kvm->lock);
269
270 list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
271 list_del(&d->list);
272 kfree(d);
273 }
274}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 000000000000..f7a103756618
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1270 @@
1/*
2 * Copyright 2012 Michael Ellerman, IBM Corporation.
3 * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License, version 2, as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11#include <linux/kvm_host.h>
12#include <linux/err.h>
13#include <linux/gfp.h>
14#include <linux/anon_inodes.h>
15
16#include <asm/uaccess.h>
17#include <asm/kvm_book3s.h>
18#include <asm/kvm_ppc.h>
19#include <asm/hvcall.h>
20#include <asm/xics.h>
21#include <asm/debug.h>
22
23#include <linux/debugfs.h>
24#include <linux/seq_file.h>
25
26#include "book3s_xics.h"
27
28#if 1
29#define XICS_DBG(fmt...) do { } while (0)
30#else
31#define XICS_DBG(fmt...) trace_printk(fmt)
32#endif
33
34#define ENABLE_REALMODE true
35#define DEBUG_REALMODE false
36
37/*
38 * LOCKING
39 * =======
40 *
41 * Each ICS has a mutex protecting the information about the IRQ
42 * sources and avoiding simultaneous deliveries of the same interrupt.
43 *
44 * ICP operations are done via a single compare & swap transaction
45 * (most ICP state fits in the union kvmppc_icp_state)
46 */
47
48/*
49 * TODO
50 * ====
51 *
52 * - To speed up resends, keep a bitmap of "resend" set bits in the
53 * ICS
54 *
55 * - Speed up server# -> ICP lookup (array ? hash table ?)
56 *
57 * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
58 * locks array to improve scalability
59 */
60
61/* -- ICS routines -- */
62
63static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
64 u32 new_irq);
65
66static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level,
67 bool report_status)
68{
69 struct ics_irq_state *state;
70 struct kvmppc_ics *ics;
71 u16 src;
72
73 XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
74
75 ics = kvmppc_xics_find_ics(xics, irq, &src);
76 if (!ics) {
77 XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
78 return -EINVAL;
79 }
80 state = &ics->irq_state[src];
81 if (!state->exists)
82 return -EINVAL;
83
84 if (report_status)
85 return state->asserted;
86
87 /*
88 * We set state->asserted locklessly. This should be fine as
89 * we are the only setter, thus concurrent access is undefined
90 * to begin with.
91 */
92 if (level == KVM_INTERRUPT_SET_LEVEL)
93 state->asserted = 1;
94 else if (level == KVM_INTERRUPT_UNSET) {
95 state->asserted = 0;
96 return 0;
97 }
98
99 /* Attempt delivery */
100 icp_deliver_irq(xics, NULL, irq);
101
102 return state->asserted;
103}
104
105static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
106 struct kvmppc_icp *icp)
107{
108 int i;
109
110 mutex_lock(&ics->lock);
111
112 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
113 struct ics_irq_state *state = &ics->irq_state[i];
114
115 if (!state->resend)
116 continue;
117
118 XICS_DBG("resend %#x prio %#x\n", state->number,
119 state->priority);
120
121 mutex_unlock(&ics->lock);
122 icp_deliver_irq(xics, icp, state->number);
123 mutex_lock(&ics->lock);
124 }
125
126 mutex_unlock(&ics->lock);
127}
128
129static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
130 struct ics_irq_state *state,
131 u32 server, u32 priority, u32 saved_priority)
132{
133 bool deliver;
134
135 mutex_lock(&ics->lock);
136
137 state->server = server;
138 state->priority = priority;
139 state->saved_priority = saved_priority;
140 deliver = false;
141 if ((state->masked_pending || state->resend) && priority != MASKED) {
142 state->masked_pending = 0;
143 deliver = true;
144 }
145
146 mutex_unlock(&ics->lock);
147
148 return deliver;
149}
150
151int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
152{
153 struct kvmppc_xics *xics = kvm->arch.xics;
154 struct kvmppc_icp *icp;
155 struct kvmppc_ics *ics;
156 struct ics_irq_state *state;
157 u16 src;
158
159 if (!xics)
160 return -ENODEV;
161
162 ics = kvmppc_xics_find_ics(xics, irq, &src);
163 if (!ics)
164 return -EINVAL;
165 state = &ics->irq_state[src];
166
167 icp = kvmppc_xics_find_server(kvm, server);
168 if (!icp)
169 return -EINVAL;
170
171 XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
172 irq, server, priority,
173 state->masked_pending, state->resend);
174
175 if (write_xive(xics, ics, state, server, priority, priority))
176 icp_deliver_irq(xics, icp, irq);
177
178 return 0;
179}
180
181int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
182{
183 struct kvmppc_xics *xics = kvm->arch.xics;
184 struct kvmppc_ics *ics;
185 struct ics_irq_state *state;
186 u16 src;
187
188 if (!xics)
189 return -ENODEV;
190
191 ics = kvmppc_xics_find_ics(xics, irq, &src);
192 if (!ics)
193 return -EINVAL;
194 state = &ics->irq_state[src];
195
196 mutex_lock(&ics->lock);
197 *server = state->server;
198 *priority = state->priority;
199 mutex_unlock(&ics->lock);
200
201 return 0;
202}
203
204int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
205{
206 struct kvmppc_xics *xics = kvm->arch.xics;
207 struct kvmppc_icp *icp;
208 struct kvmppc_ics *ics;
209 struct ics_irq_state *state;
210 u16 src;
211
212 if (!xics)
213 return -ENODEV;
214
215 ics = kvmppc_xics_find_ics(xics, irq, &src);
216 if (!ics)
217 return -EINVAL;
218 state = &ics->irq_state[src];
219
220 icp = kvmppc_xics_find_server(kvm, state->server);
221 if (!icp)
222 return -EINVAL;
223
224 if (write_xive(xics, ics, state, state->server, state->saved_priority,
225 state->saved_priority))
226 icp_deliver_irq(xics, icp, irq);
227
228 return 0;
229}
230
231int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
232{
233 struct kvmppc_xics *xics = kvm->arch.xics;
234 struct kvmppc_ics *ics;
235 struct ics_irq_state *state;
236 u16 src;
237
238 if (!xics)
239 return -ENODEV;
240
241 ics = kvmppc_xics_find_ics(xics, irq, &src);
242 if (!ics)
243 return -EINVAL;
244 state = &ics->irq_state[src];
245
246 write_xive(xics, ics, state, state->server, MASKED, state->priority);
247
248 return 0;
249}
250
251/* -- ICP routines, including hcalls -- */
252
253static inline bool icp_try_update(struct kvmppc_icp *icp,
254 union kvmppc_icp_state old,
255 union kvmppc_icp_state new,
256 bool change_self)
257{
258 bool success;
259
260 /* Calculate new output value */
261 new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
262
263 /* Attempt atomic update */
264 success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
265 if (!success)
266 goto bail;
267
268 XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
269 icp->server_num,
270 old.cppr, old.mfrr, old.pending_pri, old.xisr,
271 old.need_resend, old.out_ee);
272 XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
273 new.cppr, new.mfrr, new.pending_pri, new.xisr,
274 new.need_resend, new.out_ee);
275 /*
276 * Check for output state update
277 *
278 * Note that this is racy since another processor could be updating
279 * the state already. This is why we never clear the interrupt output
280 * here, we only ever set it. The clear only happens prior to doing
281 * an update and only by the processor itself. Currently we do it
282	 * in Accept (H_XIRR) and Up_Cppr (H_CPPR).
283 *
284 * We also do not try to figure out whether the EE state has changed,
285 * we unconditionally set it if the new state calls for it. The reason
286 * for that is that we opportunistically remove the pending interrupt
287 * flag when raising CPPR, so we need to set it back here if an
288 * interrupt is still pending.
289 */
290 if (new.out_ee) {
291 kvmppc_book3s_queue_irqprio(icp->vcpu,
292 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
293 if (!change_self)
294 kvmppc_fast_vcpu_kick(icp->vcpu);
295 }
296 bail:
297 return success;
298}
299
300static void icp_check_resend(struct kvmppc_xics *xics,
301 struct kvmppc_icp *icp)
302{
303 u32 icsid;
304
305 /* Order this load with the test for need_resend in the caller */
306 smp_rmb();
307 for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
308 struct kvmppc_ics *ics = xics->ics[icsid];
309
310 if (!test_and_clear_bit(icsid, icp->resend_map))
311 continue;
312 if (!ics)
313 continue;
314 ics_check_resend(xics, ics, icp);
315 }
316}
317
318static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
319 u32 *reject)
320{
321 union kvmppc_icp_state old_state, new_state;
322 bool success;
323
324 XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
325 icp->server_num);
326
327 do {
328 old_state = new_state = ACCESS_ONCE(icp->state);
329
330 *reject = 0;
331
332 /* See if we can deliver */
333 success = new_state.cppr > priority &&
334 new_state.mfrr > priority &&
335 new_state.pending_pri > priority;
336
337 /*
338 * If we can, check for a rejection and perform the
339 * delivery
340 */
341 if (success) {
342 *reject = new_state.xisr;
343 new_state.xisr = irq;
344 new_state.pending_pri = priority;
345 } else {
346 /*
347 * If we failed to deliver we set need_resend
348 * so a subsequent CPPR state change causes us
349 * to try a new delivery.
350 */
351 new_state.need_resend = true;
352 }
353
354 } while (!icp_try_update(icp, old_state, new_state, false));
355
356 return success;
357}
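
XICS priorities are "numerically smaller is more favored", which is what the three comparisons above encode. A tiny illustrative helper, assuming MASKED is the least-favored value 0xff used for an empty pending_pri:

#include <stdbool.h>
#include <stdint.h>

#define MASKED 0xff     /* numerically largest = least favored */

/* Delivery needs the new priority to beat the CPPR, the MFRR and whatever
 * is already pending. */
bool can_deliver(uint8_t prio, uint8_t cppr, uint8_t mfrr, uint8_t pending_pri)
{
        return prio < cppr && prio < mfrr && prio < pending_pri;
}
/* e.g. can_deliver(0x05, MASKED, MASKED, MASKED) == true, while an ICP whose
 * CPPR is raised to 0x05 refuses it: can_deliver(0x05, 0x05, MASKED, MASKED) == false */
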
358
359static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
360 u32 new_irq)
361{
362 struct ics_irq_state *state;
363 struct kvmppc_ics *ics;
364 u32 reject;
365 u16 src;
366
367 /*
368 * This is used both for initial delivery of an interrupt and
369 * for subsequent rejection.
370 *
371 * Rejection can be racy vs. resends. We have evaluated the
372 * rejection in an atomic ICP transaction which is now complete,
373 * so potentially the ICP can already accept the interrupt again.
374 *
375 * So we need to retry the delivery. Essentially the reject path
376 * boils down to a failed delivery. Always.
377 *
378 * Now the interrupt could also have moved to a different target,
379 * thus we may need to re-do the ICP lookup as well
380 */
381
382 again:
383 /* Get the ICS state and lock it */
384 ics = kvmppc_xics_find_ics(xics, new_irq, &src);
385 if (!ics) {
386 XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
387 return;
388 }
389 state = &ics->irq_state[src];
390
391 /* Get a lock on the ICS */
392 mutex_lock(&ics->lock);
393
394 /* Get our server */
395 if (!icp || state->server != icp->server_num) {
396 icp = kvmppc_xics_find_server(xics->kvm, state->server);
397 if (!icp) {
398 pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
399 new_irq, state->server);
400 goto out;
401 }
402 }
403
404 /* Clear the resend bit of that interrupt */
405 state->resend = 0;
406
407 /*
408 * If masked, bail out
409 *
410 * Note: PAPR doesn't mention anything about masked pending
411 * when doing a resend, only when doing a delivery.
412 *
413 * However that would have the effect of losing a masked
414 * interrupt that was rejected and isn't consistent with
415 * the whole masked_pending business which is about not
416 * losing interrupts that occur while masked.
417 *
418	 * I don't differentiate between normal deliveries and resends; this
419 * implementation will differ from PAPR and not lose such
420 * interrupts.
421 */
422 if (state->priority == MASKED) {
423 XICS_DBG("irq %#x masked pending\n", new_irq);
424 state->masked_pending = 1;
425 goto out;
426 }
427
428 /*
429 * Try the delivery, this will set the need_resend flag
430 * in the ICP as part of the atomic transaction if the
431 * delivery is not possible.
432 *
433 * Note that if successful, the new delivery might have itself
434 * rejected an interrupt that was "delivered" before we took the
435 * icp mutex.
436 *
437 * In this case we do the whole sequence all over again for the
438 * new guy. We cannot assume that the rejected interrupt is less
439 * favored than the new one, and thus doesn't need to be delivered,
440 * because by the time we exit icp_try_to_deliver() the target
441	 * processor may well have already consumed & completed it, and thus
442 * the rejected interrupt might actually be already acceptable.
443 */
444 if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
445 /*
446 * Delivery was successful, did we reject somebody else ?
447 */
448 if (reject && reject != XICS_IPI) {
449 mutex_unlock(&ics->lock);
450 new_irq = reject;
451 goto again;
452 }
453 } else {
454 /*
455 * We failed to deliver the interrupt we need to set the
456 * resend map bit and mark the ICS state as needing a resend
457 */
458 set_bit(ics->icsid, icp->resend_map);
459 state->resend = 1;
460
461 /*
462 * If the need_resend flag got cleared in the ICP some time
463 * between icp_try_to_deliver() atomic update and now, then
464 * we know it might have missed the resend_map bit. So we
465 * retry
466 */
467 smp_mb();
468 if (!icp->state.need_resend) {
469 mutex_unlock(&ics->lock);
470 goto again;
471 }
472 }
473 out:
474 mutex_unlock(&ics->lock);
475}
476
477static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
478 u8 new_cppr)
479{
480 union kvmppc_icp_state old_state, new_state;
481 bool resend;
482
483 /*
484 * This handles several related states in one operation:
485 *
486 * ICP State: Down_CPPR
487 *
488 * Load CPPR with new value and if the XISR is 0
489 * then check for resends:
490 *
491 * ICP State: Resend
492 *
493 * If MFRR is more favored than CPPR, check for IPIs
494 * and notify ICS of a potential resend. This is done
495 * asynchronously (when used in real mode, we will have
496 * to exit here).
497 *
498 * We do not handle the complete Check_IPI as documented
499 * here. In the PAPR, this state will be used for both
500 * Set_MFRR and Down_CPPR. However, we know that we aren't
501 * changing the MFRR state here so we don't need to handle
502 * the case of an MFRR causing a reject of a pending irq,
503 * this will have been handled when the MFRR was set in the
504 * first place.
505 *
506 * Thus we don't have to handle rejects, only resends.
507 *
508 * When implementing real mode for HV KVM, resend will lead to
509 * a H_TOO_HARD return and the whole transaction will be handled
510 * in virtual mode.
511 */
512 do {
513 old_state = new_state = ACCESS_ONCE(icp->state);
514
515 /* Down_CPPR */
516 new_state.cppr = new_cppr;
517
518 /*
519 * Cut down Resend / Check_IPI / IPI
520 *
521 * The logic is that we cannot have a pending interrupt
522 * trumped by an IPI at this point (see above), so we
523 * know that either the pending interrupt is already an
524 * IPI (in which case we don't care to override it) or
525		 * it's either more favored than us or non-existent
526 */
527 if (new_state.mfrr < new_cppr &&
528 new_state.mfrr <= new_state.pending_pri) {
529 WARN_ON(new_state.xisr != XICS_IPI &&
530 new_state.xisr != 0);
531 new_state.pending_pri = new_state.mfrr;
532 new_state.xisr = XICS_IPI;
533 }
534
535 /* Latch/clear resend bit */
536 resend = new_state.need_resend;
537 new_state.need_resend = 0;
538
539 } while (!icp_try_update(icp, old_state, new_state, true));
540
541 /*
542 * Now handle resend checks. Those are asynchronous to the ICP
543 * state update in HW (ie bus transactions) so we can handle them
544 * separately here too
545 */
546 if (resend)
547 icp_check_resend(xics, icp);
548}
549
550static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
551{
552 union kvmppc_icp_state old_state, new_state;
553 struct kvmppc_icp *icp = vcpu->arch.icp;
554 u32 xirr;
555
556 /* First, remove EE from the processor */
557 kvmppc_book3s_dequeue_irqprio(icp->vcpu,
558 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
559
560 /*
561 * ICP State: Accept_Interrupt
562 *
563 * Return the pending interrupt (if any) along with the
564 * current CPPR, then clear the XISR & set CPPR to the
565 * pending priority
566 */
567 do {
568 old_state = new_state = ACCESS_ONCE(icp->state);
569
570 xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
571 if (!old_state.xisr)
572 break;
573 new_state.cppr = new_state.pending_pri;
574 new_state.pending_pri = 0xff;
575 new_state.xisr = 0;
576
577 } while (!icp_try_update(icp, old_state, new_state, true));
578
579 XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
580
581 return xirr;
582}
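/*
 * Illustrative helpers (hypothetical, not part of this patch): the XIRR
 * value handed back to the guest packs the 8-bit CPPR into the top byte
 * and the 24-bit XISR into the low bytes, mirroring the computation in
 * kvmppc_h_xirr() above and the decoding in kvmppc_h_eoi() below.
 */
static inline u32 xirr_pack(u8 cppr, u32 xisr)
{
	return (xisr & 0x00ffffff) | ((u32)cppr << 24);
}

static inline u8 xirr_cppr(u32 xirr)
{
	return xirr >> 24;
}

static inline u32 xirr_xisr(u32 xirr)
{
	return xirr & 0x00ffffff;
}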
583
584static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
585 unsigned long mfrr)
586{
587 union kvmppc_icp_state old_state, new_state;
588 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
589 struct kvmppc_icp *icp;
590 u32 reject;
591 bool resend;
592 bool local;
593
594 XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
595 vcpu->vcpu_id, server, mfrr);
596
597 icp = vcpu->arch.icp;
598 local = icp->server_num == server;
599 if (!local) {
600 icp = kvmppc_xics_find_server(vcpu->kvm, server);
601 if (!icp)
602 return H_PARAMETER;
603 }
604
605 /*
606 * ICP state: Set_MFRR
607 *
608 * If the CPPR is more favored than the new MFRR, then
609 * nothing needs to be rejected as there can be no XISR to
610 * reject. If the MFRR is being made less favored then
611 * there might be a previously-rejected interrupt needing
612 * to be resent.
613 *
614 * If the CPPR is less favored, then we might be replacing
615 * an interrupt, and thus need to possibly reject it as in
616 *
617 * ICP state: Check_IPI
618 */
619 do {
620 old_state = new_state = ACCESS_ONCE(icp->state);
621
622 /* Set_MFRR */
623 new_state.mfrr = mfrr;
624
625 /* Check_IPI */
626 reject = 0;
627 resend = false;
628 if (mfrr < new_state.cppr) {
629 /* Reject a pending interrupt if not an IPI */
630 if (mfrr <= new_state.pending_pri)
631 reject = new_state.xisr;
632 new_state.pending_pri = mfrr;
633 new_state.xisr = XICS_IPI;
634 }
635
636 if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
637 resend = new_state.need_resend;
638 new_state.need_resend = 0;
639 }
640 } while (!icp_try_update(icp, old_state, new_state, local));
641
642 /* Handle reject */
643 if (reject && reject != XICS_IPI)
644 icp_deliver_irq(xics, icp, reject);
645
646 /* Handle resend */
647 if (resend)
648 icp_check_resend(xics, icp);
649
650 return H_SUCCESS;
651}
652
653static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
654{
655 union kvmppc_icp_state old_state, new_state;
656 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
657 struct kvmppc_icp *icp = vcpu->arch.icp;
658 u32 reject;
659
660 XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
661
662 /*
663 * ICP State: Set_CPPR
664 *
665 * We can safely compare the new value with the current
666 * value outside of the transaction as the CPPR is only
667 * ever changed by the processor on itself
668 */
669 if (cppr > icp->state.cppr)
670 icp_down_cppr(xics, icp, cppr);
671 else if (cppr == icp->state.cppr)
672 return;
673
674 /*
675 * ICP State: Up_CPPR
676 *
677 * The processor is raising its priority, this can result
678 * in a rejection of a pending interrupt:
679 *
680 * ICP State: Reject_Current
681 *
682 * We can remove EE from the current processor, the update
683 * transaction will set it again if needed
684 */
685 kvmppc_book3s_dequeue_irqprio(icp->vcpu,
686 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
687
688 do {
689 old_state = new_state = ACCESS_ONCE(icp->state);
690
691 reject = 0;
692 new_state.cppr = cppr;
693
694 if (cppr <= new_state.pending_pri) {
695 reject = new_state.xisr;
696 new_state.xisr = 0;
697 new_state.pending_pri = 0xff;
698 }
699
700 } while (!icp_try_update(icp, old_state, new_state, true));
701
702 /*
703 * Check for rejects. They are handled by doing a new delivery
704 * attempt (see comments in icp_deliver_irq).
705 */
706 if (reject && reject != XICS_IPI)
707 icp_deliver_irq(xics, icp, reject);
708}
709
710static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
711{
712 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
713 struct kvmppc_icp *icp = vcpu->arch.icp;
714 struct kvmppc_ics *ics;
715 struct ics_irq_state *state;
716 u32 irq = xirr & 0x00ffffff;
717 u16 src;
718
719 XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
720
721 /*
722 * ICP State: EOI
723 *
724 * Note: If EOI is incorrectly used by SW to lower the CPPR
725 * value (ie more favored), we do not check for rejection of
726 * a pending interrupt; this is a SW error and PAPR specifies
727 * that we don't have to deal with it.
728 *
729 * The sending of an EOI to the ICS is handled after the
730 * CPPR update
731 *
732 * ICP State: Down_CPPR which we handle
733 * in a separate function as it's shared with H_CPPR.
734 */
735 icp_down_cppr(xics, icp, xirr >> 24);
736
737 /* IPIs have no EOI */
738 if (irq == XICS_IPI)
739 return H_SUCCESS;
740 /*
741 * EOI handling: If the interrupt is still asserted, we need to
742 * resend it. We can take a lockless "peek" at the ICS state here.
743 *
744 * "Message" interrupts will never have "asserted" set
745 */
746 ics = kvmppc_xics_find_ics(xics, irq, &src);
747 if (!ics) {
748 XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
749 return H_PARAMETER;
750 }
751 state = &ics->irq_state[src];
752
753 /* Still asserted, resend it */
754 if (state->asserted)
755 icp_deliver_irq(xics, icp, irq);
756
757 return H_SUCCESS;
758}
759
760static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
761{
762 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
763 struct kvmppc_icp *icp = vcpu->arch.icp;
764
765 XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
766 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
767
768 if (icp->rm_action & XICS_RM_KICK_VCPU)
769 kvmppc_fast_vcpu_kick(icp->rm_kick_target);
770 if (icp->rm_action & XICS_RM_CHECK_RESEND)
771 icp_check_resend(xics, icp);
772 if (icp->rm_action & XICS_RM_REJECT)
773 icp_deliver_irq(xics, icp, icp->rm_reject);
774
775 icp->rm_action = 0;
776
777 return H_SUCCESS;
778}
779
780int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
781{
782 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
783 unsigned long res;
784 int rc = H_SUCCESS;
785
786 /* Check if we have an ICP */
787 if (!xics || !vcpu->arch.icp)
788 return H_HARDWARE;
789
790 /* Check for real mode returning too hard */
791 if (xics->real_mode)
792 return kvmppc_xics_rm_complete(vcpu, req);
793
794 switch (req) {
795 case H_XIRR:
796 res = kvmppc_h_xirr(vcpu);
797 kvmppc_set_gpr(vcpu, 4, res);
798 break;
799 case H_CPPR:
800 kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
801 break;
802 case H_EOI:
803 rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
804 break;
805 case H_IPI:
806 rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
807 kvmppc_get_gpr(vcpu, 5));
808 break;
809 }
810
811 return rc;
812}
813
814
815/* -- Initialisation code etc. -- */
816
817static int xics_debug_show(struct seq_file *m, void *private)
818{
819 struct kvmppc_xics *xics = m->private;
820 struct kvm *kvm = xics->kvm;
821 struct kvm_vcpu *vcpu;
822 int icsid, i;
823
824 if (!kvm)
825 return 0;
826
827 seq_printf(m, "=========\nICP state\n=========\n");
828
829 kvm_for_each_vcpu(i, vcpu, kvm) {
830 struct kvmppc_icp *icp = vcpu->arch.icp;
831 union kvmppc_icp_state state;
832
833 if (!icp)
834 continue;
835
836 state.raw = ACCESS_ONCE(icp->state.raw);
837 seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
838 icp->server_num, state.xisr,
839 state.pending_pri, state.cppr, state.mfrr,
840 state.out_ee, state.need_resend);
841 }
842
843 for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
844 struct kvmppc_ics *ics = xics->ics[icsid];
845
846 if (!ics)
847 continue;
848
849 seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
850 icsid);
851
852 mutex_lock(&ics->lock);
853
854 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
855 struct ics_irq_state *irq = &ics->irq_state[i];
856
857 seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
858 irq->number, irq->server, irq->priority,
859 irq->saved_priority, irq->asserted,
860 irq->resend, irq->masked_pending);
861
862 }
863 mutex_unlock(&ics->lock);
864 }
865 return 0;
866}
867
868static int xics_debug_open(struct inode *inode, struct file *file)
869{
870 return single_open(file, xics_debug_show, inode->i_private);
871}
872
873static const struct file_operations xics_debug_fops = {
874 .open = xics_debug_open,
875 .read = seq_read,
876 .llseek = seq_lseek,
877 .release = single_release,
878};
879
880static void xics_debugfs_init(struct kvmppc_xics *xics)
881{
882 char *name;
883
884 name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
885 if (!name) {
886 pr_err("%s: no memory for name\n", __func__);
887 return;
888 }
889
890 xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
891 xics, &xics_debug_fops);
892
893 pr_debug("%s: created %s\n", __func__, name);
894 kfree(name);
895}
896
897static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
898 struct kvmppc_xics *xics, int irq)
899{
900 struct kvmppc_ics *ics;
901 int i, icsid;
902
903 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
904
905 mutex_lock(&kvm->lock);
906
907 /* ICS already exists - somebody else got here first */
908 if (xics->ics[icsid])
909 goto out;
910
911 /* Create the ICS */
912 ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
913 if (!ics)
914 goto out;
915
916 mutex_init(&ics->lock);
917 ics->icsid = icsid;
918
919 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
920 ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
921 ics->irq_state[i].priority = MASKED;
922 ics->irq_state[i].saved_priority = MASKED;
923 }
924 smp_wmb();
925 xics->ics[icsid] = ics;
926
927 if (icsid > xics->max_icsid)
928 xics->max_icsid = icsid;
929
930 out:
931 mutex_unlock(&kvm->lock);
932 return xics->ics[icsid];
933}
934
935int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
936{
937 struct kvmppc_icp *icp;
938
939 if (!vcpu->kvm->arch.xics)
940 return -ENODEV;
941
942 if (kvmppc_xics_find_server(vcpu->kvm, server_num))
943 return -EEXIST;
944
945 icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
946 if (!icp)
947 return -ENOMEM;
948
949 icp->vcpu = vcpu;
950 icp->server_num = server_num;
951 icp->state.mfrr = MASKED;
952 icp->state.pending_pri = MASKED;
953 vcpu->arch.icp = icp;
954
955 XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
956
957 return 0;
958}
959
960u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
961{
962 struct kvmppc_icp *icp = vcpu->arch.icp;
963 union kvmppc_icp_state state;
964
965 if (!icp)
966 return 0;
967 state = icp->state;
968 return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
969 ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
970 ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
971 ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
972}
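/*
 * Illustrative decoder (hypothetical, not part of this patch): userspace
 * retrieves this packed value through the ICP state one_reg and can
 * unpack it with the same shift/mask constants (defined in the uapi
 * headers, outside this hunk), exactly mirroring kvmppc_xics_set_icp()
 * below:
 */
struct icp_reg_fields {
	u8  cppr;
	u32 xisr;
	u8  mfrr;
	u8  pending_pri;
};

static inline struct icp_reg_fields icp_reg_unpack(u64 icpval)
{
	struct icp_reg_fields f = {
		.cppr		= icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT,
		.xisr		= (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
				  KVM_REG_PPC_ICP_XISR_MASK,
		.mfrr		= icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT,
		.pending_pri	= icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT,
	};
	return f;
}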
973
974int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
975{
976 struct kvmppc_icp *icp = vcpu->arch.icp;
977 struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
978 union kvmppc_icp_state old_state, new_state;
979 struct kvmppc_ics *ics;
980 u8 cppr, mfrr, pending_pri;
981 u32 xisr;
982 u16 src;
983 bool resend;
984
985 if (!icp || !xics)
986 return -ENOENT;
987
988 cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
989 xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
990 KVM_REG_PPC_ICP_XISR_MASK;
991 mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
992 pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
993
994 /* Require the new state to be internally consistent */
995 if (xisr == 0) {
996 if (pending_pri != 0xff)
997 return -EINVAL;
998 } else if (xisr == XICS_IPI) {
999 if (pending_pri != mfrr || pending_pri >= cppr)
1000 return -EINVAL;
1001 } else {
1002 if (pending_pri >= mfrr || pending_pri >= cppr)
1003 return -EINVAL;
1004 ics = kvmppc_xics_find_ics(xics, xisr, &src);
1005 if (!ics)
1006 return -EINVAL;
1007 }
1008
1009 new_state.raw = 0;
1010 new_state.cppr = cppr;
1011 new_state.xisr = xisr;
1012 new_state.mfrr = mfrr;
1013 new_state.pending_pri = pending_pri;
1014
1015 /*
1016 * Deassert the CPU interrupt request.
1017 * icp_try_update will reassert it if necessary.
1018 */
1019 kvmppc_book3s_dequeue_irqprio(icp->vcpu,
1020 BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
1021
1022 /*
1023 * Note that if we displace an interrupt from old_state.xisr,
1024 * we don't mark it as rejected. We expect userspace to set
1025 * the state of the interrupt sources to be consistent with
1026 * the ICP states (either before or afterwards, which doesn't
1027 * matter). We do handle resends due to CPPR becoming less
1028 * favoured because that is necessary to end up with a
1029 * consistent state in the situation where userspace restores
1030 * the ICS states before the ICP states.
1031 */
1032 do {
1033 old_state = ACCESS_ONCE(icp->state);
1034
1035 if (new_state.mfrr <= old_state.mfrr) {
1036 resend = false;
1037 new_state.need_resend = old_state.need_resend;
1038 } else {
1039 resend = old_state.need_resend;
1040 new_state.need_resend = 0;
1041 }
1042 } while (!icp_try_update(icp, old_state, new_state, false));
1043
1044 if (resend)
1045 icp_check_resend(xics, icp);
1046
1047 return 0;
1048}
1049
1050static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
1051{
1052 int ret;
1053 struct kvmppc_ics *ics;
1054 struct ics_irq_state *irqp;
1055 u64 __user *ubufp = (u64 __user *) addr;
1056 u16 idx;
1057 u64 val, prio;
1058
1059 ics = kvmppc_xics_find_ics(xics, irq, &idx);
1060 if (!ics)
1061 return -ENOENT;
1062
1063 irqp = &ics->irq_state[idx];
1064 mutex_lock(&ics->lock);
1065 ret = -ENOENT;
1066 if (irqp->exists) {
1067 val = irqp->server;
1068 prio = irqp->priority;
1069 if (prio == MASKED) {
1070 val |= KVM_XICS_MASKED;
1071 prio = irqp->saved_priority;
1072 }
1073 val |= prio << KVM_XICS_PRIORITY_SHIFT;
1074 if (irqp->asserted)
1075 val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING;
1076 else if (irqp->masked_pending || irqp->resend)
1077 val |= KVM_XICS_PENDING;
1078 ret = 0;
1079 }
1080 mutex_unlock(&ics->lock);
1081
1082 if (!ret && put_user(val, ubufp))
1083 ret = -EFAULT;
1084
1085 return ret;
1086}
1087
1088static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
1089{
1090 struct kvmppc_ics *ics;
1091 struct ics_irq_state *irqp;
1092 u64 __user *ubufp = (u64 __user *) addr;
1093 u16 idx;
1094 u64 val;
1095 u8 prio;
1096 u32 server;
1097
1098 if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
1099 return -ENOENT;
1100
1101 ics = kvmppc_xics_find_ics(xics, irq, &idx);
1102 if (!ics) {
1103 ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
1104 if (!ics)
1105 return -ENOMEM;
1106 }
1107 irqp = &ics->irq_state[idx];
1108 if (get_user(val, ubufp))
1109 return -EFAULT;
1110
1111 server = val & KVM_XICS_DESTINATION_MASK;
1112 prio = val >> KVM_XICS_PRIORITY_SHIFT;
1113 if (prio != MASKED &&
1114 kvmppc_xics_find_server(xics->kvm, server) == NULL)
1115 return -EINVAL;
1116
1117 mutex_lock(&ics->lock);
1118 irqp->server = server;
1119 irqp->saved_priority = prio;
1120 if (val & KVM_XICS_MASKED)
1121 prio = MASKED;
1122 irqp->priority = prio;
1123 irqp->resend = 0;
1124 irqp->masked_pending = 0;
1125 irqp->asserted = 0;
1126 if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
1127 irqp->asserted = 1;
1128 irqp->exists = 1;
1129 mutex_unlock(&ics->lock);
1130
1131 if (val & KVM_XICS_PENDING)
1132 icp_deliver_irq(xics, NULL, irqp->number);
1133
1134 return 0;
1135}
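/*
 * Illustrative userspace sketch (hypothetical, not part of this patch):
 * a VMM configures one interrupt source through the device-attr
 * interface below.  attr->attr selects the global interrupt number and
 * the u64 at attr->addr carries server, priority and flags in exactly
 * the layout parsed by xics_set_source() above.  The KVM_XICS_* and
 * KVM_DEV_XICS_* constants live in the uapi headers, outside this hunk.
 *
 *	static int xics_source_config(int xics_fd, __u32 irq,
 *				      __u32 server, __u8 prio)
 *	{
 *		__u64 val = server | ((__u64)prio << KVM_XICS_PRIORITY_SHIFT);
 *		struct kvm_device_attr attr = {
 *			.group = KVM_DEV_XICS_GRP_SOURCES,
 *			.attr  = irq,
 *			.addr  = (__u64)(unsigned long)&val,
 *		};
 *
 *		return ioctl(xics_fd, KVM_SET_DEVICE_ATTR, &attr);
 *	}
 */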
1136
1137int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
1138 bool line_status)
1139{
1140 struct kvmppc_xics *xics = kvm->arch.xics;
1141
1142 return ics_deliver_irq(xics, irq, level, line_status);
1143}
1144
1145static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1146{
1147 struct kvmppc_xics *xics = dev->private;
1148
1149 switch (attr->group) {
1150 case KVM_DEV_XICS_GRP_SOURCES:
1151 return xics_set_source(xics, attr->attr, attr->addr);
1152 }
1153 return -ENXIO;
1154}
1155
1156static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1157{
1158 struct kvmppc_xics *xics = dev->private;
1159
1160 switch (attr->group) {
1161 case KVM_DEV_XICS_GRP_SOURCES:
1162 return xics_get_source(xics, attr->attr, attr->addr);
1163 }
1164 return -ENXIO;
1165}
1166
1167static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1168{
1169 switch (attr->group) {
1170 case KVM_DEV_XICS_GRP_SOURCES:
1171 if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
1172 attr->attr < KVMPPC_XICS_NR_IRQS)
1173 return 0;
1174 break;
1175 }
1176 return -ENXIO;
1177}
1178
1179static void kvmppc_xics_free(struct kvm_device *dev)
1180{
1181 struct kvmppc_xics *xics = dev->private;
1182 int i;
1183 struct kvm *kvm = xics->kvm;
1184
1185 debugfs_remove(xics->dentry);
1186
1187 if (kvm)
1188 kvm->arch.xics = NULL;
1189
1190 for (i = 0; i <= xics->max_icsid; i++)
1191 kfree(xics->ics[i]);
1192 kfree(xics);
1193 kfree(dev);
1194}
1195
1196static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
1197{
1198 struct kvmppc_xics *xics;
1199 struct kvm *kvm = dev->kvm;
1200 int ret = 0;
1201
1202 xics = kzalloc(sizeof(*xics), GFP_KERNEL);
1203 if (!xics)
1204 return -ENOMEM;
1205
1206 dev->private = xics;
1207 xics->dev = dev;
1208 xics->kvm = kvm;
1209
1210 /* Already there ? */
1211 mutex_lock(&kvm->lock);
1212 if (kvm->arch.xics)
1213 ret = -EEXIST;
1214 else
1215 kvm->arch.xics = xics;
1216 mutex_unlock(&kvm->lock);
1217
1218 if (ret)
1219 return ret;
1220
1221 xics_debugfs_init(xics);
1222
1223#ifdef CONFIG_KVM_BOOK3S_64_HV
1224 if (cpu_has_feature(CPU_FTR_ARCH_206)) {
1225 /* Enable real mode support */
1226 xics->real_mode = ENABLE_REALMODE;
1227 xics->real_mode_dbg = DEBUG_REALMODE;
1228 }
1229#endif /* CONFIG_KVM_BOOK3S_64_HV */
1230
1231 return 0;
1232}
1233
1234struct kvm_device_ops kvm_xics_ops = {
1235 .name = "kvm-xics",
1236 .create = kvmppc_xics_create,
1237 .destroy = kvmppc_xics_free,
1238 .set_attr = xics_set_attr,
1239 .get_attr = xics_get_attr,
1240 .has_attr = xics_has_attr,
1241};
1242
1243int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
1244 u32 xcpu)
1245{
1246 struct kvmppc_xics *xics = dev->private;
1247 int r = -EBUSY;
1248
1249 if (dev->ops != &kvm_xics_ops)
1250 return -EPERM;
1251 if (xics->kvm != vcpu->kvm)
1252 return -EPERM;
1253 if (vcpu->arch.irq_type)
1254 return -EBUSY;
1255
1256 r = kvmppc_xics_create_icp(vcpu, xcpu);
1257 if (!r)
1258 vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
1259
1260 return r;
1261}
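/*
 * Illustrative userspace sketch (hypothetical, not part of this patch):
 * the in-kernel XICS is created once per VM with KVM_CREATE_DEVICE
 * (type KVM_DEV_TYPE_XICS, matching kvm_xics_ops above), after which
 * each vcpu is attached so that kvmppc_xics_connect_vcpu() runs for it.
 * In this series the attach path is expected to be a vcpu
 * KVM_ENABLE_CAP(KVM_CAP_IRQ_XICS); treat the exact argument layout
 * below as an assumption.
 *
 *	static int xics_attach_vcpu(int vcpu_fd, int xics_fd, __u32 server)
 *	{
 *		struct kvm_enable_cap cap = {
 *			.cap  = KVM_CAP_IRQ_XICS,
 *			.args = { xics_fd, server },
 *		};
 *
 *		return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
 *	}
 */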
1262
1263void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
1264{
1265 if (!vcpu->arch.icp)
1266 return;
1267 kfree(vcpu->arch.icp);
1268 vcpu->arch.icp = NULL;
1269 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
1270}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 000000000000..dd9326c5c19b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,130 @@
1/*
2 * Copyright 2012 Michael Ellerman, IBM Corporation.
3 * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License, version 2, as
7 * published by the Free Software Foundation.
8 */
9
10#ifndef _KVM_PPC_BOOK3S_XICS_H
11#define _KVM_PPC_BOOK3S_XICS_H
12
13/*
14 * We use a two-level tree to store interrupt source information.
15 * There are up to 1024 ICS nodes, each of which can represent
16 * 1024 sources.
17 */
18#define KVMPPC_XICS_MAX_ICS_ID 1023
19#define KVMPPC_XICS_ICS_SHIFT 10
20#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT)
21#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1)
22
23/*
24 * Interrupt source numbers below this are reserved, for example
25 * 0 is "no interrupt", and 2 is used for IPIs.
26 */
27#define KVMPPC_XICS_FIRST_IRQ 16
28#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \
29 KVMPPC_XICS_IRQ_PER_ICS)
30
31/* Priority value to use for disabling an interrupt */
32#define MASKED 0xff
33
34/* State for one irq source */
35struct ics_irq_state {
36 u32 number;
37 u32 server;
38 u8 priority;
39 u8 saved_priority;
40 u8 resend;
41 u8 masked_pending;
42 u8 asserted; /* Only for LSI */
43 u8 exists;
44};
45
46/* Atomic ICP state, updated with a single compare & swap */
47union kvmppc_icp_state {
48 unsigned long raw;
49 struct {
50 u8 out_ee:1;
51 u8 need_resend:1;
52 u8 cppr;
53 u8 mfrr;
54 u8 pending_pri;
55 u32 xisr;
56 };
57};
58
59/* One bit per ICS */
60#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
61
62struct kvmppc_icp {
63 struct kvm_vcpu *vcpu;
64 unsigned long server_num;
65 union kvmppc_icp_state state;
66 unsigned long resend_map[ICP_RESEND_MAP_SIZE];
67
68 /* Real mode might find something too hard; here's the action
69 * it might request from virtual mode
70 */
71#define XICS_RM_KICK_VCPU 0x1
72#define XICS_RM_CHECK_RESEND 0x2
73#define XICS_RM_REJECT 0x4
74 u32 rm_action;
75 struct kvm_vcpu *rm_kick_target;
76 u32 rm_reject;
77
78 /* Debug stuff for real mode */
79 union kvmppc_icp_state rm_dbgstate;
80 struct kvm_vcpu *rm_dbgtgt;
81};
82
83struct kvmppc_ics {
84 struct mutex lock;
85 u16 icsid;
86 struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
87};
88
89struct kvmppc_xics {
90 struct kvm *kvm;
91 struct kvm_device *dev;
92 struct dentry *dentry;
93 u32 max_icsid;
94 bool real_mode;
95 bool real_mode_dbg;
96 struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
97};
98
99static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
100 u32 nr)
101{
102 struct kvm_vcpu *vcpu = NULL;
103 int i;
104
105 kvm_for_each_vcpu(i, vcpu, kvm) {
106 if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
107 return vcpu->arch.icp;
108 }
109 return NULL;
110}
111
112static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
113 u32 irq, u16 *source)
114{
115 u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
116 u16 src = irq & KVMPPC_XICS_SRC_MASK;
117 struct kvmppc_ics *ics;
118
119 if (source)
120 *source = src;
121 if (icsid > KVMPPC_XICS_MAX_ICS_ID)
122 return NULL;
123 ics = xics->ics[icsid];
124 if (!ics)
125 return NULL;
126 return ics;
127}
128
129
130#endif /* _KVM_PPC_BOOK3S_XICS_H */
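/*
 * Worked example (editorial note, not part of this header): with
 * KVMPPC_XICS_ICS_SHIFT = 10, global interrupt 0x2345 decomposes into
 * icsid = 0x2345 >> 10 = 0x8 and src = 0x2345 & 0x3ff = 0x345, i.e. the
 * ICS node xics->ics[8], source slot 0x345 -- exactly what
 * kvmppc_xics_find_ics() computes above.
 */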
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 020923e43134..1020119226db 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -222,8 +222,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
222 kvmppc_booke_queue_irqprio(vcpu, prio); 222 kvmppc_booke_queue_irqprio(vcpu, prio);
223} 223}
224 224
225void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 225void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
226 struct kvm_interrupt *irq)
227{ 226{
228 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); 227 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
229 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 228 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
@@ -347,7 +346,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
347 keep_irq = true; 346 keep_irq = true;
348 } 347 }
349 348
350 if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled) 349 if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
351 update_epr = true; 350 update_epr = true;
352 351
353 switch (priority) { 352 switch (priority) {
@@ -428,8 +427,14 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
428 set_guest_esr(vcpu, vcpu->arch.queued_esr); 427 set_guest_esr(vcpu, vcpu->arch.queued_esr);
429 if (update_dear == true) 428 if (update_dear == true)
430 set_guest_dear(vcpu, vcpu->arch.queued_dear); 429 set_guest_dear(vcpu, vcpu->arch.queued_dear);
431 if (update_epr == true) 430 if (update_epr == true) {
432 kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); 431 if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
432 kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
433 else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
434 BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
435 kvmppc_mpic_set_epr(vcpu);
436 }
437 }
433 438
434 new_msr &= msr_mask; 439 new_msr &= msr_mask;
435#if defined(CONFIG_64BIT) 440#if defined(CONFIG_64BIT)
@@ -746,6 +751,9 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
746 kvmppc_core_queue_program(vcpu, ESR_PIL); 751 kvmppc_core_queue_program(vcpu, ESR_PIL);
747 return RESUME_HOST; 752 return RESUME_HOST;
748 753
754 case EMULATE_EXIT_USER:
755 return RESUME_HOST;
756
749 default: 757 default:
750 BUG(); 758 BUG();
751 } 759 }
@@ -1148,6 +1156,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
1148 return r; 1156 return r;
1149} 1157}
1150 1158
1159static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr)
1160{
1161 u32 old_tsr = vcpu->arch.tsr;
1162
1163 vcpu->arch.tsr = new_tsr;
1164
1165 if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
1166 arm_next_watchdog(vcpu);
1167
1168 update_timer_ints(vcpu);
1169}
1170
1151/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 1171/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
1152int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 1172int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
1153{ 1173{
@@ -1287,16 +1307,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
1287 kvmppc_emulate_dec(vcpu); 1307 kvmppc_emulate_dec(vcpu);
1288 } 1308 }
1289 1309
1290 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 1310 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR)
1291 u32 old_tsr = vcpu->arch.tsr; 1311 kvmppc_set_tsr(vcpu, sregs->u.e.tsr);
1292
1293 vcpu->arch.tsr = sregs->u.e.tsr;
1294
1295 if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
1296 arm_next_watchdog(vcpu);
1297
1298 update_timer_ints(vcpu);
1299 }
1300 1312
1301 return 0; 1313 return 0;
1302} 1314}
@@ -1409,84 +1421,134 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
1409 1421
1410int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1422int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1411{ 1423{
1412 int r = -EINVAL; 1424 int r = 0;
1425 union kvmppc_one_reg val;
1426 int size;
1427 long int i;
1428
1429 size = one_reg_size(reg->id);
1430 if (size > sizeof(val))
1431 return -EINVAL;
1413 1432
1414 switch (reg->id) { 1433 switch (reg->id) {
1415 case KVM_REG_PPC_IAC1: 1434 case KVM_REG_PPC_IAC1:
1416 case KVM_REG_PPC_IAC2: 1435 case KVM_REG_PPC_IAC2:
1417 case KVM_REG_PPC_IAC3: 1436 case KVM_REG_PPC_IAC3:
1418 case KVM_REG_PPC_IAC4: { 1437 case KVM_REG_PPC_IAC4:
1419 int iac = reg->id - KVM_REG_PPC_IAC1; 1438 i = reg->id - KVM_REG_PPC_IAC1;
1420 r = copy_to_user((u64 __user *)(long)reg->addr, 1439 val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]);
1421 &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
1422 break; 1440 break;
1423 }
1424 case KVM_REG_PPC_DAC1: 1441 case KVM_REG_PPC_DAC1:
1425 case KVM_REG_PPC_DAC2: { 1442 case KVM_REG_PPC_DAC2:
1426 int dac = reg->id - KVM_REG_PPC_DAC1; 1443 i = reg->id - KVM_REG_PPC_DAC1;
1427 r = copy_to_user((u64 __user *)(long)reg->addr, 1444 val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]);
1428 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
1429 break; 1445 break;
1430 }
1431 case KVM_REG_PPC_EPR: { 1446 case KVM_REG_PPC_EPR: {
1432 u32 epr = get_guest_epr(vcpu); 1447 u32 epr = get_guest_epr(vcpu);
1433 r = put_user(epr, (u32 __user *)(long)reg->addr); 1448 val = get_reg_val(reg->id, epr);
1434 break; 1449 break;
1435 } 1450 }
1436#if defined(CONFIG_64BIT) 1451#if defined(CONFIG_64BIT)
1437 case KVM_REG_PPC_EPCR: 1452 case KVM_REG_PPC_EPCR:
1438 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); 1453 val = get_reg_val(reg->id, vcpu->arch.epcr);
1439 break; 1454 break;
1440#endif 1455#endif
1456 case KVM_REG_PPC_TCR:
1457 val = get_reg_val(reg->id, vcpu->arch.tcr);
1458 break;
1459 case KVM_REG_PPC_TSR:
1460 val = get_reg_val(reg->id, vcpu->arch.tsr);
1461 break;
1462 case KVM_REG_PPC_DEBUG_INST:
1463 val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV);
1464 break;
1441 default: 1465 default:
1466 r = kvmppc_get_one_reg(vcpu, reg->id, &val);
1442 break; 1467 break;
1443 } 1468 }
1469
1470 if (r)
1471 return r;
1472
1473 if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
1474 r = -EFAULT;
1475
1444 return r; 1476 return r;
1445} 1477}
1446 1478
1447int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1479int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1448{ 1480{
1449 int r = -EINVAL; 1481 int r = 0;
1482 union kvmppc_one_reg val;
1483 int size;
1484 long int i;
1485
1486 size = one_reg_size(reg->id);
1487 if (size > sizeof(val))
1488 return -EINVAL;
1489
1490 if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
1491 return -EFAULT;
1450 1492
1451 switch (reg->id) { 1493 switch (reg->id) {
1452 case KVM_REG_PPC_IAC1: 1494 case KVM_REG_PPC_IAC1:
1453 case KVM_REG_PPC_IAC2: 1495 case KVM_REG_PPC_IAC2:
1454 case KVM_REG_PPC_IAC3: 1496 case KVM_REG_PPC_IAC3:
1455 case KVM_REG_PPC_IAC4: { 1497 case KVM_REG_PPC_IAC4:
1456 int iac = reg->id - KVM_REG_PPC_IAC1; 1498 i = reg->id - KVM_REG_PPC_IAC1;
1457 r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], 1499 vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val);
1458 (u64 __user *)(long)reg->addr, sizeof(u64));
1459 break; 1500 break;
1460 }
1461 case KVM_REG_PPC_DAC1: 1501 case KVM_REG_PPC_DAC1:
1462 case KVM_REG_PPC_DAC2: { 1502 case KVM_REG_PPC_DAC2:
1463 int dac = reg->id - KVM_REG_PPC_DAC1; 1503 i = reg->id - KVM_REG_PPC_DAC1;
1464 r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], 1504 vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val);
1465 (u64 __user *)(long)reg->addr, sizeof(u64));
1466 break; 1505 break;
1467 }
1468 case KVM_REG_PPC_EPR: { 1506 case KVM_REG_PPC_EPR: {
1469 u32 new_epr; 1507 u32 new_epr = set_reg_val(reg->id, val);
1470 r = get_user(new_epr, (u32 __user *)(long)reg->addr); 1508 kvmppc_set_epr(vcpu, new_epr);
1471 if (!r)
1472 kvmppc_set_epr(vcpu, new_epr);
1473 break; 1509 break;
1474 } 1510 }
1475#if defined(CONFIG_64BIT) 1511#if defined(CONFIG_64BIT)
1476 case KVM_REG_PPC_EPCR: { 1512 case KVM_REG_PPC_EPCR: {
1477 u32 new_epcr; 1513 u32 new_epcr = set_reg_val(reg->id, val);
1478 r = get_user(new_epcr, (u32 __user *)(long)reg->addr); 1514 kvmppc_set_epcr(vcpu, new_epcr);
1479 if (r == 0)
1480 kvmppc_set_epcr(vcpu, new_epcr);
1481 break; 1515 break;
1482 } 1516 }
1483#endif 1517#endif
1518 case KVM_REG_PPC_OR_TSR: {
1519 u32 tsr_bits = set_reg_val(reg->id, val);
1520 kvmppc_set_tsr_bits(vcpu, tsr_bits);
1521 break;
1522 }
1523 case KVM_REG_PPC_CLEAR_TSR: {
1524 u32 tsr_bits = set_reg_val(reg->id, val);
1525 kvmppc_clr_tsr_bits(vcpu, tsr_bits);
1526 break;
1527 }
1528 case KVM_REG_PPC_TSR: {
1529 u32 tsr = set_reg_val(reg->id, val);
1530 kvmppc_set_tsr(vcpu, tsr);
1531 break;
1532 }
1533 case KVM_REG_PPC_TCR: {
1534 u32 tcr = set_reg_val(reg->id, val);
1535 kvmppc_set_tcr(vcpu, tcr);
1536 break;
1537 }
1484 default: 1538 default:
1539 r = kvmppc_set_one_reg(vcpu, reg->id, &val);
1485 break; 1540 break;
1486 } 1541 }
1542
1487 return r; 1543 return r;
1488} 1544}
1489 1545
1546int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
1547 struct kvm_guest_debug *dbg)
1548{
1549 return -EINVAL;
1550}
1551
1490int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1552int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
1491{ 1553{
1492 return -ENOTSUPP; 1554 return -ENOTSUPP;
@@ -1531,7 +1593,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
1531 1593
1532void kvmppc_core_commit_memory_region(struct kvm *kvm, 1594void kvmppc_core_commit_memory_region(struct kvm *kvm,
1533 struct kvm_userspace_memory_region *mem, 1595 struct kvm_userspace_memory_region *mem,
1534 struct kvm_memory_slot old) 1596 const struct kvm_memory_slot *old)
1535{ 1597{
1536} 1598}
1537 1599
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index f4bb55c96517..2c6deb5ef2fe 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -54,8 +54,7 @@
54 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ 54 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
55 (1<<BOOKE_INTERRUPT_ALIGNMENT)) 55 (1<<BOOKE_INTERRUPT_ALIGNMENT))
56 56
57.macro KVM_HANDLER ivor_nr scratch srr0 57.macro __KVM_HANDLER ivor_nr scratch srr0
58_GLOBAL(kvmppc_handler_\ivor_nr)
59 /* Get pointer to vcpu and record exit number. */ 58 /* Get pointer to vcpu and record exit number. */
60 mtspr \scratch , r4 59 mtspr \scratch , r4
61 mfspr r4, SPRN_SPRG_THREAD 60 mfspr r4, SPRN_SPRG_THREAD
@@ -76,6 +75,43 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
76 bctr 75 bctr
77.endm 76.endm
78 77
78.macro KVM_HANDLER ivor_nr scratch srr0
79_GLOBAL(kvmppc_handler_\ivor_nr)
80 __KVM_HANDLER \ivor_nr \scratch \srr0
81.endm
82
83.macro KVM_DBG_HANDLER ivor_nr scratch srr0
84_GLOBAL(kvmppc_handler_\ivor_nr)
85 mtspr \scratch, r4
86 mfspr r4, SPRN_SPRG_THREAD
87 lwz r4, THREAD_KVM_VCPU(r4)
88 stw r3, VCPU_CRIT_SAVE(r4)
89 mfcr r3
90 mfspr r4, SPRN_CSRR1
91 andi. r4, r4, MSR_PR
92 bne 1f
93 /* debug interrupt happened in enter/exit path */
94 mfspr r4, SPRN_CSRR1
95 rlwinm r4, r4, 0, ~MSR_DE
96 mtspr SPRN_CSRR1, r4
97 lis r4, 0xffff
98 ori r4, r4, 0xffff
99 mtspr SPRN_DBSR, r4
100 mfspr r4, SPRN_SPRG_THREAD
101 lwz r4, THREAD_KVM_VCPU(r4)
102 mtcr r3
103 lwz r3, VCPU_CRIT_SAVE(r4)
104 mfspr r4, \scratch
105 rfci
1061: /* debug interrupt happened in guest */
107 mtcr r3
108 mfspr r4, SPRN_SPRG_THREAD
109 lwz r4, THREAD_KVM_VCPU(r4)
110 lwz r3, VCPU_CRIT_SAVE(r4)
111 mfspr r4, \scratch
112 __KVM_HANDLER \ivor_nr \scratch \srr0
113.endm
114
79.macro KVM_HANDLER_ADDR ivor_nr 115.macro KVM_HANDLER_ADDR ivor_nr
80 .long kvmppc_handler_\ivor_nr 116 .long kvmppc_handler_\ivor_nr
81.endm 117.endm
@@ -100,7 +136,7 @@ KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
100KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 136KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
101KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 137KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
102KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 138KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
103KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 139KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
104KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 140KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
105KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 141KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
106KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 142KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 6dd4de7802bf..ce6b73c29612 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -425,6 +425,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
425 return kvmppc_set_sregs_ivor(vcpu, sregs); 425 return kvmppc_set_sregs_ivor(vcpu, sregs);
426} 426}
427 427
428int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
429 union kvmppc_one_reg *val)
430{
431 int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
432 return r;
433}
434
435int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
436 union kvmppc_one_reg *val)
437{
438 int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
439 return r;
440}
441
428struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 442struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
429{ 443{
430 struct kvmppc_vcpu_e500 *vcpu_e500; 444 struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 33db48a8ce24..c2e5e98453a6 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -23,6 +23,10 @@
23#include <asm/mmu-book3e.h> 23#include <asm/mmu-book3e.h>
24#include <asm/tlb.h> 24#include <asm/tlb.h>
25 25
26enum vcpu_ftr {
27 VCPU_FTR_MMU_V2
28};
29
26#define E500_PID_NUM 3 30#define E500_PID_NUM 3
27#define E500_TLB_NUM 2 31#define E500_TLB_NUM 2
28 32
@@ -131,6 +135,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
131void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 135void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
132int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 136int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
133 137
138int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
139 union kvmppc_one_reg *val);
140int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
141 union kvmppc_one_reg *val);
134 142
135#ifdef CONFIG_KVM_E500V2 143#ifdef CONFIG_KVM_E500V2
136unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, 144unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -295,4 +303,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
295#define get_tlb_sts(gtlbe) (MAS1_TS) 303#define get_tlb_sts(gtlbe) (MAS1_TS)
296#endif /* !BOOKE_HV */ 304#endif /* !BOOKE_HV */
297 305
306static inline bool has_feature(const struct kvm_vcpu *vcpu,
307 enum vcpu_ftr ftr)
308{
309 bool has_ftr;
310 switch (ftr) {
311 case VCPU_FTR_MMU_V2:
312 has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
313 break;
314 default:
315 return false;
316 }
317 return has_ftr;
318}
319
298#endif /* KVM_E500_H */ 320#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e78f353a836a..b10a01243abd 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -284,6 +284,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
284 case SPRN_TLB1CFG: 284 case SPRN_TLB1CFG:
285 *spr_val = vcpu->arch.tlbcfg[1]; 285 *spr_val = vcpu->arch.tlbcfg[1];
286 break; 286 break;
287 case SPRN_TLB0PS:
288 if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
289 return EMULATE_FAIL;
290 *spr_val = vcpu->arch.tlbps[0];
291 break;
292 case SPRN_TLB1PS:
293 if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
294 return EMULATE_FAIL;
295 *spr_val = vcpu->arch.tlbps[1];
296 break;
287 case SPRN_L1CSR0: 297 case SPRN_L1CSR0:
288 *spr_val = vcpu_e500->l1csr0; 298 *spr_val = vcpu_e500->l1csr0;
289 break; 299 break;
@@ -307,6 +317,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
307 case SPRN_MMUCFG: 317 case SPRN_MMUCFG:
308 *spr_val = vcpu->arch.mmucfg; 318 *spr_val = vcpu->arch.mmucfg;
309 break; 319 break;
320 case SPRN_EPTCFG:
321 if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
322 return EMULATE_FAIL;
323 /*
324 * Legacy Linux guests access EPTCFG register even if the E.PT
325 * category is disabled in the VM. Give them a chance to live.
326 */
327 *spr_val = vcpu->arch.eptcfg;
328 break;
310 329
311 /* extra exceptions */ 330 /* extra exceptions */
312 case SPRN_IVOR32: 331 case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 5c4475983f78..c41a5a96b558 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -596,6 +596,140 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
596 return 0; 596 return 0;
597} 597}
598 598
599int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
600 union kvmppc_one_reg *val)
601{
602 int r = 0;
603 long int i;
604
605 switch (id) {
606 case KVM_REG_PPC_MAS0:
607 *val = get_reg_val(id, vcpu->arch.shared->mas0);
608 break;
609 case KVM_REG_PPC_MAS1:
610 *val = get_reg_val(id, vcpu->arch.shared->mas1);
611 break;
612 case KVM_REG_PPC_MAS2:
613 *val = get_reg_val(id, vcpu->arch.shared->mas2);
614 break;
615 case KVM_REG_PPC_MAS7_3:
616 *val = get_reg_val(id, vcpu->arch.shared->mas7_3);
617 break;
618 case KVM_REG_PPC_MAS4:
619 *val = get_reg_val(id, vcpu->arch.shared->mas4);
620 break;
621 case KVM_REG_PPC_MAS6:
622 *val = get_reg_val(id, vcpu->arch.shared->mas6);
623 break;
624 case KVM_REG_PPC_MMUCFG:
625 *val = get_reg_val(id, vcpu->arch.mmucfg);
626 break;
627 case KVM_REG_PPC_EPTCFG:
628 *val = get_reg_val(id, vcpu->arch.eptcfg);
629 break;
630 case KVM_REG_PPC_TLB0CFG:
631 case KVM_REG_PPC_TLB1CFG:
632 case KVM_REG_PPC_TLB2CFG:
633 case KVM_REG_PPC_TLB3CFG:
634 i = id - KVM_REG_PPC_TLB0CFG;
635 *val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
636 break;
637 case KVM_REG_PPC_TLB0PS:
638 case KVM_REG_PPC_TLB1PS:
639 case KVM_REG_PPC_TLB2PS:
640 case KVM_REG_PPC_TLB3PS:
641 i = id - KVM_REG_PPC_TLB0PS;
642 *val = get_reg_val(id, vcpu->arch.tlbps[i]);
643 break;
644 default:
645 r = -EINVAL;
646 break;
647 }
648
649 return r;
650}
651
652int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
653 union kvmppc_one_reg *val)
654{
655 int r = 0;
656 long int i;
657
658 switch (id) {
659 case KVM_REG_PPC_MAS0:
660 vcpu->arch.shared->mas0 = set_reg_val(id, *val);
661 break;
662 case KVM_REG_PPC_MAS1:
663 vcpu->arch.shared->mas1 = set_reg_val(id, *val);
664 break;
665 case KVM_REG_PPC_MAS2:
666 vcpu->arch.shared->mas2 = set_reg_val(id, *val);
667 break;
668 case KVM_REG_PPC_MAS7_3:
669 vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
670 break;
671 case KVM_REG_PPC_MAS4:
672 vcpu->arch.shared->mas4 = set_reg_val(id, *val);
673 break;
674 case KVM_REG_PPC_MAS6:
675 vcpu->arch.shared->mas6 = set_reg_val(id, *val);
676 break;
677 /* Only allow MMU registers to be set to the config supported by KVM */
678 case KVM_REG_PPC_MMUCFG: {
679 u32 reg = set_reg_val(id, *val);
680 if (reg != vcpu->arch.mmucfg)
681 r = -EINVAL;
682 break;
683 }
684 case KVM_REG_PPC_EPTCFG: {
685 u32 reg = set_reg_val(id, *val);
686 if (reg != vcpu->arch.eptcfg)
687 r = -EINVAL;
688 break;
689 }
690 case KVM_REG_PPC_TLB0CFG:
691 case KVM_REG_PPC_TLB1CFG:
692 case KVM_REG_PPC_TLB2CFG:
693 case KVM_REG_PPC_TLB3CFG: {
694 /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
695 u32 reg = set_reg_val(id, *val);
696 i = id - KVM_REG_PPC_TLB0CFG;
697 if (reg != vcpu->arch.tlbcfg[i])
698 r = -EINVAL;
699 break;
700 }
701 case KVM_REG_PPC_TLB0PS:
702 case KVM_REG_PPC_TLB1PS:
703 case KVM_REG_PPC_TLB2PS:
704 case KVM_REG_PPC_TLB3PS: {
705 u32 reg = set_reg_val(id, *val);
706 i = id - KVM_REG_PPC_TLB0PS;
707 if (reg != vcpu->arch.tlbps[i])
708 r = -EINVAL;
709 break;
710 }
711 default:
712 r = -EINVAL;
713 break;
714 }
715
716 return r;
717}
718
719static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
720 struct kvm_book3e_206_tlb_params *params)
721{
722 vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
723 if (params->tlb_sizes[0] <= 2048)
724 vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
725 vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
726
727 vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
728 vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
729 vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
730 return 0;
731}
732
599int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, 733int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
600 struct kvm_config_tlb *cfg) 734 struct kvm_config_tlb *cfg)
601{ 735{
@@ -692,16 +826,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
692 vcpu_e500->gtlb_offset[0] = 0; 826 vcpu_e500->gtlb_offset[0] = 0;
693 vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; 827 vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
694 828
695 vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; 829 /* Update vcpu's MMU geometry based on SW_TLB input */
696 830 vcpu_mmu_geometry_update(vcpu, &params);
697 vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
698 if (params.tlb_sizes[0] <= 2048)
699 vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
700 vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
701
702 vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
703 vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
704 vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
705 831
706 vcpu_e500->shared_tlb_pages = pages; 832 vcpu_e500->shared_tlb_pages = pages;
707 vcpu_e500->num_shared_tlb_pages = num_pages; 833 vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -737,6 +863,39 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
737 return 0; 863 return 0;
738} 864}
739 865
866/* Vcpu's MMU default configuration */
867static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
868 struct kvmppc_e500_tlb_params *params)
869{
870 /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/
871 vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
872
873 /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/
874 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
875 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
876 vcpu->arch.tlbcfg[0] |= params[0].entries;
877 vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
878
879 vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
880 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
881 vcpu->arch.tlbcfg[1] |= params[1].entries;
882 vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
883
884 if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
885 vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
886 vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
887
888 vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
889
890 /* Guest mmu emulation currently doesn't handle E.PT */
891 vcpu->arch.eptcfg = 0;
892 vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
893 vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
894 }
895
896 return 0;
897}
898
740int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) 899int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
741{ 900{
742 struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; 901 struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
@@ -781,18 +940,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
781 if (!vcpu_e500->g2h_tlb1_map) 940 if (!vcpu_e500->g2h_tlb1_map)
782 goto err; 941 goto err;
783 942
784 /* Init TLB configuration register */ 943 vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
785 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
786 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
787 vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
788 vcpu->arch.tlbcfg[0] |=
789 vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
790
791 vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
792 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
793 vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries;
794 vcpu->arch.tlbcfg[1] |=
795 vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
796 944
797 kvmppc_recalc_tlb1map_range(vcpu_e500); 945 kvmppc_recalc_tlb1map_range(vcpu_e500);
798 return 0; 946 return 0;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index 2f4baa074b2e..753cc99eff2b 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -177,6 +177,8 @@ int kvmppc_core_check_processor_compat(void)
177 r = 0; 177 r = 0;
178 else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) 178 else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
179 r = 0; 179 r = 0;
180 else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
181 r = 0;
180 else 182 else
181 r = -ENOTSUPP; 183 r = -ENOTSUPP;
182 184
@@ -260,6 +262,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
260 return kvmppc_set_sregs_ivor(vcpu, sregs); 262 return kvmppc_set_sregs_ivor(vcpu, sregs);
261} 263}
262 264
265int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
266 union kvmppc_one_reg *val)
267{
268 int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
269 return r;
270}
271
272int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
273 union kvmppc_one_reg *val)
274{
275 int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
276 return r;
277}
278
263struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) 279struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
264{ 280{
265 struct kvmppc_vcpu_e500 *vcpu_e500; 281 struct kvmppc_vcpu_e500 *vcpu_e500;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 7a73b6f72a8b..631a2650e4e4 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -38,6 +38,7 @@
38 38
39#define OP_31_XOP_TRAP 4 39#define OP_31_XOP_TRAP 4
40#define OP_31_XOP_LWZX 23 40#define OP_31_XOP_LWZX 23
41#define OP_31_XOP_DCBST 54
41#define OP_31_XOP_TRAP_64 68 42#define OP_31_XOP_TRAP_64 68
42#define OP_31_XOP_DCBF 86 43#define OP_31_XOP_DCBF 86
43#define OP_31_XOP_LBZX 87 44#define OP_31_XOP_LBZX 87
@@ -370,6 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
370 emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); 371 emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
371 break; 372 break;
372 373
374 case OP_31_XOP_DCBST:
373 case OP_31_XOP_DCBF: 375 case OP_31_XOP_DCBF:
374 case OP_31_XOP_DCBI: 376 case OP_31_XOP_DCBI:
375 /* Do nothing. The guest is performing dcbi because 377 /* Do nothing. The guest is performing dcbi because
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
new file mode 100644
index 000000000000..5a9a10b90762
--- /dev/null
+++ b/arch/powerpc/kvm/irq.h
@@ -0,0 +1,20 @@
1#ifndef __IRQ_H
2#define __IRQ_H
3
4#include <linux/kvm_host.h>
5
6static inline int irqchip_in_kernel(struct kvm *kvm)
7{
8 int ret = 0;
9
10#ifdef CONFIG_KVM_MPIC
11 ret = ret || (kvm->arch.mpic != NULL);
12#endif
13#ifdef CONFIG_KVM_XICS
14 ret = ret || (kvm->arch.xics != NULL);
15#endif
16 smp_rmb();
17 return ret;
18}
19
20#endif
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 000000000000..2861ae9eaae6
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1853 @@
1/*
2 * OpenPIC emulation
3 *
4 * Copyright (c) 2004 Jocelyn Mayer
5 * 2011 Alexander Graf
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
25
26#include <linux/slab.h>
27#include <linux/mutex.h>
28#include <linux/kvm_host.h>
29#include <linux/errno.h>
30#include <linux/fs.h>
31#include <linux/anon_inodes.h>
32#include <asm/uaccess.h>
33#include <asm/mpic.h>
34#include <asm/kvm_para.h>
35#include <asm/kvm_host.h>
36#include <asm/kvm_ppc.h>
37#include "iodev.h"
38
39#define MAX_CPU 32
40#define MAX_SRC 256
41#define MAX_TMR 4
42#define MAX_IPI 4
43#define MAX_MSI 8
44#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR)
45#define VID 0x03 /* MPIC version ID */
46
47/* OpenPIC capability flags */
48#define OPENPIC_FLAG_IDR_CRIT (1 << 0)
49#define OPENPIC_FLAG_ILR (2 << 0)
50
51/* OpenPIC address map */
52#define OPENPIC_REG_SIZE 0x40000
53#define OPENPIC_GLB_REG_START 0x0
54#define OPENPIC_GLB_REG_SIZE 0x10F0
55#define OPENPIC_TMR_REG_START 0x10F0
56#define OPENPIC_TMR_REG_SIZE 0x220
57#define OPENPIC_MSI_REG_START 0x1600
58#define OPENPIC_MSI_REG_SIZE 0x200
59#define OPENPIC_SUMMARY_REG_START 0x3800
60#define OPENPIC_SUMMARY_REG_SIZE 0x800
61#define OPENPIC_SRC_REG_START 0x10000
62#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20)
63#define OPENPIC_CPU_REG_START 0x20000
64#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000))
65
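/*
 * Editorial note (not part of this file): with MAX_CPU = 32 the per-CPU
 * block spans 0x100 + 31 * 0x1000 = 0x1f100 bytes from 0x20000, and with
 * MAX_SRC = 256 the source registers span 256 * 0x20 = 0x2000 bytes from
 * 0x10000, so both regions fit inside the 0x40000-byte OPENPIC_REG_SIZE
 * window.
 */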
66struct fsl_mpic_info {
67 int max_ext;
68};
69
70static struct fsl_mpic_info fsl_mpic_20 = {
71 .max_ext = 12,
72};
73
74static struct fsl_mpic_info fsl_mpic_42 = {
75 .max_ext = 12,
76};
77
78#define FRR_NIRQ_SHIFT 16
79#define FRR_NCPU_SHIFT 8
80#define FRR_VID_SHIFT 0
81
82#define VID_REVISION_1_2 2
83#define VID_REVISION_1_3 3
84
85#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */
86
87#define GCR_RESET 0x80000000
88#define GCR_MODE_PASS 0x00000000
89#define GCR_MODE_MIXED 0x20000000
90#define GCR_MODE_PROXY 0x60000000
91
92#define TBCR_CI 0x80000000 /* count inhibit */
93#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */
94
95#define IDR_EP_SHIFT 31
96#define IDR_EP_MASK (1 << IDR_EP_SHIFT)
97#define IDR_CI0_SHIFT 30
98#define IDR_CI1_SHIFT 29
99#define IDR_P1_SHIFT 1
100#define IDR_P0_SHIFT 0
101
102#define ILR_INTTGT_MASK 0x000000ff
103#define ILR_INTTGT_INT 0x00
104#define ILR_INTTGT_CINT 0x01 /* critical */
105#define ILR_INTTGT_MCP 0x02 /* machine check */
106#define NUM_OUTPUTS 3
107
108#define MSIIR_OFFSET 0x140
109#define MSIIR_SRS_SHIFT 29
110#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT)
111#define MSIIR_IBS_SHIFT 24
112#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT)
113
114static int get_current_cpu(void)
115{
116#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
117 struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
118 return vcpu ? vcpu->arch.irq_cpu_id : -1;
119#else
120 /* XXX */
121 return -1;
122#endif
123}
124
125static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
126 u32 val, int idx);
127static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
128 u32 *ptr, int idx);
129
130enum irq_type {
131 IRQ_TYPE_NORMAL = 0,
132 IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */
133 IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */
134};
135
136struct irq_queue {
137 /* Round up to the nearest 64 IRQs so that the queue length
138 * won't change when moving between 32 and 64 bit hosts.
139 */
140 unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
141 int next;
142 int priority;
143};
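/*
 * Editorial note (not part of this file): MAX_IRQ = 256 + 4 + 4 = 264,
 * so (MAX_IRQ + 63) & ~63 = 320 bits.  BITS_TO_LONGS(320) is 5 longs on
 * a 64-bit host and 10 longs on a 32-bit host -- 40 bytes either way,
 * which is the stable queue size the comment above is aiming for.
 */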
144
145struct irq_source {
146 uint32_t ivpr; /* IRQ vector/priority register */
147 uint32_t idr; /* IRQ destination register */
148 uint32_t destmask; /* bitmap of CPU destinations */
149 int last_cpu;
150 int output; /* IRQ level, e.g. ILR_INTTGT_INT */
151 int pending; /* TRUE if IRQ is pending */
152 enum irq_type type;
153 bool level:1; /* level-triggered */
154 bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */
155};
156
157#define IVPR_MASK_SHIFT 31
158#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT)
159#define IVPR_ACTIVITY_SHIFT 30
160#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT)
161#define IVPR_MODE_SHIFT 29
162#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT)
163#define IVPR_POLARITY_SHIFT 23
164#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT)
165#define IVPR_SENSE_SHIFT 22
166#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT)
167
168#define IVPR_PRIORITY_MASK (0xF << 16)
169#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
170#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
171
172/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
173#define IDR_EP 0x80000000 /* external pin */
174#define IDR_CI 0x40000000 /* critical interrupt */
175
176struct irq_dest {
177 struct kvm_vcpu *vcpu;
178
179 int32_t ctpr; /* CPU current task priority */
180 struct irq_queue raised;
181 struct irq_queue servicing;
182
183 /* Count of IRQ sources asserting on non-INT outputs */
184 uint32_t outputs_active[NUM_OUTPUTS];
185};
186
187#define MAX_MMIO_REGIONS 10
188
189struct openpic {
190 struct kvm *kvm;
191 struct kvm_device *dev;
192 struct kvm_io_device mmio;
193 const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
194 int num_mmio_regions;
195
196 gpa_t reg_base;
197 spinlock_t lock;
198
199 /* Behavior control */
200 struct fsl_mpic_info *fsl;
201 uint32_t model;
202 uint32_t flags;
203 uint32_t nb_irqs;
204 uint32_t vid;
205 uint32_t vir; /* Vendor identification register */
206 uint32_t vector_mask;
207 uint32_t tfrr_reset;
208 uint32_t ivpr_reset;
209 uint32_t idr_reset;
210 uint32_t brr1;
211 uint32_t mpic_mode_mask;
212
213 /* Global registers */
214 uint32_t frr; /* Feature reporting register */
215 uint32_t gcr; /* Global configuration register */
216 uint32_t pir; /* Processor initialization register */
217 uint32_t spve; /* Spurious vector register */
218 uint32_t tfrr; /* Timer frequency reporting register */
219 /* Source registers */
220 struct irq_source src[MAX_IRQ];
 221 /* Per-CPU (destination) registers */
222 struct irq_dest dst[MAX_CPU];
223 uint32_t nb_cpus;
224 /* Timer registers */
225 struct {
226 uint32_t tccr; /* Global timer current count register */
227 uint32_t tbcr; /* Global timer base count register */
228 } timers[MAX_TMR];
229 /* Shared MSI registers */
230 struct {
231 uint32_t msir; /* Shared Message Signaled Interrupt Register */
232 } msi[MAX_MSI];
233 uint32_t max_irq;
234 uint32_t irq_ipi0;
235 uint32_t irq_tim0;
236 uint32_t irq_msi;
237};
238
239
240static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
241 int output)
242{
243 struct kvm_interrupt irq = {
244 .irq = KVM_INTERRUPT_SET_LEVEL,
245 };
246
247 if (!dst->vcpu) {
248 pr_debug("%s: destination cpu %d does not exist\n",
249 __func__, (int)(dst - &opp->dst[0]));
250 return;
251 }
252
253 pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
254 output);
255
256 if (output != ILR_INTTGT_INT) /* TODO */
257 return;
258
259 kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
260}
261
262static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
263 int output)
264{
265 if (!dst->vcpu) {
266 pr_debug("%s: destination cpu %d does not exist\n",
267 __func__, (int)(dst - &opp->dst[0]));
268 return;
269 }
270
271 pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
272 output);
273
274 if (output != ILR_INTTGT_INT) /* TODO */
275 return;
276
277 kvmppc_core_dequeue_external(dst->vcpu);
278}
279
280static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
281{
282 set_bit(n_IRQ, q->queue);
283}
284
285static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
286{
287 clear_bit(n_IRQ, q->queue);
288}
289
290static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ)
291{
292 return test_bit(n_IRQ, q->queue);
293}
294
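/* Scan a queue for the highest-priority pending IRQ and cache the result in
 * q->next/q->priority (both -1 if the queue is empty).
 */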
295static void IRQ_check(struct openpic *opp, struct irq_queue *q)
296{
297 int irq = -1;
298 int next = -1;
299 int priority = -1;
300
301 for (;;) {
302 irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
303 if (irq == opp->max_irq)
304 break;
305
306 pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
307 irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
308
309 if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
310 next = irq;
311 priority = IVPR_PRIORITY(opp->src[irq].ivpr);
312 }
313 }
314
315 q->next = next;
316 q->priority = priority;
317}
318
319static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
320{
321 /* XXX: optimize */
322 IRQ_check(opp, q);
323
324 return q->next;
325}
326
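/* Deliver (or withdraw) interrupt n_IRQ to one CPU's queues and raise or
 * lower that CPU's output as needed. Non-INT outputs (critical interrupt,
 * machine check) bypass the priority and queueing logic entirely.
 */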
327static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
328 bool active, bool was_active)
329{
330 struct irq_dest *dst;
331 struct irq_source *src;
332 int priority;
333
334 dst = &opp->dst[n_CPU];
335 src = &opp->src[n_IRQ];
336
337 pr_debug("%s: IRQ %d active %d was %d\n",
338 __func__, n_IRQ, active, was_active);
339
340 if (src->output != ILR_INTTGT_INT) {
341 pr_debug("%s: output %d irq %d active %d was %d count %d\n",
342 __func__, src->output, n_IRQ, active, was_active,
343 dst->outputs_active[src->output]);
344
345 /* On Freescale MPIC, critical interrupts ignore priority,
346 * IACK, EOI, etc. Before MPIC v4.1 they also ignore
347 * masking.
348 */
349 if (active) {
350 if (!was_active &&
351 dst->outputs_active[src->output]++ == 0) {
352 pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
353 __func__, src->output, n_CPU, n_IRQ);
354 mpic_irq_raise(opp, dst, src->output);
355 }
356 } else {
357 if (was_active &&
358 --dst->outputs_active[src->output] == 0) {
359 pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
360 __func__, src->output, n_CPU, n_IRQ);
361 mpic_irq_lower(opp, dst, src->output);
362 }
363 }
364
365 return;
366 }
367
368 priority = IVPR_PRIORITY(src->ivpr);
369
370 /* Even if the interrupt doesn't have enough priority,
371 * it is still raised, in case ctpr is lowered later.
372 */
373 if (active)
374 IRQ_setbit(&dst->raised, n_IRQ);
375 else
376 IRQ_resetbit(&dst->raised, n_IRQ);
377
378 IRQ_check(opp, &dst->raised);
379
380 if (active && priority <= dst->ctpr) {
381 pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
382 __func__, n_IRQ, priority, dst->ctpr, n_CPU);
383 active = 0;
384 }
385
386 if (active) {
387 if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
388 priority <= dst->servicing.priority) {
389 pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
390 __func__, n_IRQ, dst->servicing.next, n_CPU);
391 } else {
392 pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
393 __func__, n_CPU, n_IRQ, dst->raised.next);
394 mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
395 }
396 } else {
397 IRQ_get_next(opp, &dst->servicing);
398 if (dst->raised.priority > dst->ctpr &&
399 dst->raised.priority > dst->servicing.priority) {
400 pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
401 __func__, n_IRQ, dst->raised.next,
402 dst->raised.priority, dst->ctpr,
403 dst->servicing.priority, n_CPU);
404 /* IRQ line stays asserted */
405 } else {
406 pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
407 __func__, n_IRQ, dst->ctpr,
408 dst->servicing.priority, n_CPU);
409 mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
410 }
411 }
412}
413
414/* update pic state because registers for n_IRQ have changed value */
415static void openpic_update_irq(struct openpic *opp, int n_IRQ)
416{
417 struct irq_source *src;
418 bool active, was_active;
419 int i;
420
421 src = &opp->src[n_IRQ];
422 active = src->pending;
423
424 if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
425 /* Interrupt source is disabled */
426 pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
427 active = false;
428 }
429
430 was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
431
432 /*
433 * We don't have a similar check for already-active because
434 * ctpr may have changed and we need to withdraw the interrupt.
435 */
436 if (!active && !was_active) {
437 pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
438 return;
439 }
440
441 if (active)
442 src->ivpr |= IVPR_ACTIVITY_MASK;
443 else
444 src->ivpr &= ~IVPR_ACTIVITY_MASK;
445
446 if (src->destmask == 0) {
447 /* No target */
448 pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
449 return;
450 }
451
452 if (src->destmask == (1 << src->last_cpu)) {
453 /* Only one CPU is allowed to receive this IRQ */
454 IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
455 } else if (!(src->ivpr & IVPR_MODE_MASK)) {
456 /* Directed delivery mode */
457 for (i = 0; i < opp->nb_cpus; i++) {
458 if (src->destmask & (1 << i)) {
459 IRQ_local_pipe(opp, i, n_IRQ, active,
460 was_active);
461 }
462 }
463 } else {
464 /* Distributed delivery mode */
465 for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
466 if (i == opp->nb_cpus)
467 i = 0;
468
469 if (src->destmask & (1 << i)) {
470 IRQ_local_pipe(opp, i, n_IRQ, active,
471 was_active);
472 src->last_cpu = i;
473 break;
474 }
475 }
476 }
477}
478
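/* Entry point for changing the state of an IRQ input line (also used for the
 * timer, IPI and MSI pseudo-sources). Level-triggered sources track the line
 * directly; edge-triggered sources latch a pending bit on the rising edge.
 */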
479static void openpic_set_irq(void *opaque, int n_IRQ, int level)
480{
481 struct openpic *opp = opaque;
482 struct irq_source *src;
483
484 if (n_IRQ >= MAX_IRQ) {
485 WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
486 return;
487 }
488
489 src = &opp->src[n_IRQ];
490 pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
491 n_IRQ, level, src->ivpr);
492 if (src->level) {
493 /* level-sensitive irq */
494 src->pending = level;
495 openpic_update_irq(opp, n_IRQ);
496 } else {
497 /* edge-sensitive irq */
498 if (level) {
499 src->pending = 1;
500 openpic_update_irq(opp, n_IRQ);
501 }
502
503 if (src->output != ILR_INTTGT_INT) {
504 /* Edge-triggered interrupts shouldn't be used
505 * with non-INT delivery, but just in case,
506 * try to make it do something sane rather than
507 * cause an interrupt storm. This is close to
508 * what you'd probably see happen in real hardware.
509 */
510 src->pending = 0;
511 openpic_update_irq(opp, n_IRQ);
512 }
513 }
514}
515
516static void openpic_reset(struct openpic *opp)
517{
518 int i;
519
520 opp->gcr = GCR_RESET;
521 /* Initialise controller registers */
522 opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
523 (opp->vid << FRR_VID_SHIFT);
524
525 opp->pir = 0;
526 opp->spve = -1 & opp->vector_mask;
527 opp->tfrr = opp->tfrr_reset;
528 /* Initialise IRQ sources */
529 for (i = 0; i < opp->max_irq; i++) {
530 opp->src[i].ivpr = opp->ivpr_reset;
531 opp->src[i].idr = opp->idr_reset;
532
533 switch (opp->src[i].type) {
534 case IRQ_TYPE_NORMAL:
535 opp->src[i].level =
536 !!(opp->ivpr_reset & IVPR_SENSE_MASK);
537 break;
538
539 case IRQ_TYPE_FSLINT:
540 opp->src[i].ivpr |= IVPR_POLARITY_MASK;
541 break;
542
543 case IRQ_TYPE_FSLSPECIAL:
544 break;
545 }
546 }
547 /* Initialise IRQ destinations */
548 for (i = 0; i < MAX_CPU; i++) {
549 opp->dst[i].ctpr = 15;
550 memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
551 opp->dst[i].raised.next = -1;
552 memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
553 opp->dst[i].servicing.next = -1;
554 }
555 /* Initialise timers */
556 for (i = 0; i < MAX_TMR; i++) {
557 opp->timers[i].tccr = 0;
558 opp->timers[i].tbcr = TBCR_CI;
559 }
560 /* Go out of RESET state */
561 opp->gcr = 0;
562}
563
564static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
565{
566 return opp->src[n_IRQ].idr;
567}
568
569static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
570{
571 if (opp->flags & OPENPIC_FLAG_ILR)
572 return opp->src[n_IRQ].output;
573
574 return 0xffffffff;
575}
576
577static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
578{
579 return opp->src[n_IRQ].ivpr;
580}
581
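/* Decode an IDR write into the destination CPU mask. On FSL MPICs with
 * critical-interrupt support (OPENPIC_FLAG_IDR_CRIT), the CI bits near the
 * top of the register steer the source to the critical output and make it
 * ignore IVPR masking; otherwise the IDR is used as the destination mask
 * directly.
 */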
582static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
583 uint32_t val)
584{
585 struct irq_source *src = &opp->src[n_IRQ];
586 uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
587 uint32_t crit_mask = 0;
588 uint32_t mask = normal_mask;
589 int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
590 int i;
591
592 if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
593 crit_mask = mask << crit_shift;
594 mask |= crit_mask | IDR_EP;
595 }
596
597 src->idr = val & mask;
598 pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
599
600 if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
601 if (src->idr & crit_mask) {
602 if (src->idr & normal_mask) {
603 pr_debug("%s: IRQ configured for multiple output types, using critical\n",
604 __func__);
605 }
606
607 src->output = ILR_INTTGT_CINT;
608 src->nomask = true;
609 src->destmask = 0;
610
611 for (i = 0; i < opp->nb_cpus; i++) {
612 int n_ci = IDR_CI0_SHIFT - i;
613
614 if (src->idr & (1UL << n_ci))
615 src->destmask |= 1UL << i;
616 }
617 } else {
618 src->output = ILR_INTTGT_INT;
619 src->nomask = false;
620 src->destmask = src->idr & normal_mask;
621 }
622 } else {
623 src->destmask = src->idr;
624 }
625}
626
627static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
628 uint32_t val)
629{
630 if (opp->flags & OPENPIC_FLAG_ILR) {
631 struct irq_source *src = &opp->src[n_IRQ];
632
633 src->output = val & ILR_INTTGT_MASK;
634 pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
635 src->output);
636
637 /* TODO: on MPIC v4.0 only, set nomask for non-INT */
638 }
639}
640
641static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
642 uint32_t val)
643{
644 uint32_t mask;
645
646 /* NOTE when implementing newer FSL MPIC models: starting with v4.0,
647 * the polarity bit is read-only on internal interrupts.
648 */
649 mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
650 IVPR_POLARITY_MASK | opp->vector_mask;
651
652 /* ACTIVITY bit is read-only */
653 opp->src[n_IRQ].ivpr =
654 (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
655
 656 /* For FSL internal interrupts, the sense bit is reserved and zero,
657 * and the interrupt is always level-triggered. Timers and IPIs
658 * have no sense or polarity bits, and are edge-triggered.
659 */
660 switch (opp->src[n_IRQ].type) {
661 case IRQ_TYPE_NORMAL:
662 opp->src[n_IRQ].level =
663 !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
664 break;
665
666 case IRQ_TYPE_FSLINT:
667 opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
668 break;
669
670 case IRQ_TYPE_FSLSPECIAL:
671 opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
672 break;
673 }
674
675 openpic_update_irq(opp, n_IRQ);
676 pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
677 opp->src[n_IRQ].ivpr);
678}
679
680static void openpic_gcr_write(struct openpic *opp, uint64_t val)
681{
682 if (val & GCR_RESET) {
683 openpic_reset(opp);
684 return;
685 }
686
687 opp->gcr &= ~opp->mpic_mode_mask;
688 opp->gcr |= val & opp->mpic_mode_mask;
689}
690
691static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
692{
693 struct openpic *opp = opaque;
694 int err = 0;
695
696 pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
697 if (addr & 0xF)
698 return 0;
699
700 switch (addr) {
 701 case 0x00: /* Block Revision Register 1 (BRR1) is read-only */
702 break;
703 case 0x40:
704 case 0x50:
705 case 0x60:
706 case 0x70:
707 case 0x80:
708 case 0x90:
709 case 0xA0:
710 case 0xB0:
711 err = openpic_cpu_write_internal(opp, addr, val,
712 get_current_cpu());
713 break;
714 case 0x1000: /* FRR */
715 break;
716 case 0x1020: /* GCR */
717 openpic_gcr_write(opp, val);
718 break;
719 case 0x1080: /* VIR */
720 break;
721 case 0x1090: /* PIR */
722 /*
723 * This register is used to reset a CPU core --
724 * let userspace handle it.
725 */
726 err = -ENXIO;
727 break;
728 case 0x10A0: /* IPI_IVPR */
729 case 0x10B0:
730 case 0x10C0:
731 case 0x10D0: {
732 int idx;
733 idx = (addr - 0x10A0) >> 4;
734 write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
735 break;
736 }
737 case 0x10E0: /* SPVE */
738 opp->spve = val & opp->vector_mask;
739 break;
740 default:
741 break;
742 }
743
744 return err;
745}
746
747static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
748{
749 struct openpic *opp = opaque;
750 u32 retval;
751 int err = 0;
752
753 pr_debug("%s: addr %#llx\n", __func__, addr);
754 retval = 0xFFFFFFFF;
755 if (addr & 0xF)
756 goto out;
757
758 switch (addr) {
759 case 0x1000: /* FRR */
760 retval = opp->frr;
761 retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
762 break;
763 case 0x1020: /* GCR */
764 retval = opp->gcr;
765 break;
766 case 0x1080: /* VIR */
767 retval = opp->vir;
768 break;
769 case 0x1090: /* PIR */
770 retval = 0x00000000;
771 break;
 772 case 0x00: /* Block Revision Register 1 (BRR1) */
773 retval = opp->brr1;
774 break;
775 case 0x40:
776 case 0x50:
777 case 0x60:
778 case 0x70:
779 case 0x80:
780 case 0x90:
781 case 0xA0:
782 case 0xB0:
783 err = openpic_cpu_read_internal(opp, addr,
784 &retval, get_current_cpu());
785 break;
786 case 0x10A0: /* IPI_IVPR */
787 case 0x10B0:
788 case 0x10C0:
789 case 0x10D0:
790 {
791 int idx;
792 idx = (addr - 0x10A0) >> 4;
793 retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
794 }
795 break;
796 case 0x10E0: /* SPVE */
797 retval = opp->spve;
798 break;
799 default:
800 break;
801 }
802
803out:
804 pr_debug("%s: => 0x%08x\n", __func__, retval);
805 *ptr = retval;
806 return err;
807}
808
809static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
810{
811 struct openpic *opp = opaque;
812 int idx;
813
814 addr += 0x10f0;
815
816 pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
817 if (addr & 0xF)
818 return 0;
819
820 if (addr == 0x10f0) {
821 /* TFRR */
822 opp->tfrr = val;
823 return 0;
824 }
825
826 idx = (addr >> 6) & 0x3;
827 addr = addr & 0x30;
828
829 switch (addr & 0x30) {
830 case 0x00: /* TCCR */
831 break;
832 case 0x10: /* TBCR */
833 if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
834 (val & TBCR_CI) == 0 &&
835 (opp->timers[idx].tbcr & TBCR_CI) != 0)
836 opp->timers[idx].tccr &= ~TCCR_TOG;
837
838 opp->timers[idx].tbcr = val;
839 break;
840 case 0x20: /* TVPR */
841 write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
842 break;
843 case 0x30: /* TDR */
844 write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
845 break;
846 }
847
848 return 0;
849}
850
851static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
852{
853 struct openpic *opp = opaque;
854 uint32_t retval = -1;
855 int idx;
856
857 pr_debug("%s: addr %#llx\n", __func__, addr);
858 if (addr & 0xF)
859 goto out;
860
861 idx = (addr >> 6) & 0x3;
862 if (addr == 0x0) {
863 /* TFRR */
864 retval = opp->tfrr;
865 goto out;
866 }
867
868 switch (addr & 0x30) {
869 case 0x00: /* TCCR */
870 retval = opp->timers[idx].tccr;
871 break;
872 case 0x10: /* TBCR */
873 retval = opp->timers[idx].tbcr;
874 break;
875 case 0x20: /* TIPV */
876 retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
877 break;
878 case 0x30: /* TIDE (TIDR) */
879 retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
880 break;
881 }
882
883out:
884 pr_debug("%s: => 0x%08x\n", __func__, retval);
885 *ptr = retval;
886 return 0;
887}
888
889static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
890{
891 struct openpic *opp = opaque;
892 int idx;
893
894 pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
895
896 addr = addr & 0xffff;
897 idx = addr >> 5;
898
899 switch (addr & 0x1f) {
900 case 0x00:
901 write_IRQreg_ivpr(opp, idx, val);
902 break;
903 case 0x10:
904 write_IRQreg_idr(opp, idx, val);
905 break;
906 case 0x18:
907 write_IRQreg_ilr(opp, idx, val);
908 break;
909 }
910
911 return 0;
912}
913
914static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
915{
916 struct openpic *opp = opaque;
917 uint32_t retval;
918 int idx;
919
920 pr_debug("%s: addr %#llx\n", __func__, addr);
921 retval = 0xFFFFFFFF;
922
923 addr = addr & 0xffff;
924 idx = addr >> 5;
925
926 switch (addr & 0x1f) {
927 case 0x00:
928 retval = read_IRQreg_ivpr(opp, idx);
929 break;
930 case 0x10:
931 retval = read_IRQreg_idr(opp, idx);
932 break;
933 case 0x18:
934 retval = read_IRQreg_ilr(opp, idx);
935 break;
936 }
937
938 pr_debug("%s: => 0x%08x\n", __func__, retval);
939 *ptr = retval;
940 return 0;
941}
942
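/* A write to MSIIR selects a shared MSI register (SRS field) and a bit within
 * it (IBS field); the corresponding source is raised and stays pending until
 * the guest reads the MSIR, which is clear-on-read.
 */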
943static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
944{
945 struct openpic *opp = opaque;
946 int idx = opp->irq_msi;
947 int srs, ibs;
948
949 pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
950 if (addr & 0xF)
951 return 0;
952
953 switch (addr) {
954 case MSIIR_OFFSET:
955 srs = val >> MSIIR_SRS_SHIFT;
956 idx += srs;
957 ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
958 opp->msi[srs].msir |= 1 << ibs;
959 openpic_set_irq(opp, idx, 1);
960 break;
961 default:
962 /* most registers are read-only, thus ignored */
963 break;
964 }
965
966 return 0;
967}
968
969static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
970{
971 struct openpic *opp = opaque;
972 uint32_t r = 0;
973 int i, srs;
974
975 pr_debug("%s: addr %#llx\n", __func__, addr);
976 if (addr & 0xF)
977 return -ENXIO;
978
979 srs = addr >> 4;
980
981 switch (addr) {
982 case 0x00:
983 case 0x10:
984 case 0x20:
985 case 0x30:
986 case 0x40:
987 case 0x50:
988 case 0x60:
989 case 0x70: /* MSIRs */
990 r = opp->msi[srs].msir;
991 /* Clear on read */
992 opp->msi[srs].msir = 0;
993 openpic_set_irq(opp, opp->irq_msi + srs, 0);
994 break;
995 case 0x120: /* MSISR */
996 for (i = 0; i < MAX_MSI; i++)
997 r |= (opp->msi[i].msir ? 1 : 0) << i;
998 break;
999 }
1000
1001 pr_debug("%s: => 0x%08x\n", __func__, r);
1002 *ptr = r;
1003 return 0;
1004}
1005
1006static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
1007{
1008 uint32_t r = 0;
1009
1010 pr_debug("%s: addr %#llx\n", __func__, addr);
1011
1012 /* TODO: EISR/EIMR */
1013
1014 *ptr = r;
1015 return 0;
1016}
1017
1018static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
1019{
1020 pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
1021
1022 /* TODO: EISR/EIMR */
1023 return 0;
1024}
1025
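/* Per-CPU register writes: IPI dispatch, current task priority (CTPR) updates
 * and end-of-interrupt (EOI) handling. Called with opp->lock held; the EOI
 * path temporarily drops the lock to call kvm_notify_acked_irq().
 */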
1026static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
1027 u32 val, int idx)
1028{
1029 struct openpic *opp = opaque;
1030 struct irq_source *src;
1031 struct irq_dest *dst;
1032 int s_IRQ, n_IRQ;
1033
1034 pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
1035 addr, val);
1036
1037 if (idx < 0)
1038 return 0;
1039
1040 if (addr & 0xF)
1041 return 0;
1042
1043 dst = &opp->dst[idx];
1044 addr &= 0xFF0;
1045 switch (addr) {
1046 case 0x40: /* IPIDR */
1047 case 0x50:
1048 case 0x60:
1049 case 0x70:
1050 idx = (addr - 0x40) >> 4;
 1051 /* we use IDE as a mask of which CPUs still need the IPI delivered */
1052 opp->src[opp->irq_ipi0 + idx].destmask |= val;
1053 openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
1054 openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
1055 break;
1056 case 0x80: /* CTPR */
1057 dst->ctpr = val & 0x0000000F;
1058
1059 pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
1060 __func__, idx, dst->ctpr, dst->raised.priority,
1061 dst->servicing.priority);
1062
1063 if (dst->raised.priority <= dst->ctpr) {
1064 pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
1065 __func__, idx);
1066 mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
1067 } else if (dst->raised.priority > dst->servicing.priority) {
1068 pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
1069 __func__, idx, dst->raised.next);
1070 mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
1071 }
1072
1073 break;
1074 case 0x90: /* WHOAMI */
1075 /* Read-only register */
1076 break;
1077 case 0xA0: /* IACK */
1078 /* Read-only register */
1079 break;
1080 case 0xB0: { /* EOI */
1081 int notify_eoi;
1082
1083 pr_debug("EOI\n");
1084 s_IRQ = IRQ_get_next(opp, &dst->servicing);
1085
1086 if (s_IRQ < 0) {
1087 pr_debug("%s: EOI with no interrupt in service\n",
1088 __func__);
1089 break;
1090 }
1091
1092 IRQ_resetbit(&dst->servicing, s_IRQ);
1093 /* Notify listeners that the IRQ is over */
1094 notify_eoi = s_IRQ;
1095 /* Set up next servicing IRQ */
1096 s_IRQ = IRQ_get_next(opp, &dst->servicing);
1097 /* Check queued interrupts. */
1098 n_IRQ = IRQ_get_next(opp, &dst->raised);
1099 src = &opp->src[n_IRQ];
1100 if (n_IRQ != -1 &&
1101 (s_IRQ == -1 ||
1102 IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
1103 pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
1104 idx, n_IRQ);
1105 mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
1106 }
1107
1108 spin_unlock(&opp->lock);
1109 kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
1110 spin_lock(&opp->lock);
1111
1112 break;
1113 }
1114 default:
1115 break;
1116 }
1117
1118 return 0;
1119}
1120
1121static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
1122{
1123 struct openpic *opp = opaque;
1124
1125 return openpic_cpu_write_internal(opp, addr, val,
1126 (addr & 0x1f000) >> 12);
1127}
1128
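/* Interrupt acknowledge: lower the INT output, take the highest-priority
 * raised IRQ, move it to the servicing queue and return its vector, or the
 * spurious vector if nothing suitable is pending. IPIs are re-triggered for
 * any destination CPUs that have not acknowledged them yet.
 */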
1129static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
1130 int cpu)
1131{
1132 struct irq_source *src;
1133 int retval, irq;
1134
1135 pr_debug("Lower OpenPIC INT output\n");
1136 mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
1137
1138 irq = IRQ_get_next(opp, &dst->raised);
1139 pr_debug("IACK: irq=%d\n", irq);
1140
1141 if (irq == -1)
 1142 /* No interrupt pending */
1143 return opp->spve;
1144
1145 src = &opp->src[irq];
1146 if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
1147 !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
1148 pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
1149 __func__, irq, dst->ctpr, src->ivpr);
1150 openpic_update_irq(opp, irq);
1151 retval = opp->spve;
1152 } else {
 1153 /* IRQ enters servicing state */
1154 IRQ_setbit(&dst->servicing, irq);
1155 retval = IVPR_VECTOR(opp, src->ivpr);
1156 }
1157
1158 if (!src->level) {
1159 /* edge-sensitive IRQ */
1160 src->ivpr &= ~IVPR_ACTIVITY_MASK;
1161 src->pending = 0;
1162 IRQ_resetbit(&dst->raised, irq);
1163 }
1164
1165 if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
1166 src->destmask &= ~(1 << cpu);
1167 if (src->destmask && !src->level) {
1168 /* trigger on CPUs that didn't know about it yet */
1169 openpic_set_irq(opp, irq, 1);
1170 openpic_set_irq(opp, irq, 0);
1171 /* if all CPUs knew about it, set active bit again */
1172 src->ivpr |= IVPR_ACTIVITY_MASK;
1173 }
1174 }
1175
1176 return retval;
1177}
1178
1179void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
1180{
1181 struct openpic *opp = vcpu->arch.mpic;
1182 int cpu = vcpu->arch.irq_cpu_id;
1183 unsigned long flags;
1184
1185 spin_lock_irqsave(&opp->lock, flags);
1186
1187 if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
1188 kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
1189
1190 spin_unlock_irqrestore(&opp->lock, flags);
1191}
1192
1193static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
1194 u32 *ptr, int idx)
1195{
1196 struct openpic *opp = opaque;
1197 struct irq_dest *dst;
1198 uint32_t retval;
1199
1200 pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
1201 retval = 0xFFFFFFFF;
1202
1203 if (idx < 0)
1204 goto out;
1205
1206 if (addr & 0xF)
1207 goto out;
1208
1209 dst = &opp->dst[idx];
1210 addr &= 0xFF0;
1211 switch (addr) {
1212 case 0x80: /* CTPR */
1213 retval = dst->ctpr;
1214 break;
1215 case 0x90: /* WHOAMI */
1216 retval = idx;
1217 break;
1218 case 0xA0: /* IACK */
1219 retval = openpic_iack(opp, dst, idx);
1220 break;
1221 case 0xB0: /* EOI */
1222 retval = 0;
1223 break;
1224 default:
1225 break;
1226 }
1227 pr_debug("%s: => 0x%08x\n", __func__, retval);
1228
1229out:
1230 *ptr = retval;
1231 return 0;
1232}
1233
1234static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
1235{
1236 struct openpic *opp = opaque;
1237
1238 return openpic_cpu_read_internal(opp, addr, ptr,
1239 (addr & 0x1f000) >> 12);
1240}
1241
1242struct mem_reg {
1243 int (*read)(void *opaque, gpa_t addr, u32 *ptr);
1244 int (*write)(void *opaque, gpa_t addr, u32 val);
1245 gpa_t start_addr;
1246 int size;
1247};
1248
1249static const struct mem_reg openpic_gbl_mmio = {
1250 .write = openpic_gbl_write,
1251 .read = openpic_gbl_read,
1252 .start_addr = OPENPIC_GLB_REG_START,
1253 .size = OPENPIC_GLB_REG_SIZE,
1254};
1255
1256static const struct mem_reg openpic_tmr_mmio = {
1257 .write = openpic_tmr_write,
1258 .read = openpic_tmr_read,
1259 .start_addr = OPENPIC_TMR_REG_START,
1260 .size = OPENPIC_TMR_REG_SIZE,
1261};
1262
1263static const struct mem_reg openpic_cpu_mmio = {
1264 .write = openpic_cpu_write,
1265 .read = openpic_cpu_read,
1266 .start_addr = OPENPIC_CPU_REG_START,
1267 .size = OPENPIC_CPU_REG_SIZE,
1268};
1269
1270static const struct mem_reg openpic_src_mmio = {
1271 .write = openpic_src_write,
1272 .read = openpic_src_read,
1273 .start_addr = OPENPIC_SRC_REG_START,
1274 .size = OPENPIC_SRC_REG_SIZE,
1275};
1276
1277static const struct mem_reg openpic_msi_mmio = {
1278 .read = openpic_msi_read,
1279 .write = openpic_msi_write,
1280 .start_addr = OPENPIC_MSI_REG_START,
1281 .size = OPENPIC_MSI_REG_SIZE,
1282};
1283
1284static const struct mem_reg openpic_summary_mmio = {
1285 .read = openpic_summary_read,
1286 .write = openpic_summary_write,
1287 .start_addr = OPENPIC_SUMMARY_REG_START,
1288 .size = OPENPIC_SUMMARY_REG_SIZE,
1289};
1290
1291static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
1292{
1293 if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
1294 WARN(1, "kvm mpic: too many mmio regions\n");
1295 return;
1296 }
1297
1298 opp->mmio_regions[opp->num_mmio_regions++] = mr;
1299}
1300
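/* Common setup for the Freescale MPIC models: registers the MSI and summary
 * MMIO regions, makes external sources edge-sensitive, internal sources
 * (including messages and MSIs) level-sensitive, and timers/IPIs FSL-special;
 * shared MSI sources start at virtual IRQ 224.
 */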
1301static void fsl_common_init(struct openpic *opp)
1302{
1303 int i;
1304 int virq = MAX_SRC;
1305
1306 add_mmio_region(opp, &openpic_msi_mmio);
1307 add_mmio_region(opp, &openpic_summary_mmio);
1308
1309 opp->vid = VID_REVISION_1_2;
1310 opp->vir = VIR_GENERIC;
1311 opp->vector_mask = 0xFFFF;
1312 opp->tfrr_reset = 0;
1313 opp->ivpr_reset = IVPR_MASK_MASK;
1314 opp->idr_reset = 1 << 0;
1315 opp->max_irq = MAX_IRQ;
1316
1317 opp->irq_ipi0 = virq;
1318 virq += MAX_IPI;
1319 opp->irq_tim0 = virq;
1320 virq += MAX_TMR;
1321
1322 BUG_ON(virq > MAX_IRQ);
1323
1324 opp->irq_msi = 224;
1325
1326 for (i = 0; i < opp->fsl->max_ext; i++)
1327 opp->src[i].level = false;
1328
1329 /* Internal interrupts, including message and MSI */
1330 for (i = 16; i < MAX_SRC; i++) {
1331 opp->src[i].type = IRQ_TYPE_FSLINT;
1332 opp->src[i].level = true;
1333 }
1334
1335 /* timers and IPIs */
1336 for (i = MAX_SRC; i < virq; i++) {
1337 opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
1338 opp->src[i].level = false;
1339 }
1340}
1341
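/* MMIO dispatch: walk the registered sub-regions and forward the access to
 * the first one whose range covers the controller-relative address.
 */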
1342static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
1343{
1344 int i;
1345
1346 for (i = 0; i < opp->num_mmio_regions; i++) {
1347 const struct mem_reg *mr = opp->mmio_regions[i];
1348
1349 if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
1350 continue;
1351
1352 return mr->read(opp, addr - mr->start_addr, ptr);
1353 }
1354
1355 return -ENXIO;
1356}
1357
1358static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
1359{
1360 int i;
1361
1362 for (i = 0; i < opp->num_mmio_regions; i++) {
1363 const struct mem_reg *mr = opp->mmio_regions[i];
1364
1365 if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
1366 continue;
1367
1368 return mr->write(opp, addr - mr->start_addr, val);
1369 }
1370
1371 return -ENXIO;
1372}
1373
1374static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
1375 int len, void *ptr)
1376{
1377 struct openpic *opp = container_of(this, struct openpic, mmio);
1378 int ret;
1379 union {
1380 u32 val;
1381 u8 bytes[4];
1382 } u;
1383
1384 if (addr & (len - 1)) {
1385 pr_debug("%s: bad alignment %llx/%d\n",
1386 __func__, addr, len);
1387 return -EINVAL;
1388 }
1389
1390 spin_lock_irq(&opp->lock);
1391 ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
1392 spin_unlock_irq(&opp->lock);
1393
1394 /*
1395 * Technically only 32-bit accesses are allowed, but be nice to
1396 * people dumping registers a byte at a time -- it works in real
1397 * hardware (reads only, not writes).
1398 */
1399 if (len == 4) {
1400 *(u32 *)ptr = u.val;
1401 pr_debug("%s: addr %llx ret %d len 4 val %x\n",
1402 __func__, addr, ret, u.val);
1403 } else if (len == 1) {
1404 *(u8 *)ptr = u.bytes[addr & 3];
1405 pr_debug("%s: addr %llx ret %d len 1 val %x\n",
1406 __func__, addr, ret, u.bytes[addr & 3]);
1407 } else {
1408 pr_debug("%s: bad length %d\n", __func__, len);
1409 return -EINVAL;
1410 }
1411
1412 return ret;
1413}
1414
1415static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
1416 int len, const void *ptr)
1417{
1418 struct openpic *opp = container_of(this, struct openpic, mmio);
1419 int ret;
1420
1421 if (len != 4) {
1422 pr_debug("%s: bad length %d\n", __func__, len);
1423 return -EOPNOTSUPP;
1424 }
1425 if (addr & 3) {
1426 pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
1427 return -EOPNOTSUPP;
1428 }
1429
1430 spin_lock_irq(&opp->lock);
1431 ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
1432 *(const u32 *)ptr);
1433 spin_unlock_irq(&opp->lock);
1434
1435 pr_debug("%s: addr %llx ret %d val %x\n",
1436 __func__, addr, ret, *(const u32 *)ptr);
1437
1438 return ret;
1439}
1440
1441static const struct kvm_io_device_ops mpic_mmio_ops = {
1442 .read = kvm_mpic_read,
1443 .write = kvm_mpic_write,
1444};
1445
1446static void map_mmio(struct openpic *opp)
1447{
1448 kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
1449
1450 kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
1451 opp->reg_base, OPENPIC_REG_SIZE,
1452 &opp->mmio);
1453}
1454
1455static void unmap_mmio(struct openpic *opp)
1456{
1457 kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
1458}
1459
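/* KVM_DEV_MPIC_BASE_ADDR: userspace supplies the guest-physical base of the
 * register block, which must be 256 KiB aligned; a base of 0 leaves the
 * controller unmapped from the MMIO bus.
 */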
1460static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
1461{
1462 u64 base;
1463
1464 if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
1465 return -EFAULT;
1466
1467 if (base & 0x3ffff) {
1468 pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
1469 __func__, base);
1470 return -EINVAL;
1471 }
1472
1473 if (base == opp->reg_base)
1474 return 0;
1475
1476 mutex_lock(&opp->kvm->slots_lock);
1477
1478 unmap_mmio(opp);
1479 opp->reg_base = base;
1480
1481 pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
1482 __func__, base);
1483
1484 if (base == 0)
1485 goto out;
1486
1487 map_mmio(opp);
1488
1489out:
1490 mutex_unlock(&opp->kvm->slots_lock);
1491 return 0;
1492}
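/*
 * Illustrative userspace sketch (not part of this file; error handling is
 * omitted and the base address is a made-up, platform-specific example):
 *
 *	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_FSL_MPIC_42 };
 *	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
 *
 *	__u64 base = 0xe0040000;
 *	struct kvm_device_attr attr = {
 *		.group = KVM_DEV_MPIC_GRP_MISC,
 *		.attr = KVM_DEV_MPIC_BASE_ADDR,
 *		.addr = (__u64)(unsigned long)&base,
 *	};
 *	ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
 */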
1493
1494#define ATTR_SET 0
1495#define ATTR_GET 1
1496
1497static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
1498{
1499 int ret;
1500
1501 if (addr & 3)
1502 return -ENXIO;
1503
1504 spin_lock_irq(&opp->lock);
1505
1506 if (type == ATTR_SET)
1507 ret = kvm_mpic_write_internal(opp, addr, *val);
1508 else
1509 ret = kvm_mpic_read_internal(opp, addr, val);
1510
1511 spin_unlock_irq(&opp->lock);
1512
1513 pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
1514
1515 return ret;
1516}
1517
1518static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1519{
1520 struct openpic *opp = dev->private;
1521 u32 attr32;
1522
1523 switch (attr->group) {
1524 case KVM_DEV_MPIC_GRP_MISC:
1525 switch (attr->attr) {
1526 case KVM_DEV_MPIC_BASE_ADDR:
1527 return set_base_addr(opp, attr);
1528 }
1529
1530 break;
1531
1532 case KVM_DEV_MPIC_GRP_REGISTER:
1533 if (get_user(attr32, (u32 __user *)(long)attr->addr))
1534 return -EFAULT;
1535
1536 return access_reg(opp, attr->attr, &attr32, ATTR_SET);
1537
1538 case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
1539 if (attr->attr > MAX_SRC)
1540 return -EINVAL;
1541
1542 if (get_user(attr32, (u32 __user *)(long)attr->addr))
1543 return -EFAULT;
1544
1545 if (attr32 != 0 && attr32 != 1)
1546 return -EINVAL;
1547
1548 spin_lock_irq(&opp->lock);
1549 openpic_set_irq(opp, attr->attr, attr32);
1550 spin_unlock_irq(&opp->lock);
1551 return 0;
1552 }
1553
1554 return -ENXIO;
1555}
1556
1557static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1558{
1559 struct openpic *opp = dev->private;
1560 u64 attr64;
1561 u32 attr32;
1562 int ret;
1563
1564 switch (attr->group) {
1565 case KVM_DEV_MPIC_GRP_MISC:
1566 switch (attr->attr) {
1567 case KVM_DEV_MPIC_BASE_ADDR:
1568 mutex_lock(&opp->kvm->slots_lock);
1569 attr64 = opp->reg_base;
1570 mutex_unlock(&opp->kvm->slots_lock);
1571
1572 if (copy_to_user((u64 __user *)(long)attr->addr,
1573 &attr64, sizeof(u64)))
1574 return -EFAULT;
1575
1576 return 0;
1577 }
1578
1579 break;
1580
1581 case KVM_DEV_MPIC_GRP_REGISTER:
1582 ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
1583 if (ret)
1584 return ret;
1585
1586 if (put_user(attr32, (u32 __user *)(long)attr->addr))
1587 return -EFAULT;
1588
1589 return 0;
1590
1591 case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
1592 if (attr->attr > MAX_SRC)
1593 return -EINVAL;
1594
1595 spin_lock_irq(&opp->lock);
1596 attr32 = opp->src[attr->attr].pending;
1597 spin_unlock_irq(&opp->lock);
1598
1599 if (put_user(attr32, (u32 __user *)(long)attr->addr))
1600 return -EFAULT;
1601
1602 return 0;
1603 }
1604
1605 return -ENXIO;
1606}
1607
1608static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
1609{
1610 switch (attr->group) {
1611 case KVM_DEV_MPIC_GRP_MISC:
1612 switch (attr->attr) {
1613 case KVM_DEV_MPIC_BASE_ADDR:
1614 return 0;
1615 }
1616
1617 break;
1618
1619 case KVM_DEV_MPIC_GRP_REGISTER:
1620 return 0;
1621
1622 case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
1623 if (attr->attr > MAX_SRC)
1624 break;
1625
1626 return 0;
1627 }
1628
1629 return -ENXIO;
1630}
1631
1632static void mpic_destroy(struct kvm_device *dev)
1633{
1634 struct openpic *opp = dev->private;
1635
1636 dev->kvm->arch.mpic = NULL;
1637 kfree(opp);
1638}
1639
1640static int mpic_set_default_irq_routing(struct openpic *opp)
1641{
1642 struct kvm_irq_routing_entry *routing;
1643
1644 /* Create a nop default map, so that dereferencing it still works */
1645 routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
1646 if (!routing)
1647 return -ENOMEM;
1648
1649 kvm_set_irq_routing(opp->kvm, routing, 0, 0);
1650
1651 kfree(routing);
1652 return 0;
1653}
1654
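/* Device creation: only the FSL MPIC v2.0 and v4.2 models are supported, and
 * only one MPIC per VM. The controller only becomes visible to vcpus through
 * kvm->arch.mpic once it is fully initialised (hence the smp_wmb()).
 */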
1655static int mpic_create(struct kvm_device *dev, u32 type)
1656{
1657 struct openpic *opp;
1658 int ret;
1659
1660 /* We only support one MPIC at a time for now */
1661 if (dev->kvm->arch.mpic)
1662 return -EINVAL;
1663
1664 opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
1665 if (!opp)
1666 return -ENOMEM;
1667
1668 dev->private = opp;
1669 opp->kvm = dev->kvm;
1670 opp->dev = dev;
1671 opp->model = type;
1672 spin_lock_init(&opp->lock);
1673
1674 add_mmio_region(opp, &openpic_gbl_mmio);
1675 add_mmio_region(opp, &openpic_tmr_mmio);
1676 add_mmio_region(opp, &openpic_src_mmio);
1677 add_mmio_region(opp, &openpic_cpu_mmio);
1678
1679 switch (opp->model) {
1680 case KVM_DEV_TYPE_FSL_MPIC_20:
1681 opp->fsl = &fsl_mpic_20;
1682 opp->brr1 = 0x00400200;
1683 opp->flags |= OPENPIC_FLAG_IDR_CRIT;
1684 opp->nb_irqs = 80;
1685 opp->mpic_mode_mask = GCR_MODE_MIXED;
1686
1687 fsl_common_init(opp);
1688
1689 break;
1690
1691 case KVM_DEV_TYPE_FSL_MPIC_42:
1692 opp->fsl = &fsl_mpic_42;
1693 opp->brr1 = 0x00400402;
1694 opp->flags |= OPENPIC_FLAG_ILR;
1695 opp->nb_irqs = 196;
1696 opp->mpic_mode_mask = GCR_MODE_PROXY;
1697
1698 fsl_common_init(opp);
1699
1700 break;
1701
1702 default:
1703 ret = -ENODEV;
1704 goto err;
1705 }
1706
1707 ret = mpic_set_default_irq_routing(opp);
1708 if (ret)
1709 goto err;
1710
1711 openpic_reset(opp);
1712
1713 smp_wmb();
1714 dev->kvm->arch.mpic = opp;
1715
1716 return 0;
1717
1718err:
1719 kfree(opp);
1720 return ret;
1721}
1722
1723struct kvm_device_ops kvm_mpic_ops = {
1724 .name = "kvm-mpic",
1725 .create = mpic_create,
1726 .destroy = mpic_destroy,
1727 .set_attr = mpic_set_attr,
1728 .get_attr = mpic_get_attr,
1729 .has_attr = mpic_has_attr,
1730};
1731
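/* Bind a vcpu to one of the MPIC's CPU interfaces. Fails if the slot is
 * already taken or the vcpu is already wired to another interrupt controller;
 * in proxy (coreint) mode the EPR is supplied by the kernel.
 */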
1732int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
1733 u32 cpu)
1734{
1735 struct openpic *opp = dev->private;
1736 int ret = 0;
1737
1738 if (dev->ops != &kvm_mpic_ops)
1739 return -EPERM;
1740 if (opp->kvm != vcpu->kvm)
1741 return -EPERM;
1742 if (cpu < 0 || cpu >= MAX_CPU)
1743 return -EPERM;
1744
1745 spin_lock_irq(&opp->lock);
1746
1747 if (opp->dst[cpu].vcpu) {
1748 ret = -EEXIST;
1749 goto out;
1750 }
1751 if (vcpu->arch.irq_type) {
1752 ret = -EBUSY;
1753 goto out;
1754 }
1755
1756 opp->dst[cpu].vcpu = vcpu;
1757 opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
1758
1759 vcpu->arch.mpic = opp;
1760 vcpu->arch.irq_cpu_id = cpu;
1761 vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
1762
1763 /* This might need to be changed if GCR gets extended */
1764 if (opp->mpic_mode_mask == GCR_MODE_PROXY)
1765 vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
1766
1767out:
1768 spin_unlock_irq(&opp->lock);
1769 return ret;
1770}
1771
1772/*
1773 * This should only happen immediately before the mpic is destroyed,
1774 * so we shouldn't need to worry about anything still trying to
1775 * access the vcpu pointer.
1776 */
1777void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
1778{
1779 BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
1780
1781 opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
1782}
1783
1784/*
1785 * Return value:
1786 * < 0 Interrupt was ignored (masked or not delivered for other reasons)
1787 * = 0 Interrupt was coalesced (previous irq is still pending)
1788 * > 0 Number of CPUs interrupt was delivered to
1789 */
1790static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
1791 struct kvm *kvm, int irq_source_id, int level,
1792 bool line_status)
1793{
1794 u32 irq = e->irqchip.pin;
1795 struct openpic *opp = kvm->arch.mpic;
1796 unsigned long flags;
1797
1798 spin_lock_irqsave(&opp->lock, flags);
1799 openpic_set_irq(opp, irq, level);
1800 spin_unlock_irqrestore(&opp->lock, flags);
1801
 1802 /* None of the code paths we care about checks the return value */
1803 return 0;
1804}
1805
1806int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
1807 struct kvm *kvm, int irq_source_id, int level, bool line_status)
1808{
1809 struct openpic *opp = kvm->arch.mpic;
1810 unsigned long flags;
1811
1812 spin_lock_irqsave(&opp->lock, flags);
1813
1814 /*
1815 * XXX We ignore the target address for now, as we only support
1816 * a single MSI bank.
1817 */
1818 openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
1819 spin_unlock_irqrestore(&opp->lock, flags);
1820
 1821 /* None of the code paths we care about checks the return value */
1822 return 0;
1823}
1824
1825int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
1826 struct kvm_kernel_irq_routing_entry *e,
1827 const struct kvm_irq_routing_entry *ue)
1828{
1829 int r = -EINVAL;
1830
1831 switch (ue->type) {
1832 case KVM_IRQ_ROUTING_IRQCHIP:
1833 e->set = mpic_set_irq;
1834 e->irqchip.irqchip = ue->u.irqchip.irqchip;
1835 e->irqchip.pin = ue->u.irqchip.pin;
1836 if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
1837 goto out;
1838 rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
1839 break;
1840 case KVM_IRQ_ROUTING_MSI:
1841 e->set = kvm_set_msi;
1842 e->msi.address_lo = ue->u.msi.address_lo;
1843 e->msi.address_hi = ue->u.msi.address_hi;
1844 e->msi.data = ue->u.msi.data;
1845 break;
1846 default:
1847 goto out;
1848 }
1849
1850 r = 0;
1851out:
1852 return r;
1853}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 934413cd3a1b..6316ee336e88 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -25,6 +25,7 @@
25#include <linux/hrtimer.h> 25#include <linux/hrtimer.h>
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/file.h>
28#include <asm/cputable.h> 29#include <asm/cputable.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/kvm_ppc.h> 31#include <asm/kvm_ppc.h>
@@ -32,6 +33,7 @@
32#include <asm/cputhreads.h> 33#include <asm/cputhreads.h>
33#include <asm/irqflags.h> 34#include <asm/irqflags.h>
34#include "timing.h" 35#include "timing.h"
36#include "irq.h"
35#include "../mm/mmu_decl.h" 37#include "../mm/mmu_decl.h"
36 38
37#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
@@ -317,6 +319,7 @@ int kvm_dev_ioctl_check_extension(long ext)
317 case KVM_CAP_ENABLE_CAP: 319 case KVM_CAP_ENABLE_CAP:
318 case KVM_CAP_ONE_REG: 320 case KVM_CAP_ONE_REG:
319 case KVM_CAP_IOEVENTFD: 321 case KVM_CAP_IOEVENTFD:
322 case KVM_CAP_DEVICE_CTRL:
320 r = 1; 323 r = 1;
321 break; 324 break;
322#ifndef CONFIG_KVM_BOOK3S_64_HV 325#ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -326,6 +329,9 @@ int kvm_dev_ioctl_check_extension(long ext)
326#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 329#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
327 case KVM_CAP_SW_TLB: 330 case KVM_CAP_SW_TLB:
328#endif 331#endif
332#ifdef CONFIG_KVM_MPIC
333 case KVM_CAP_IRQ_MPIC:
334#endif
329 r = 1; 335 r = 1;
330 break; 336 break;
331 case KVM_CAP_COALESCED_MMIO: 337 case KVM_CAP_COALESCED_MMIO:
@@ -335,6 +341,10 @@ int kvm_dev_ioctl_check_extension(long ext)
335#ifdef CONFIG_PPC_BOOK3S_64 341#ifdef CONFIG_PPC_BOOK3S_64
336 case KVM_CAP_SPAPR_TCE: 342 case KVM_CAP_SPAPR_TCE:
337 case KVM_CAP_PPC_ALLOC_HTAB: 343 case KVM_CAP_PPC_ALLOC_HTAB:
344 case KVM_CAP_PPC_RTAS:
345#ifdef CONFIG_KVM_XICS
346 case KVM_CAP_IRQ_XICS:
347#endif
338 r = 1; 348 r = 1;
339 break; 349 break;
340#endif /* CONFIG_PPC_BOOK3S_64 */ 350#endif /* CONFIG_PPC_BOOK3S_64 */
@@ -411,18 +421,17 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
411} 421}
412 422
413int kvm_arch_prepare_memory_region(struct kvm *kvm, 423int kvm_arch_prepare_memory_region(struct kvm *kvm,
414 struct kvm_memory_slot *memslot, 424 struct kvm_memory_slot *memslot,
415 struct kvm_memory_slot old, 425 struct kvm_userspace_memory_region *mem,
416 struct kvm_userspace_memory_region *mem, 426 enum kvm_mr_change change)
417 bool user_alloc)
418{ 427{
419 return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 428 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
420} 429}
421 430
422void kvm_arch_commit_memory_region(struct kvm *kvm, 431void kvm_arch_commit_memory_region(struct kvm *kvm,
423 struct kvm_userspace_memory_region *mem, 432 struct kvm_userspace_memory_region *mem,
424 struct kvm_memory_slot old, 433 const struct kvm_memory_slot *old,
425 bool user_alloc) 434 enum kvm_mr_change change)
426{ 435{
427 kvmppc_core_commit_memory_region(kvm, mem, old); 436 kvmppc_core_commit_memory_region(kvm, mem, old);
428} 437}
@@ -460,6 +469,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
460 tasklet_kill(&vcpu->arch.tasklet); 469 tasklet_kill(&vcpu->arch.tasklet);
461 470
462 kvmppc_remove_vcpu_debugfs(vcpu); 471 kvmppc_remove_vcpu_debugfs(vcpu);
472
473 switch (vcpu->arch.irq_type) {
474 case KVMPPC_IRQ_MPIC:
475 kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
476 break;
477 case KVMPPC_IRQ_XICS:
478 kvmppc_xics_free_icp(vcpu);
479 break;
480 }
481
463 kvmppc_core_vcpu_free(vcpu); 482 kvmppc_core_vcpu_free(vcpu);
464} 483}
465 484
@@ -532,12 +551,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
532#endif 551#endif
533} 552}
534 553
535int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
536 struct kvm_guest_debug *dbg)
537{
538 return -EINVAL;
539}
540
541static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu, 554static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
542 struct kvm_run *run) 555 struct kvm_run *run)
543{ 556{
@@ -612,6 +625,8 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
612int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, 625int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
613 unsigned int rt, unsigned int bytes, int is_bigendian) 626 unsigned int rt, unsigned int bytes, int is_bigendian)
614{ 627{
628 int idx, ret;
629
615 if (bytes > sizeof(run->mmio.data)) { 630 if (bytes > sizeof(run->mmio.data)) {
616 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, 631 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
617 run->mmio.len); 632 run->mmio.len);
@@ -627,8 +642,14 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
627 vcpu->mmio_is_write = 0; 642 vcpu->mmio_is_write = 0;
628 vcpu->arch.mmio_sign_extend = 0; 643 vcpu->arch.mmio_sign_extend = 0;
629 644
630 if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 645 idx = srcu_read_lock(&vcpu->kvm->srcu);
631 bytes, &run->mmio.data)) { 646
647 ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
648 bytes, &run->mmio.data);
649
650 srcu_read_unlock(&vcpu->kvm->srcu, idx);
651
652 if (!ret) {
632 kvmppc_complete_mmio_load(vcpu, run); 653 kvmppc_complete_mmio_load(vcpu, run);
633 vcpu->mmio_needed = 0; 654 vcpu->mmio_needed = 0;
634 return EMULATE_DONE; 655 return EMULATE_DONE;
@@ -653,6 +674,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
653 u64 val, unsigned int bytes, int is_bigendian) 674 u64 val, unsigned int bytes, int is_bigendian)
654{ 675{
655 void *data = run->mmio.data; 676 void *data = run->mmio.data;
677 int idx, ret;
656 678
657 if (bytes > sizeof(run->mmio.data)) { 679 if (bytes > sizeof(run->mmio.data)) {
658 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, 680 printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
@@ -682,9 +704,14 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
682 } 704 }
683 } 705 }
684 706
685 if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 707 idx = srcu_read_lock(&vcpu->kvm->srcu);
686 bytes, &run->mmio.data)) { 708
687 kvmppc_complete_mmio_load(vcpu, run); 709 ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
710 bytes, &run->mmio.data);
711
712 srcu_read_unlock(&vcpu->kvm->srcu, idx);
713
714 if (!ret) {
688 vcpu->mmio_needed = 0; 715 vcpu->mmio_needed = 0;
689 return EMULATE_DONE; 716 return EMULATE_DONE;
690 } 717 }
@@ -740,7 +767,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
740int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 767int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
741{ 768{
742 if (irq->irq == KVM_INTERRUPT_UNSET) { 769 if (irq->irq == KVM_INTERRUPT_UNSET) {
743 kvmppc_core_dequeue_external(vcpu, irq); 770 kvmppc_core_dequeue_external(vcpu);
744 return 0; 771 return 0;
745 } 772 }
746 773
@@ -770,7 +797,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
770 break; 797 break;
771 case KVM_CAP_PPC_EPR: 798 case KVM_CAP_PPC_EPR:
772 r = 0; 799 r = 0;
773 vcpu->arch.epr_enabled = cap->args[0]; 800 if (cap->args[0])
801 vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
802 else
803 vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
774 break; 804 break;
775#ifdef CONFIG_BOOKE 805#ifdef CONFIG_BOOKE
776 case KVM_CAP_PPC_BOOKE_WATCHDOG: 806 case KVM_CAP_PPC_BOOKE_WATCHDOG:
@@ -791,6 +821,44 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
791 break; 821 break;
792 } 822 }
793#endif 823#endif
824#ifdef CONFIG_KVM_MPIC
825 case KVM_CAP_IRQ_MPIC: {
826 struct file *filp;
827 struct kvm_device *dev;
828
829 r = -EBADF;
830 filp = fget(cap->args[0]);
831 if (!filp)
832 break;
833
834 r = -EPERM;
835 dev = kvm_device_from_filp(filp);
836 if (dev)
837 r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
838
839 fput(filp);
840 break;
841 }
842#endif
843#ifdef CONFIG_KVM_XICS
844 case KVM_CAP_IRQ_XICS: {
845 struct file *filp;
846 struct kvm_device *dev;
847
848 r = -EBADF;
849 filp = fget(cap->args[0]);
850 if (!filp)
851 break;
852
853 r = -EPERM;
854 dev = kvm_device_from_filp(filp);
855 if (dev)
856 r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
857
858 fput(filp);
859 break;
860 }
861#endif /* CONFIG_KVM_XICS */
794 default: 862 default:
795 r = -EINVAL; 863 r = -EINVAL;
796 break; 864 break;
@@ -913,9 +981,22 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
913 return 0; 981 return 0;
914} 982}
915 983
984int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
985 bool line_status)
986{
987 if (!irqchip_in_kernel(kvm))
988 return -ENXIO;
989
990 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
991 irq_event->irq, irq_event->level,
992 line_status);
993 return 0;
994}
995
916long kvm_arch_vm_ioctl(struct file *filp, 996long kvm_arch_vm_ioctl(struct file *filp,
917 unsigned int ioctl, unsigned long arg) 997 unsigned int ioctl, unsigned long arg)
918{ 998{
999 struct kvm *kvm __maybe_unused = filp->private_data;
919 void __user *argp = (void __user *)arg; 1000 void __user *argp = (void __user *)arg;
920 long r; 1001 long r;
921 1002
@@ -934,7 +1015,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
934#ifdef CONFIG_PPC_BOOK3S_64 1015#ifdef CONFIG_PPC_BOOK3S_64
935 case KVM_CREATE_SPAPR_TCE: { 1016 case KVM_CREATE_SPAPR_TCE: {
936 struct kvm_create_spapr_tce create_tce; 1017 struct kvm_create_spapr_tce create_tce;
937 struct kvm *kvm = filp->private_data;
938 1018
939 r = -EFAULT; 1019 r = -EFAULT;
940 if (copy_from_user(&create_tce, argp, sizeof(create_tce))) 1020 if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
@@ -946,8 +1026,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
946 1026
947#ifdef CONFIG_KVM_BOOK3S_64_HV 1027#ifdef CONFIG_KVM_BOOK3S_64_HV
948 case KVM_ALLOCATE_RMA: { 1028 case KVM_ALLOCATE_RMA: {
949 struct kvm *kvm = filp->private_data;
950 struct kvm_allocate_rma rma; 1029 struct kvm_allocate_rma rma;
1030 struct kvm *kvm = filp->private_data;
951 1031
952 r = kvm_vm_ioctl_allocate_rma(kvm, &rma); 1032 r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
953 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) 1033 if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
@@ -956,7 +1036,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
956 } 1036 }
957 1037
958 case KVM_PPC_ALLOCATE_HTAB: { 1038 case KVM_PPC_ALLOCATE_HTAB: {
959 struct kvm *kvm = filp->private_data;
960 u32 htab_order; 1039 u32 htab_order;
961 1040
962 r = -EFAULT; 1041 r = -EFAULT;
@@ -973,7 +1052,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
973 } 1052 }
974 1053
975 case KVM_PPC_GET_HTAB_FD: { 1054 case KVM_PPC_GET_HTAB_FD: {
976 struct kvm *kvm = filp->private_data;
977 struct kvm_get_htab_fd ghf; 1055 struct kvm_get_htab_fd ghf;
978 1056
979 r = -EFAULT; 1057 r = -EFAULT;
@@ -986,7 +1064,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
986 1064
987#ifdef CONFIG_PPC_BOOK3S_64 1065#ifdef CONFIG_PPC_BOOK3S_64
988 case KVM_PPC_GET_SMMU_INFO: { 1066 case KVM_PPC_GET_SMMU_INFO: {
989 struct kvm *kvm = filp->private_data;
990 struct kvm_ppc_smmu_info info; 1067 struct kvm_ppc_smmu_info info;
991 1068
992 memset(&info, 0, sizeof(info)); 1069 memset(&info, 0, sizeof(info));
@@ -995,6 +1072,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
995 r = -EFAULT; 1072 r = -EFAULT;
996 break; 1073 break;
997 } 1074 }
1075 case KVM_PPC_RTAS_DEFINE_TOKEN: {
1076 struct kvm *kvm = filp->private_data;
1077
1078 r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
1079 break;
1080 }
998#endif /* CONFIG_PPC_BOOK3S_64 */ 1081#endif /* CONFIG_PPC_BOOK3S_64 */
999 default: 1082 default:
1000 r = -ENOTTY; 1083 r = -ENOTTY;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 89db29d17c25..7cd728b3b5e4 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -51,6 +51,12 @@ static struct icp_ipl __iomem *icp_native_regs[NR_CPUS];
51static inline unsigned int icp_native_get_xirr(void) 51static inline unsigned int icp_native_get_xirr(void)
52{ 52{
53 int cpu = smp_processor_id(); 53 int cpu = smp_processor_id();
54 unsigned int xirr;
55
56 /* Handled an interrupt latched by KVM */
57 xirr = kvmppc_get_xics_latch();
58 if (xirr)
59 return xirr;
54 60
55 return in_be32(&icp_native_regs[cpu]->xirr.word); 61 return in_be32(&icp_native_regs[cpu]->xirr.word);
56} 62}
@@ -138,6 +144,7 @@ static unsigned int icp_native_get_irq(void)
138 144
139static void icp_native_cause_ipi(int cpu, unsigned long data) 145static void icp_native_cause_ipi(int cpu, unsigned long data)
140{ 146{
147 kvmppc_set_host_ipi(cpu, 1);
141 icp_native_set_qirr(cpu, IPI_PRIORITY); 148 icp_native_set_qirr(cpu, IPI_PRIORITY);
142} 149}
143 150
@@ -151,6 +158,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
151{ 158{
152 int cpu = smp_processor_id(); 159 int cpu = smp_processor_id();
153 160
161 kvmppc_set_host_ipi(cpu, 0);
154 icp_native_set_qirr(cpu, 0xff); 162 icp_native_set_qirr(cpu, 0xff);
155 163
156 return smp_ipi_demux(); 164 return smp_ipi_demux();
diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild
index 7bf68fff7c5d..9ccd1905bdad 100644
--- a/arch/s390/include/uapi/asm/Kbuild
+++ b/arch/s390/include/uapi/asm/Kbuild
@@ -44,5 +44,6 @@ header-y += termios.h
44header-y += types.h 44header-y += types.h
45header-y += ucontext.h 45header-y += ucontext.h
46header-y += unistd.h 46header-y += unistd.h
47header-y += virtio-ccw.h
47header-y += vtoc.h 48header-y += vtoc.h
48header-y += zcrypt.h 49header-y += zcrypt.h
diff --git a/arch/s390/include/uapi/asm/virtio-ccw.h b/arch/s390/include/uapi/asm/virtio-ccw.h
new file mode 100644
index 000000000000..a9a4ebf79fa7
--- /dev/null
+++ b/arch/s390/include/uapi/asm/virtio-ccw.h
@@ -0,0 +1,21 @@
1/*
2 * Definitions for virtio-ccw devices.
3 *
4 * Copyright IBM Corp. 2013
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only)
8 * as published by the Free Software Foundation.
9 *
10 * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
11 */
12#ifndef __KVM_VIRTIO_CCW_H
13#define __KVM_VIRTIO_CCW_H
14
15/* Alignment of vring buffers. */
16#define KVM_VIRTIO_CCW_RING_ALIGN 4096
17
18/* Subcode for diagnose 500 (virtio hypercall). */
19#define KVM_S390_VIRTIO_CCW_NOTIFY 3
20
21#endif
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 60f9f8ae0fc8..70b46eacf8e1 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM
22 select PREEMPT_NOTIFIERS 22 select PREEMPT_NOTIFIERS
23 select ANON_INODES 23 select ANON_INODES
24 select HAVE_KVM_CPU_RELAX_INTERCEPT 24 select HAVE_KVM_CPU_RELAX_INTERCEPT
25 select HAVE_KVM_EVENTFD
25 ---help--- 26 ---help---
26 Support hosting paravirtualized guest machines using the SIE 27 Support hosting paravirtualized guest machines using the SIE
27 virtualization capability on the mainframe. This should work 28 virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 3975722bb19d..8fe9d65a4585 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -6,7 +6,7 @@
6# it under the terms of the GNU General Public License (version 2 only) 6# it under the terms of the GNU General Public License (version 2 only)
7# as published by the Free Software Foundation. 7# as published by the Free Software Foundation.
8 8
9common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o) 9common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o)
10 10
11ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 11ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
12 12
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index a390687feb13..1c01a9912989 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/kvm.h> 14#include <linux/kvm.h>
15#include <linux/kvm_host.h> 15#include <linux/kvm_host.h>
16#include <asm/virtio-ccw.h>
16#include "kvm-s390.h" 17#include "kvm-s390.h"
17#include "trace.h" 18#include "trace.h"
18#include "trace-s390.h" 19#include "trace-s390.h"
@@ -104,6 +105,29 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
104 return -EREMOTE; 105 return -EREMOTE;
105} 106}
106 107
108static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
109{
110 int ret, idx;
111
112 /* No virtio-ccw notification? Get out quickly. */
113 if (!vcpu->kvm->arch.css_support ||
114 (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
115 return -EOPNOTSUPP;
116
117 idx = srcu_read_lock(&vcpu->kvm->srcu);
118 /*
119 * The layout is as follows:
120 * - gpr 2 contains the subchannel id (passed as addr)
121 * - gpr 3 contains the virtqueue index (passed as datamatch)
122 */
123 ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
124 vcpu->run->s.regs.gprs[2],
125 8, &vcpu->run->s.regs.gprs[3]);
126 srcu_read_unlock(&vcpu->kvm->srcu, idx);
127 /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */
128 return ret < 0 ? ret : 0;
129}
130
107int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) 131int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
108{ 132{
109 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16; 133 int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
@@ -118,6 +142,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
118 return __diag_time_slice_end_directed(vcpu); 142 return __diag_time_slice_end_directed(vcpu);
119 case 0x308: 143 case 0x308:
120 return __diag_ipl_functions(vcpu); 144 return __diag_ipl_functions(vcpu);
145 case 0x500:
146 return __diag_virtio_hypercall(vcpu);
121 default: 147 default:
122 return -EOPNOTSUPP; 148 return -EOPNOTSUPP;
123 } 149 }
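
The register layout documented in __diag_virtio_hypercall() -- subcode in gpr 1, subchannel id in gpr 2 (used as the kvm_io_bus address), virtqueue index in gpr 3 (used as the ioeventfd datamatch) -- is what a virtio-ccw guest driver has to set up before issuing diagnose 0x500. A hedged guest-side sketch, not part of this patch; the exact inline-assembly constraints are an assumption based on that comment:

/* Hypothetical guest-side notify helper for the ABI described above. */
static inline long demo_virtio_ccw_notify(unsigned long schid,
					  unsigned long queue_index)
{
	register unsigned long nr  asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY;
	register unsigned long sch asm("2") = schid;		/* kvm_io_bus addr */
	register unsigned long idx asm("3") = queue_index;	/* ioeventfd datamatch */
	register long rc asm("2");

	asm volatile ("diag 2,4,0x500\n"
		      : "=d" (rc)
		      : "d" (nr), "d" (sch), "d" (idx)
		      : "memory", "cc");
	return rc;
}
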
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 4703f129e95e..302e0e52b009 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -18,369 +18,86 @@
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19#include "kvm-s390.h" 19#include "kvm-s390.h"
20 20
21static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, 21static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu,
22 unsigned long guestaddr) 22 void __user *gptr,
23 int prefixing)
23{ 24{
24 unsigned long prefix = vcpu->arch.sie_block->prefix; 25 unsigned long prefix = vcpu->arch.sie_block->prefix;
25 26 unsigned long gaddr = (unsigned long) gptr;
26 if (guestaddr < 2 * PAGE_SIZE) 27 unsigned long uaddr;
27 guestaddr += prefix; 28
28 else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE)) 29 if (prefixing) {
29 guestaddr -= prefix; 30 if (gaddr < 2 * PAGE_SIZE)
30 31 gaddr += prefix;
31 return (void __user *) gmap_fault(guestaddr, vcpu->arch.gmap); 32 else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE))
32} 33 gaddr -= prefix;
33
34static inline int get_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr,
35 u64 *result)
36{
37 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
38
39 BUG_ON(guestaddr & 7);
40
41 if (IS_ERR((void __force *) uptr))
42 return PTR_ERR((void __force *) uptr);
43
44 return get_user(*result, (unsigned long __user *) uptr);
45}
46
47static inline int get_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr,
48 u32 *result)
49{
50 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
51
52 BUG_ON(guestaddr & 3);
53
54 if (IS_ERR((void __force *) uptr))
55 return PTR_ERR((void __force *) uptr);
56
57 return get_user(*result, (u32 __user *) uptr);
58}
59
60static inline int get_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr,
61 u16 *result)
62{
63 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
64
65 BUG_ON(guestaddr & 1);
66
67 if (IS_ERR(uptr))
68 return PTR_ERR(uptr);
69
70 return get_user(*result, (u16 __user *) uptr);
71}
72
73static inline int get_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr,
74 u8 *result)
75{
76 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
77
78 if (IS_ERR((void __force *) uptr))
79 return PTR_ERR((void __force *) uptr);
80
81 return get_user(*result, (u8 __user *) uptr);
82}
83
84static inline int put_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr,
85 u64 value)
86{
87 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
88
89 BUG_ON(guestaddr & 7);
90
91 if (IS_ERR((void __force *) uptr))
92 return PTR_ERR((void __force *) uptr);
93
94 return put_user(value, (u64 __user *) uptr);
95}
96
97static inline int put_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr,
98 u32 value)
99{
100 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
101
102 BUG_ON(guestaddr & 3);
103
104 if (IS_ERR((void __force *) uptr))
105 return PTR_ERR((void __force *) uptr);
106
107 return put_user(value, (u32 __user *) uptr);
108}
109
110static inline int put_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr,
111 u16 value)
112{
113 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
114
115 BUG_ON(guestaddr & 1);
116
117 if (IS_ERR((void __force *) uptr))
118 return PTR_ERR((void __force *) uptr);
119
120 return put_user(value, (u16 __user *) uptr);
121}
122
123static inline int put_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr,
124 u8 value)
125{
126 void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
127
128 if (IS_ERR((void __force *) uptr))
129 return PTR_ERR((void __force *) uptr);
130
131 return put_user(value, (u8 __user *) uptr);
132}
133
134
135static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu,
136 unsigned long guestdest,
137 void *from, unsigned long n)
138{
139 int rc;
140 unsigned long i;
141 u8 *data = from;
142
143 for (i = 0; i < n; i++) {
144 rc = put_guest_u8(vcpu, guestdest++, *(data++));
145 if (rc < 0)
146 return rc;
147 } 34 }
148 return 0; 35 uaddr = gmap_fault(gaddr, vcpu->arch.gmap);
149} 36 if (IS_ERR_VALUE(uaddr))
150 37 uaddr = -EFAULT;
151static inline int __copy_to_guest_fast(struct kvm_vcpu *vcpu, 38 return (void __user *)uaddr;
152 unsigned long guestdest, 39}
153 void *from, unsigned long n) 40
154{ 41#define get_guest(vcpu, x, gptr) \
155 int r; 42({ \
43 __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
44 int __mask = sizeof(__typeof__(*(gptr))) - 1; \
45 int __ret = PTR_RET((void __force *)__uptr); \
46 \
47 if (!__ret) { \
48 BUG_ON((unsigned long)__uptr & __mask); \
49 __ret = get_user(x, __uptr); \
50 } \
51 __ret; \
52})
53
54#define put_guest(vcpu, x, gptr) \
55({ \
56 __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
57 int __mask = sizeof(__typeof__(*(gptr))) - 1; \
58 int __ret = PTR_RET((void __force *)__uptr); \
59 \
60 if (!__ret) { \
61 BUG_ON((unsigned long)__uptr & __mask); \
62 __ret = put_user(x, __uptr); \
63 } \
64 __ret; \
65})
66
67static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to,
68 unsigned long from, unsigned long len,
69 int to_guest, int prefixing)
70{
71 unsigned long _len, rc;
156 void __user *uptr; 72 void __user *uptr;
157 unsigned long size;
158
159 if (guestdest + n < guestdest)
160 return -EFAULT;
161
162 /* simple case: all within one segment table entry? */
163 if ((guestdest & PMD_MASK) == ((guestdest+n) & PMD_MASK)) {
164 uptr = (void __user *) gmap_fault(guestdest, vcpu->arch.gmap);
165
166 if (IS_ERR((void __force *) uptr))
167 return PTR_ERR((void __force *) uptr);
168
169 r = copy_to_user(uptr, from, n);
170
171 if (r)
172 r = -EFAULT;
173
174 goto out;
175 }
176
177 /* copy first segment */
178 uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap);
179
180 if (IS_ERR((void __force *) uptr))
181 return PTR_ERR((void __force *) uptr);
182 73
183 size = PMD_SIZE - (guestdest & ~PMD_MASK); 74 while (len) {
184 75 uptr = to_guest ? (void __user *)to : (void __user *)from;
185 r = copy_to_user(uptr, from, size); 76 uptr = __gptr_to_uptr(vcpu, uptr, prefixing);
186 77 if (IS_ERR((void __force *)uptr))
187 if (r) { 78 return -EFAULT;
188 r = -EFAULT; 79 _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1));
189 goto out; 80 _len = min(_len, len);
190 } 81 if (to_guest)
191 from += size; 82 rc = copy_to_user((void __user *) uptr, (void *)from, _len);
192 n -= size; 83 else
193 guestdest += size; 84 rc = copy_from_user((void *)to, (void __user *)uptr, _len);
194 85 if (rc)
195 /* copy full segments */ 86 return -EFAULT;
196 while (n >= PMD_SIZE) { 87 len -= _len;
197 uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap); 88 from += _len;
198 89 to += _len;
199 if (IS_ERR((void __force *) uptr))
200 return PTR_ERR((void __force *) uptr);
201
202 r = copy_to_user(uptr, from, PMD_SIZE);
203
204 if (r) {
205 r = -EFAULT;
206 goto out;
207 }
208 from += PMD_SIZE;
209 n -= PMD_SIZE;
210 guestdest += PMD_SIZE;
211 }
212
213 /* copy the tail segment */
214 if (n) {
215 uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap);
216
217 if (IS_ERR((void __force *) uptr))
218 return PTR_ERR((void __force *) uptr);
219
220 r = copy_to_user(uptr, from, n);
221
222 if (r)
223 r = -EFAULT;
224 }
225out:
226 return r;
227}
228
229static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu,
230 unsigned long guestdest,
231 void *from, unsigned long n)
232{
233 return __copy_to_guest_fast(vcpu, guestdest, from, n);
234}
235
236static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest,
237 void *from, unsigned long n)
238{
239 unsigned long prefix = vcpu->arch.sie_block->prefix;
240
241 if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE))
242 goto slowpath;
243
244 if ((guestdest < prefix) && (guestdest + n > prefix))
245 goto slowpath;
246
247 if ((guestdest < prefix + 2 * PAGE_SIZE)
248 && (guestdest + n > prefix + 2 * PAGE_SIZE))
249 goto slowpath;
250
251 if (guestdest < 2 * PAGE_SIZE)
252 guestdest += prefix;
253 else if ((guestdest >= prefix) && (guestdest < prefix + 2 * PAGE_SIZE))
254 guestdest -= prefix;
255
256 return __copy_to_guest_fast(vcpu, guestdest, from, n);
257slowpath:
258 return __copy_to_guest_slow(vcpu, guestdest, from, n);
259}
260
261static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to,
262 unsigned long guestsrc,
263 unsigned long n)
264{
265 int rc;
266 unsigned long i;
267 u8 *data = to;
268
269 for (i = 0; i < n; i++) {
270 rc = get_guest_u8(vcpu, guestsrc++, data++);
271 if (rc < 0)
272 return rc;
273 } 90 }
274 return 0; 91 return 0;
275} 92}
276 93
277static inline int __copy_from_guest_fast(struct kvm_vcpu *vcpu, void *to, 94#define copy_to_guest(vcpu, to, from, size) \
278 unsigned long guestsrc, 95 __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1)
279 unsigned long n) 96#define copy_from_guest(vcpu, to, from, size) \
280{ 97 __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1)
281 int r; 98#define copy_to_guest_absolute(vcpu, to, from, size) \
282 void __user *uptr; 99 __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0)
283 unsigned long size; 100#define copy_from_guest_absolute(vcpu, to, from, size) \
284 101 __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0)
285 if (guestsrc + n < guestsrc)
286 return -EFAULT;
287
288 /* simple case: all within one segment table entry? */
289 if ((guestsrc & PMD_MASK) == ((guestsrc+n) & PMD_MASK)) {
290 uptr = (void __user *) gmap_fault(guestsrc, vcpu->arch.gmap);
291
292 if (IS_ERR((void __force *) uptr))
293 return PTR_ERR((void __force *) uptr);
294
295 r = copy_from_user(to, uptr, n);
296
297 if (r)
298 r = -EFAULT;
299
300 goto out;
301 }
302
303 /* copy first segment */
304 uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
305
306 if (IS_ERR((void __force *) uptr))
307 return PTR_ERR((void __force *) uptr);
308
309 size = PMD_SIZE - (guestsrc & ~PMD_MASK);
310
311 r = copy_from_user(to, uptr, size);
312
313 if (r) {
314 r = -EFAULT;
315 goto out;
316 }
317 to += size;
318 n -= size;
319 guestsrc += size;
320
321 /* copy full segments */
322 while (n >= PMD_SIZE) {
323 uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
324
325 if (IS_ERR((void __force *) uptr))
326 return PTR_ERR((void __force *) uptr);
327
328 r = copy_from_user(to, uptr, PMD_SIZE);
329
330 if (r) {
331 r = -EFAULT;
332 goto out;
333 }
334 to += PMD_SIZE;
335 n -= PMD_SIZE;
336 guestsrc += PMD_SIZE;
337 }
338
339 /* copy the tail segment */
340 if (n) {
341 uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
342
343 if (IS_ERR((void __force *) uptr))
344 return PTR_ERR((void __force *) uptr);
345
346 r = copy_from_user(to, uptr, n);
347
348 if (r)
349 r = -EFAULT;
350 }
351out:
352 return r;
353}
354
355static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
356 unsigned long guestsrc,
357 unsigned long n)
358{
359 return __copy_from_guest_fast(vcpu, to, guestsrc, n);
360}
361
362static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
363 unsigned long guestsrc, unsigned long n)
364{
365 unsigned long prefix = vcpu->arch.sie_block->prefix;
366
367 if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE))
368 goto slowpath;
369 102
370 if ((guestsrc < prefix) && (guestsrc + n > prefix)) 103#endif /* __KVM_S390_GACCESS_H */
371 goto slowpath;
372
373 if ((guestsrc < prefix + 2 * PAGE_SIZE)
374 && (guestsrc + n > prefix + 2 * PAGE_SIZE))
375 goto slowpath;
376
377 if (guestsrc < 2 * PAGE_SIZE)
378 guestsrc += prefix;
379 else if ((guestsrc >= prefix) && (guestsrc < prefix + 2 * PAGE_SIZE))
380 guestsrc -= prefix;
381
382 return __copy_from_guest_fast(vcpu, to, guestsrc, n);
383slowpath:
384 return __copy_from_guest_slow(vcpu, to, guestsrc, n);
385}
386#endif
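
The rewritten accessors derive the access width, the alignment check, and the translated user-space address (including lowcore prefixing) from the type of the guest pointer, so callers no longer pick a _u16/_u32/_u64 variant by hand. A minimal usage sketch; the handler itself is hypothetical:

/* Hypothetical handler fragment showing the typed accessors in use. */
static int demo_guest_access(struct kvm_vcpu *vcpu, unsigned long gaddr)
{
	u32 val;

	/* sizeof(*(u32 __user *)...) == 4: a 4-byte, 4-byte-aligned access. */
	if (get_guest(vcpu, val, (u32 __user *) gaddr))
		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

	/* Same idea for stores; here a 2-byte store of the low half. */
	if (put_guest(vcpu, (u16) val, (u16 __user *) (gaddr + 4)))
		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);

	return 0;
}
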
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index f26ff1e31bdb..b7d1b2edeeb3 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -43,12 +43,10 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); 43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
44 44
45 do { 45 do {
46 rc = get_guest_u64(vcpu, useraddr, 46 rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
47 &vcpu->arch.sie_block->gcr[reg]); 47 (u64 __user *) useraddr);
48 if (rc == -EFAULT) { 48 if (rc)
49 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 49 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
50 break;
51 }
52 useraddr += 8; 50 useraddr += 8;
53 if (reg == reg3) 51 if (reg == reg3)
54 break; 52 break;
@@ -78,11 +76,9 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
78 76
79 reg = reg1; 77 reg = reg1;
80 do { 78 do {
81 rc = get_guest_u32(vcpu, useraddr, &val); 79 rc = get_guest(vcpu, val, (u32 __user *) useraddr);
82 if (rc == -EFAULT) { 80 if (rc)
83 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 81 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
84 break;
85 }
86 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul; 82 vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
87 vcpu->arch.sie_block->gcr[reg] |= val; 83 vcpu->arch.sie_block->gcr[reg] |= val;
88 useraddr += 4; 84 useraddr += 4;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 37116a77cb4b..5c948177529e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -180,7 +180,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
180 struct kvm_s390_interrupt_info *inti) 180 struct kvm_s390_interrupt_info *inti)
181{ 181{
182 const unsigned short table[] = { 2, 4, 4, 6 }; 182 const unsigned short table[] = { 2, 4, 4, 6 };
183 int rc, exception = 0; 183 int rc = 0;
184 184
185 switch (inti->type) { 185 switch (inti->type) {
186 case KVM_S390_INT_EMERGENCY: 186 case KVM_S390_INT_EMERGENCY:
@@ -188,74 +188,41 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
188 vcpu->stat.deliver_emergency_signal++; 188 vcpu->stat.deliver_emergency_signal++;
189 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 189 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
190 inti->emerg.code, 0); 190 inti->emerg.code, 0);
191 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201); 191 rc = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE);
192 if (rc == -EFAULT) 192 rc |= put_guest(vcpu, inti->emerg.code,
193 exception = 1; 193 (u16 __user *)__LC_EXT_CPU_ADDR);
194 194 rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
195 rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code); 195 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
196 if (rc == -EFAULT) 196 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
197 exception = 1; 197 __LC_EXT_NEW_PSW, sizeof(psw_t));
198
199 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
200 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
201 if (rc == -EFAULT)
202 exception = 1;
203
204 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
205 __LC_EXT_NEW_PSW, sizeof(psw_t));
206 if (rc == -EFAULT)
207 exception = 1;
208 break; 198 break;
209
210 case KVM_S390_INT_EXTERNAL_CALL: 199 case KVM_S390_INT_EXTERNAL_CALL:
211 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); 200 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
212 vcpu->stat.deliver_external_call++; 201 vcpu->stat.deliver_external_call++;
213 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 202 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
214 inti->extcall.code, 0); 203 inti->extcall.code, 0);
215 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202); 204 rc = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE);
216 if (rc == -EFAULT) 205 rc |= put_guest(vcpu, inti->extcall.code,
217 exception = 1; 206 (u16 __user *)__LC_EXT_CPU_ADDR);
218 207 rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
219 rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code); 208 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
220 if (rc == -EFAULT) 209 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
221 exception = 1; 210 __LC_EXT_NEW_PSW, sizeof(psw_t));
222
223 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
224 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
225 if (rc == -EFAULT)
226 exception = 1;
227
228 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
229 __LC_EXT_NEW_PSW, sizeof(psw_t));
230 if (rc == -EFAULT)
231 exception = 1;
232 break; 211 break;
233
234 case KVM_S390_INT_SERVICE: 212 case KVM_S390_INT_SERVICE:
235 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", 213 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
236 inti->ext.ext_params); 214 inti->ext.ext_params);
237 vcpu->stat.deliver_service_signal++; 215 vcpu->stat.deliver_service_signal++;
238 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 216 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
239 inti->ext.ext_params, 0); 217 inti->ext.ext_params, 0);
240 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401); 218 rc = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE);
241 if (rc == -EFAULT) 219 rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
242 exception = 1; 220 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
243 221 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
244 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 222 __LC_EXT_NEW_PSW, sizeof(psw_t));
245 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 223 rc |= put_guest(vcpu, inti->ext.ext_params,
246 if (rc == -EFAULT) 224 (u32 __user *)__LC_EXT_PARAMS);
247 exception = 1;
248
249 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
250 __LC_EXT_NEW_PSW, sizeof(psw_t));
251 if (rc == -EFAULT)
252 exception = 1;
253
254 rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
255 if (rc == -EFAULT)
256 exception = 1;
257 break; 225 break;
258
259 case KVM_S390_INT_VIRTIO: 226 case KVM_S390_INT_VIRTIO:
260 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx", 227 VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
261 inti->ext.ext_params, inti->ext.ext_params2); 228 inti->ext.ext_params, inti->ext.ext_params2);
@@ -263,34 +230,17 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
263 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 230 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
264 inti->ext.ext_params, 231 inti->ext.ext_params,
265 inti->ext.ext_params2); 232 inti->ext.ext_params2);
266 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603); 233 rc = put_guest(vcpu, 0x2603, (u16 __user *)__LC_EXT_INT_CODE);
267 if (rc == -EFAULT) 234 rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR);
268 exception = 1; 235 rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
269 236 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
270 rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00); 237 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
271 if (rc == -EFAULT) 238 __LC_EXT_NEW_PSW, sizeof(psw_t));
272 exception = 1; 239 rc |= put_guest(vcpu, inti->ext.ext_params,
273 240 (u32 __user *)__LC_EXT_PARAMS);
274 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 241 rc |= put_guest(vcpu, inti->ext.ext_params2,
275 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 242 (u64 __user *)__LC_EXT_PARAMS2);
276 if (rc == -EFAULT)
277 exception = 1;
278
279 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
280 __LC_EXT_NEW_PSW, sizeof(psw_t));
281 if (rc == -EFAULT)
282 exception = 1;
283
284 rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
285 if (rc == -EFAULT)
286 exception = 1;
287
288 rc = put_guest_u64(vcpu, __LC_EXT_PARAMS2,
289 inti->ext.ext_params2);
290 if (rc == -EFAULT)
291 exception = 1;
292 break; 243 break;
293
294 case KVM_S390_SIGP_STOP: 244 case KVM_S390_SIGP_STOP:
295 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop"); 245 VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
296 vcpu->stat.deliver_stop_signal++; 246 vcpu->stat.deliver_stop_signal++;
@@ -313,18 +263,14 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
313 vcpu->stat.deliver_restart_signal++; 263 vcpu->stat.deliver_restart_signal++;
314 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 264 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
315 0, 0); 265 0, 0);
316 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, 266 rc = copy_to_guest(vcpu,
317 restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 267 offsetof(struct _lowcore, restart_old_psw),
318 if (rc == -EFAULT) 268 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
319 exception = 1; 269 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
320 270 offsetof(struct _lowcore, restart_psw),
321 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw, 271 sizeof(psw_t));
322 offsetof(struct _lowcore, restart_psw), sizeof(psw_t));
323 if (rc == -EFAULT)
324 exception = 1;
325 atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 272 atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
326 break; 273 break;
327
328 case KVM_S390_PROGRAM_INT: 274 case KVM_S390_PROGRAM_INT:
329 VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", 275 VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
330 inti->pgm.code, 276 inti->pgm.code,
@@ -332,24 +278,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
332 vcpu->stat.deliver_program_int++; 278 vcpu->stat.deliver_program_int++;
333 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 279 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
334 inti->pgm.code, 0); 280 inti->pgm.code, 0);
335 rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code); 281 rc = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE);
336 if (rc == -EFAULT) 282 rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14],
337 exception = 1; 283 (u16 __user *)__LC_PGM_ILC);
338 284 rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
339 rc = put_guest_u16(vcpu, __LC_PGM_ILC, 285 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
340 table[vcpu->arch.sie_block->ipa >> 14]); 286 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
341 if (rc == -EFAULT) 287 __LC_PGM_NEW_PSW, sizeof(psw_t));
342 exception = 1;
343
344 rc = copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
345 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
346 if (rc == -EFAULT)
347 exception = 1;
348
349 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
350 __LC_PGM_NEW_PSW, sizeof(psw_t));
351 if (rc == -EFAULT)
352 exception = 1;
353 break; 288 break;
354 289
355 case KVM_S390_MCHK: 290 case KVM_S390_MCHK:
@@ -358,24 +293,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
358 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 293 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
359 inti->mchk.cr14, 294 inti->mchk.cr14,
360 inti->mchk.mcic); 295 inti->mchk.mcic);
361 rc = kvm_s390_vcpu_store_status(vcpu, 296 rc = kvm_s390_vcpu_store_status(vcpu,
362 KVM_S390_STORE_STATUS_PREFIXED); 297 KVM_S390_STORE_STATUS_PREFIXED);
363 if (rc == -EFAULT) 298 rc |= put_guest(vcpu, inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE);
364 exception = 1; 299 rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
365 300 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
366 rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic); 301 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
367 if (rc == -EFAULT) 302 __LC_MCK_NEW_PSW, sizeof(psw_t));
368 exception = 1;
369
370 rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
371 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
372 if (rc == -EFAULT)
373 exception = 1;
374
375 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
376 __LC_MCK_NEW_PSW, sizeof(psw_t));
377 if (rc == -EFAULT)
378 exception = 1;
379 break; 303 break;
380 304
381 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: 305 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -388,67 +312,44 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
388 vcpu->stat.deliver_io_int++; 312 vcpu->stat.deliver_io_int++;
389 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 313 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
390 param0, param1); 314 param0, param1);
391 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID, 315 rc = put_guest(vcpu, inti->io.subchannel_id,
392 inti->io.subchannel_id); 316 (u16 __user *) __LC_SUBCHANNEL_ID);
393 if (rc == -EFAULT) 317 rc |= put_guest(vcpu, inti->io.subchannel_nr,
394 exception = 1; 318 (u16 __user *) __LC_SUBCHANNEL_NR);
395 319 rc |= put_guest(vcpu, inti->io.io_int_parm,
396 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR, 320 (u32 __user *) __LC_IO_INT_PARM);
397 inti->io.subchannel_nr); 321 rc |= put_guest(vcpu, inti->io.io_int_word,
398 if (rc == -EFAULT) 322 (u32 __user *) __LC_IO_INT_WORD);
399 exception = 1; 323 rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW,
400 324 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
401 rc = put_guest_u32(vcpu, __LC_IO_INT_PARM, 325 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
402 inti->io.io_int_parm); 326 __LC_IO_NEW_PSW, sizeof(psw_t));
403 if (rc == -EFAULT)
404 exception = 1;
405
406 rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
407 inti->io.io_int_word);
408 if (rc == -EFAULT)
409 exception = 1;
410
411 rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
412 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
413 if (rc == -EFAULT)
414 exception = 1;
415
416 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
417 __LC_IO_NEW_PSW, sizeof(psw_t));
418 if (rc == -EFAULT)
419 exception = 1;
420 break; 327 break;
421 } 328 }
422 default: 329 default:
423 BUG(); 330 BUG();
424 } 331 }
425 if (exception) { 332 if (rc) {
426 printk("kvm: The guest lowcore is not mapped during interrupt " 333 printk("kvm: The guest lowcore is not mapped during interrupt "
427 "delivery, killing userspace\n"); 334 "delivery, killing userspace\n");
428 do_exit(SIGKILL); 335 do_exit(SIGKILL);
429 } 336 }
430} 337}
431 338
432static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) 339static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
433{ 340{
434 int rc, exception = 0; 341 int rc;
435 342
436 if (psw_extint_disabled(vcpu)) 343 if (psw_extint_disabled(vcpu))
437 return 0; 344 return 0;
438 if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul)) 345 if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))
439 return 0; 346 return 0;
440 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004); 347 rc = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE);
441 if (rc == -EFAULT) 348 rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
442 exception = 1; 349 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
443 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW, 350 rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
444 &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); 351 __LC_EXT_NEW_PSW, sizeof(psw_t));
445 if (rc == -EFAULT) 352 if (rc) {
446 exception = 1;
447 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
448 __LC_EXT_NEW_PSW, sizeof(psw_t));
449 if (rc == -EFAULT)
450 exception = 1;
451 if (exception) {
452 printk("kvm: The guest lowcore is not mapped during interrupt " 353 printk("kvm: The guest lowcore is not mapped during interrupt "
453 "delivery, killing userspace\n"); 354 "delivery, killing userspace\n");
454 do_exit(SIGKILL); 355 do_exit(SIGKILL);
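
All the delivery paths above now follow the same shape: each lowcore access returns 0 or a negative errno, the results are OR-ed into a single rc, and the fatal "lowcore not mapped" case is handled once at the end instead of after every store. The pattern in isolation (function and parameter names are illustrative):

/* Illustrative only: accumulate guest-access failures, test them once. */
static int demo_deliver(struct kvm_vcpu *vcpu, u16 int_code)
{
	int rc;

	rc  = put_guest(vcpu, int_code, (u16 __user *) __LC_EXT_INT_CODE);
	rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
			    &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
	rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
			      __LC_EXT_NEW_PSW, sizeof(psw_t));

	/* Any failure leaves rc non-zero (the OR of negative errnos). */
	return rc ? -EFAULT : 0;
}
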
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4cf35a0a79e7..c1c7c683fa26 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -142,12 +142,16 @@ int kvm_dev_ioctl_check_extension(long ext)
142 case KVM_CAP_ONE_REG: 142 case KVM_CAP_ONE_REG:
143 case KVM_CAP_ENABLE_CAP: 143 case KVM_CAP_ENABLE_CAP:
144 case KVM_CAP_S390_CSS_SUPPORT: 144 case KVM_CAP_S390_CSS_SUPPORT:
145 case KVM_CAP_IOEVENTFD:
145 r = 1; 146 r = 1;
146 break; 147 break;
147 case KVM_CAP_NR_VCPUS: 148 case KVM_CAP_NR_VCPUS:
148 case KVM_CAP_MAX_VCPUS: 149 case KVM_CAP_MAX_VCPUS:
149 r = KVM_MAX_VCPUS; 150 r = KVM_MAX_VCPUS;
150 break; 151 break;
152 case KVM_CAP_NR_MEMSLOTS:
153 r = KVM_USER_MEM_SLOTS;
154 break;
151 case KVM_CAP_S390_COW: 155 case KVM_CAP_S390_COW:
152 r = MACHINE_HAS_ESOP; 156 r = MACHINE_HAS_ESOP;
153 break; 157 break;
@@ -632,8 +636,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
632 } else { 636 } else {
633 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction"); 637 VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
634 trace_kvm_s390_sie_fault(vcpu); 638 trace_kvm_s390_sie_fault(vcpu);
635 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 639 rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
636 rc = 0;
637 } 640 }
638 } 641 }
639 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d", 642 VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
@@ -974,22 +977,13 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
974/* Section: memory related */ 977/* Section: memory related */
975int kvm_arch_prepare_memory_region(struct kvm *kvm, 978int kvm_arch_prepare_memory_region(struct kvm *kvm,
976 struct kvm_memory_slot *memslot, 979 struct kvm_memory_slot *memslot,
977 struct kvm_memory_slot old,
978 struct kvm_userspace_memory_region *mem, 980 struct kvm_userspace_memory_region *mem,
979 bool user_alloc) 981 enum kvm_mr_change change)
980{ 982{
981 /* A few sanity checks. We can have exactly one memory slot which has 983 /* A few sanity checks. We can have memory slots which have to be
982 to start at guest virtual zero and which has to be located at a 984 located/ended at a segment boundary (1MB). The memory in userland is
983 page boundary in userland and which has to end at a page boundary. 985 ok to be fragmented into various different vmas. It is okay to mmap()
984 The memory in userland is ok to be fragmented into various different 986 and munmap() stuff in this slot after doing this call at any time */
985 vmas. It is okay to mmap() and munmap() stuff in this slot after
986 doing this call at any time */
987
988 if (mem->slot)
989 return -EINVAL;
990
991 if (mem->guest_phys_addr)
992 return -EINVAL;
993 987
994 if (mem->userspace_addr & 0xffffful) 988 if (mem->userspace_addr & 0xffffful)
995 return -EINVAL; 989 return -EINVAL;
@@ -997,19 +991,26 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
997 if (mem->memory_size & 0xffffful) 991 if (mem->memory_size & 0xffffful)
998 return -EINVAL; 992 return -EINVAL;
999 993
1000 if (!user_alloc)
1001 return -EINVAL;
1002
1003 return 0; 994 return 0;
1004} 995}
1005 996
1006void kvm_arch_commit_memory_region(struct kvm *kvm, 997void kvm_arch_commit_memory_region(struct kvm *kvm,
1007 struct kvm_userspace_memory_region *mem, 998 struct kvm_userspace_memory_region *mem,
1008 struct kvm_memory_slot old, 999 const struct kvm_memory_slot *old,
1009 bool user_alloc) 1000 enum kvm_mr_change change)
1010{ 1001{
1011 int rc; 1002 int rc;
1012 1003
1004 /* If the basics of the memslot do not change, we do not want
1005 * to update the gmap. Every update causes several unnecessary
1006 * segment translation exceptions. This is usually handled just
1007 * fine by the normal fault handler + gmap, but it will also
1008 * cause faults on the prefix page of running guest CPUs.
1009 */
1010 if (old->userspace_addr == mem->userspace_addr &&
1011 old->base_gfn * PAGE_SIZE == mem->guest_phys_addr &&
1012 old->npages * PAGE_SIZE == mem->memory_size)
1013 return;
1013 1014
1014 rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, 1015 rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
1015 mem->guest_phys_addr, mem->memory_size); 1016 mem->guest_phys_addr, mem->memory_size);
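
With KVM_CAP_IOEVENTFD and KVM_CAP_NR_MEMSLOTS now advertised on s390, userspace can discover both through the usual capability query on the /dev/kvm system fd. A small sketch of that query (hypothetical helper, standard KVM ioctls):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative userspace probe of the capabilities advertised above. */
static int demo_probe_kvm_caps(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int nr_slots, have_ioeventfd, ok;

	if (kvm < 0)
		return -1;
	/* Number of user memory slots supported by this kernel. */
	nr_slots = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);
	have_ioeventfd = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_IOEVENTFD);
	ok = nr_slots > 0 && have_ioeventfd > 0;
	close(kvm);
	return ok ? nr_slots : -1;
}
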
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 4d89d64a8161..efc14f687265 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -110,12 +110,12 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
110void kvm_s390_tasklet(unsigned long parm); 110void kvm_s390_tasklet(unsigned long parm);
111void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); 111void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
112void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu); 112void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
113int kvm_s390_inject_vm(struct kvm *kvm, 113int __must_check kvm_s390_inject_vm(struct kvm *kvm,
114 struct kvm_s390_interrupt *s390int); 114 struct kvm_s390_interrupt *s390int);
115int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 115int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
116 struct kvm_s390_interrupt *s390int); 116 struct kvm_s390_interrupt *s390int);
117int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 117int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
118int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); 118int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
119struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, 119struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
120 u64 cr6, u64 schid); 120 u64 cr6, u64 schid);
121 121
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 0ef9894606e5..6bbd7b5a0bbe 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -14,6 +14,8 @@
14#include <linux/kvm.h> 14#include <linux/kvm.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/compat.h>
18#include <asm/asm-offsets.h>
17#include <asm/current.h> 19#include <asm/current.h>
18#include <asm/debug.h> 20#include <asm/debug.h>
19#include <asm/ebcdic.h> 21#include <asm/ebcdic.h>
@@ -35,31 +37,24 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
35 operand2 = kvm_s390_get_base_disp_s(vcpu); 37 operand2 = kvm_s390_get_base_disp_s(vcpu);
36 38
37 /* must be word boundary */ 39 /* must be word boundary */
38 if (operand2 & 3) { 40 if (operand2 & 3)
39 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 41 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
40 goto out;
41 }
42 42
43 /* get the value */ 43 /* get the value */
44 if (get_guest_u32(vcpu, operand2, &address)) { 44 if (get_guest(vcpu, address, (u32 __user *) operand2))
45 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 45 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
46 goto out;
47 }
48 46
49 address = address & 0x7fffe000u; 47 address = address & 0x7fffe000u;
50 48
51 /* make sure that the new value is valid memory */ 49 /* make sure that the new value is valid memory */
52 if (copy_from_guest_absolute(vcpu, &tmp, address, 1) || 50 if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
53 (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) { 51 (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)))
54 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 52 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
55 goto out;
56 }
57 53
58 kvm_s390_set_prefix(vcpu, address); 54 kvm_s390_set_prefix(vcpu, address);
59 55
60 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); 56 VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
61 trace_kvm_s390_handle_prefix(vcpu, 1, address); 57 trace_kvm_s390_handle_prefix(vcpu, 1, address);
62out:
63 return 0; 58 return 0;
64} 59}
65 60
@@ -73,49 +68,37 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
73 operand2 = kvm_s390_get_base_disp_s(vcpu); 68 operand2 = kvm_s390_get_base_disp_s(vcpu);
74 69
75 /* must be word boundary */ 70 /* must be word boundary */
76 if (operand2 & 3) { 71 if (operand2 & 3)
77 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 72 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
78 goto out;
79 }
80 73
81 address = vcpu->arch.sie_block->prefix; 74 address = vcpu->arch.sie_block->prefix;
82 address = address & 0x7fffe000u; 75 address = address & 0x7fffe000u;
83 76
84 /* get the value */ 77 /* get the value */
85 if (put_guest_u32(vcpu, operand2, address)) { 78 if (put_guest(vcpu, address, (u32 __user *)operand2))
86 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 79 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
87 goto out;
88 }
89 80
90 VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); 81 VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
91 trace_kvm_s390_handle_prefix(vcpu, 0, address); 82 trace_kvm_s390_handle_prefix(vcpu, 0, address);
92out:
93 return 0; 83 return 0;
94} 84}
95 85
96static int handle_store_cpu_address(struct kvm_vcpu *vcpu) 86static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
97{ 87{
98 u64 useraddr; 88 u64 useraddr;
99 int rc;
100 89
101 vcpu->stat.instruction_stap++; 90 vcpu->stat.instruction_stap++;
102 91
103 useraddr = kvm_s390_get_base_disp_s(vcpu); 92 useraddr = kvm_s390_get_base_disp_s(vcpu);
104 93
105 if (useraddr & 1) { 94 if (useraddr & 1)
106 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 95 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
107 goto out;
108 }
109 96
110 rc = put_guest_u16(vcpu, useraddr, vcpu->vcpu_id); 97 if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr))
111 if (rc == -EFAULT) { 98 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
112 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
113 goto out;
114 }
115 99
116 VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr); 100 VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
117 trace_kvm_s390_handle_stap(vcpu, useraddr); 101 trace_kvm_s390_handle_stap(vcpu, useraddr);
118out:
119 return 0; 102 return 0;
120} 103}
121 104
@@ -129,36 +112,38 @@ static int handle_skey(struct kvm_vcpu *vcpu)
129 112
130static int handle_tpi(struct kvm_vcpu *vcpu) 113static int handle_tpi(struct kvm_vcpu *vcpu)
131{ 114{
132 u64 addr;
133 struct kvm_s390_interrupt_info *inti; 115 struct kvm_s390_interrupt_info *inti;
116 u64 addr;
134 int cc; 117 int cc;
135 118
136 addr = kvm_s390_get_base_disp_s(vcpu); 119 addr = kvm_s390_get_base_disp_s(vcpu);
137 120 if (addr & 3)
121 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
122 cc = 0;
138 inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0); 123 inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
139 if (inti) { 124 if (!inti)
140 if (addr) { 125 goto no_interrupt;
141 /* 126 cc = 1;
142 * Store the two-word I/O interruption code into the 127 if (addr) {
143 * provided area. 128 /*
144 */ 129 * Store the two-word I/O interruption code into the
145 put_guest_u16(vcpu, addr, inti->io.subchannel_id); 130 * provided area.
146 put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr); 131 */
147 put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm); 132 put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr);
148 } else { 133 put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2));
149 /* 134 put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4));
150 * Store the three-word I/O interruption code into 135 } else {
151 * the appropriate lowcore area. 136 /*
152 */ 137 * Store the three-word I/O interruption code into
153 put_guest_u16(vcpu, 184, inti->io.subchannel_id); 138 * the appropriate lowcore area.
154 put_guest_u16(vcpu, 186, inti->io.subchannel_nr); 139 */
155 put_guest_u32(vcpu, 188, inti->io.io_int_parm); 140 put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID);
156 put_guest_u32(vcpu, 192, inti->io.io_int_word); 141 put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR);
157 } 142 put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM);
158 cc = 1; 143 put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD);
159 } else 144 }
160 cc = 0;
161 kfree(inti); 145 kfree(inti);
146no_interrupt:
162 /* Set condition code and we're done. */ 147 /* Set condition code and we're done. */
163 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 148 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
164 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44; 149 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
@@ -230,13 +215,10 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
230 215
231 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list), 216 rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
232 &facility_list, sizeof(facility_list)); 217 &facility_list, sizeof(facility_list));
233 if (rc == -EFAULT) 218 if (rc)
234 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 219 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
235 else { 220 VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
236 VCPU_EVENT(vcpu, 5, "store facility list value %x", 221 trace_kvm_s390_handle_stfl(vcpu, facility_list);
237 facility_list);
238 trace_kvm_s390_handle_stfl(vcpu, facility_list);
239 }
240 return 0; 222 return 0;
241} 223}
242 224
@@ -249,112 +231,80 @@ static void handle_new_psw(struct kvm_vcpu *vcpu)
249 231
250#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA) 232#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
251#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL 233#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
252#define PSW_ADDR_24 0x00000000000fffffUL 234#define PSW_ADDR_24 0x0000000000ffffffUL
253#define PSW_ADDR_31 0x000000007fffffffUL 235#define PSW_ADDR_31 0x000000007fffffffUL
254 236
237static int is_valid_psw(psw_t *psw) {
238 if (psw->mask & PSW_MASK_UNASSIGNED)
239 return 0;
240 if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) {
241 if (psw->addr & ~PSW_ADDR_31)
242 return 0;
243 }
244 if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24))
245 return 0;
246 if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_EA)
247 return 0;
248 return 1;
249}
250
255int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) 251int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
256{ 252{
257 u64 addr; 253 psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
258 psw_compat_t new_psw; 254 psw_compat_t new_psw;
255 u64 addr;
259 256
260 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 257 if (gpsw->mask & PSW_MASK_PSTATE)
261 return kvm_s390_inject_program_int(vcpu, 258 return kvm_s390_inject_program_int(vcpu,
262 PGM_PRIVILEGED_OPERATION); 259 PGM_PRIVILEGED_OPERATION);
263
264 addr = kvm_s390_get_base_disp_s(vcpu); 260 addr = kvm_s390_get_base_disp_s(vcpu);
265 261 if (addr & 7)
266 if (addr & 7) { 262 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
267 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 263 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
268 goto out; 264 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
269 } 265 if (!(new_psw.mask & PSW32_MASK_BASE))
270 266 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
271 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { 267 gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32;
272 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 268 gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE;
273 goto out; 269 gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE;
274 } 270 if (!is_valid_psw(gpsw))
275 271 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
276 if (!(new_psw.mask & PSW32_MASK_BASE)) {
277 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
278 goto out;
279 }
280
281 vcpu->arch.sie_block->gpsw.mask =
282 (new_psw.mask & ~PSW32_MASK_BASE) << 32;
283 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
284
285 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
286 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
287 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
288 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
289 PSW_MASK_EA)) {
290 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
291 goto out;
292 }
293
294 handle_new_psw(vcpu); 272 handle_new_psw(vcpu);
295out:
296 return 0; 273 return 0;
297} 274}
298 275
299static int handle_lpswe(struct kvm_vcpu *vcpu) 276static int handle_lpswe(struct kvm_vcpu *vcpu)
300{ 277{
301 u64 addr;
302 psw_t new_psw; 278 psw_t new_psw;
279 u64 addr;
303 280
304 addr = kvm_s390_get_base_disp_s(vcpu); 281 addr = kvm_s390_get_base_disp_s(vcpu);
305 282 if (addr & 7)
306 if (addr & 7) { 283 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
307 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 284 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
308 goto out; 285 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
309 } 286 vcpu->arch.sie_block->gpsw = new_psw;
310 287 if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
311 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) { 288 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
312 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
313 goto out;
314 }
315
316 vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
317 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
318
319 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
320 (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
321 PSW_MASK_BA) &&
322 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
323 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
324 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
325 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
326 PSW_MASK_EA)) {
327 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
328 goto out;
329 }
330
331 handle_new_psw(vcpu); 289 handle_new_psw(vcpu);
332out:
333 return 0; 290 return 0;
334} 291}
335 292
336static int handle_stidp(struct kvm_vcpu *vcpu) 293static int handle_stidp(struct kvm_vcpu *vcpu)
337{ 294{
338 u64 operand2; 295 u64 operand2;
339 int rc;
340 296
341 vcpu->stat.instruction_stidp++; 297 vcpu->stat.instruction_stidp++;
342 298
343 operand2 = kvm_s390_get_base_disp_s(vcpu); 299 operand2 = kvm_s390_get_base_disp_s(vcpu);
344 300
345 if (operand2 & 7) { 301 if (operand2 & 7)
346 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 302 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
347 goto out;
348 }
349 303
350 rc = put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data); 304 if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2))
351 if (rc == -EFAULT) { 305 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
352 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
353 goto out;
354 }
355 306
356 VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); 307 VCPU_EVENT(vcpu, 5, "%s", "store cpu id");
357out:
358 return 0; 308 return 0;
359} 309}
360 310
@@ -394,8 +344,9 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
394 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; 344 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
395 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; 345 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
396 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; 346 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
347 unsigned long mem = 0;
397 u64 operand2; 348 u64 operand2;
398 unsigned long mem; 349 int rc = 0;
399 350
400 vcpu->stat.instruction_stsi++; 351 vcpu->stat.instruction_stsi++;
401 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); 352 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
@@ -414,37 +365,37 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
414 case 2: 365 case 2:
415 mem = get_zeroed_page(GFP_KERNEL); 366 mem = get_zeroed_page(GFP_KERNEL);
416 if (!mem) 367 if (!mem)
417 goto out_fail; 368 goto out_no_data;
418 if (stsi((void *) mem, fc, sel1, sel2)) 369 if (stsi((void *) mem, fc, sel1, sel2))
419 goto out_mem; 370 goto out_no_data;
420 break; 371 break;
421 case 3: 372 case 3:
422 if (sel1 != 2 || sel2 != 2) 373 if (sel1 != 2 || sel2 != 2)
423 goto out_fail; 374 goto out_no_data;
424 mem = get_zeroed_page(GFP_KERNEL); 375 mem = get_zeroed_page(GFP_KERNEL);
425 if (!mem) 376 if (!mem)
426 goto out_fail; 377 goto out_no_data;
427 handle_stsi_3_2_2(vcpu, (void *) mem); 378 handle_stsi_3_2_2(vcpu, (void *) mem);
428 break; 379 break;
429 default: 380 default:
430 goto out_fail; 381 goto out_no_data;
431 } 382 }
432 383
433 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) { 384 if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
434 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 385 rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
435 goto out_mem; 386 goto out_exception;
436 } 387 }
437 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); 388 trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
438 free_page(mem); 389 free_page(mem);
439 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 390 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
440 vcpu->run->s.regs.gprs[0] = 0; 391 vcpu->run->s.regs.gprs[0] = 0;
441 return 0; 392 return 0;
442out_mem: 393out_no_data:
443 free_page(mem);
444out_fail:
445 /* condition code 3 */ 394 /* condition code 3 */
446 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44; 395 vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
447 return 0; 396out_exception:
397 free_page(mem);
398 return rc;
448} 399}
449 400
450static const intercept_handler_t b2_handlers[256] = { 401static const intercept_handler_t b2_handlers[256] = {
@@ -575,20 +526,13 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
575 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) 526 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
576 return -EOPNOTSUPP; 527 return -EOPNOTSUPP;
577 528
578
579 /* we must resolve the address without holding the mmap semaphore.
580 * This is ok since the userspace hypervisor is not supposed to change
581 * the mapping while the guest queries the memory. Otherwise the guest
582 * might crash or get wrong info anyway. */
583 user_address = (unsigned long) __guestaddr_to_user(vcpu, address1);
584
585 down_read(&current->mm->mmap_sem); 529 down_read(&current->mm->mmap_sem);
530 user_address = __gmap_translate(address1, vcpu->arch.gmap);
531 if (IS_ERR_VALUE(user_address))
532 goto out_inject;
586 vma = find_vma(current->mm, user_address); 533 vma = find_vma(current->mm, user_address);
587 if (!vma) { 534 if (!vma)
588 up_read(&current->mm->mmap_sem); 535 goto out_inject;
589 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
590 }
591
592 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 536 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
593 if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ)) 537 if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ))
594 vcpu->arch.sie_block->gpsw.mask |= (1ul << 44); 538 vcpu->arch.sie_block->gpsw.mask |= (1ul << 44);
@@ -597,6 +541,10 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
597 541
598 up_read(&current->mm->mmap_sem); 542 up_read(&current->mm->mmap_sem);
599 return 0; 543 return 0;
544
545out_inject:
546 up_read(&current->mm->mmap_sem);
547 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
600} 548}
601 549
602int kvm_s390_handle_e5(struct kvm_vcpu *vcpu) 550int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
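
Two details in the priv.c hunks are easy to miss: PSW_ADDR_24 is corrected from 0x000fffff to 0x00ffffff (a genuine 24-bit address mask), and the scattered mode/address checks are centralised in is_valid_psw(), which lpsw and lpswe now call after loading the new PSW. A small illustration of what the check accepts and rejects, using example values only (this would have to sit in priv.c, since is_valid_psw() is static there):

/* Illustrative values run through is_valid_psw() as defined above. */
static void demo_psw_validity(void)
{
	psw_t psw24 = { .mask = 0,           .addr = 0x00fffffe };
	psw_t psw31 = { .mask = PSW_MASK_BA, .addr = 0x7ffffffe };

	WARN_ON(!is_valid_psw(&psw24));	/* valid now that PSW_ADDR_24 covers 24 bits */
	WARN_ON(!is_valid_psw(&psw31));	/* fits in the 31-bit mask */

	psw31.addr = 0x80000000UL;	/* one past the 31-bit range */
	WARN_ON(is_valid_psw(&psw31));	/* rejected */
}
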
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa0005c69..9bd4ecac72be 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
19 19
20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
21 21
22#ifdef CONFIG_HAVE_KVM
23BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
24#endif
25
22/* 26/*
23 * every pentium local APIC has two 'local interrupts', with a 27 * every pentium local APIC has two 'local interrupts', with a
24 * soft-definable vector attached to both interrupts, one of 28 * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 81f04cee5f74..ab0ae1aa6d0a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,9 @@ typedef struct {
12 unsigned int irq_spurious_count; 12 unsigned int irq_spurious_count;
13 unsigned int icr_read_retry_count; 13 unsigned int icr_read_retry_count;
14#endif 14#endif
15#ifdef CONFIG_HAVE_KVM
16 unsigned int kvm_posted_intr_ipis;
17#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 18 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 19 unsigned int apic_perf_irqs;
17 unsigned int apic_irq_work_irqs; 20 unsigned int apic_irq_work_irqs;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 10a78c3d3d5a..1da97efad08a 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,6 +28,7 @@
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void x86_platform_ipi(void); 30extern void x86_platform_ipi(void);
31extern void kvm_posted_intr_ipi(void);
31extern void error_interrupt(void); 32extern void error_interrupt(void);
32extern void irq_work_interrupt(void); 33extern void irq_work_interrupt(void);
33 34
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index aac5fa62a86c..5702d7e3111d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,6 +102,11 @@
102 */ 102 */
103#define X86_PLATFORM_IPI_VECTOR 0xf7 103#define X86_PLATFORM_IPI_VECTOR 0xf7
104 104
105/* Vector for KVM to deliver posted interrupt IPI */
106#ifdef CONFIG_HAVE_KVM
107#define POSTED_INTR_VECTOR 0xf2
108#endif
109
105/* 110/*
106 * IRQ work vector: 111 * IRQ work vector:
107 */ 112 */
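
POSTED_INTR_VECTOR gives APICv posted interrupts a dedicated IPI vector just below X86_PLATFORM_IPI_VECTOR. The entry stubs (entry_arch.h above for 32-bit, entry_64.S elsewhere in this series for 64-bit) dispatch to a host handler whose only real job is to ack the APIC and bump the new kvm_posted_intr_ipis counter, since any interrupt destined for the guest has already been recorded in the posted-interrupt descriptor. A simplified sketch of such a handler; the exact function name and the surrounding irq_enter/exit_idle bookkeeping in the real kernel may differ:

/* Simplified sketch of the host-side handler behind POSTED_INTR_VECTOR. */
void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	ack_APIC_irq();				/* consume the IPI in host context */
	irq_enter();
	inc_irq_stat(kvm_posted_intr_ipis);	/* the new hardirq.h counter */
	irq_exit();
	set_irq_regs(old_regs);
}
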
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778cc7fb..3741c653767c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,7 +31,7 @@
31#include <asm/msr-index.h> 31#include <asm/msr-index.h>
32#include <asm/asm.h> 32#include <asm/asm.h>
33 33
34#define KVM_MAX_VCPUS 254 34#define KVM_MAX_VCPUS 255
35#define KVM_SOFT_MAX_VCPUS 160 35#define KVM_SOFT_MAX_VCPUS 160
36#define KVM_USER_MEM_SLOTS 125 36#define KVM_USER_MEM_SLOTS 125
37/* memory slots that are not exposed to userspace */ 37/* memory slots that are not exposed to userspace */
@@ -43,6 +43,8 @@
43#define KVM_PIO_PAGE_OFFSET 1 43#define KVM_PIO_PAGE_OFFSET 1
44#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 44#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
45 45
46#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
47
46#define CR0_RESERVED_BITS \ 48#define CR0_RESERVED_BITS \
47 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 49 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
48 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 50 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -94,9 +96,6 @@
94 96
95#define ASYNC_PF_PER_VCPU 64 97#define ASYNC_PF_PER_VCPU 64
96 98
97extern raw_spinlock_t kvm_lock;
98extern struct list_head vm_list;
99
100struct kvm_vcpu; 99struct kvm_vcpu;
101struct kvm; 100struct kvm;
102struct kvm_async_pf; 101struct kvm_async_pf;
@@ -230,6 +229,7 @@ struct kvm_mmu_page {
230#endif 229#endif
231 230
232 int write_flooding_count; 231 int write_flooding_count;
232 bool mmio_cached;
233}; 233};
234 234
235struct kvm_pio_request { 235struct kvm_pio_request {
@@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
345 unsigned long apic_attention; 345 unsigned long apic_attention;
346 int32_t apic_arb_prio; 346 int32_t apic_arb_prio;
347 int mp_state; 347 int mp_state;
348 int sipi_vector;
349 u64 ia32_misc_enable_msr; 348 u64 ia32_misc_enable_msr;
350 bool tpr_access_reporting; 349 bool tpr_access_reporting;
351 350
@@ -643,7 +642,7 @@ struct kvm_x86_ops {
643 /* Create, but do not attach this VCPU */ 642 /* Create, but do not attach this VCPU */
644 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 643 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
645 void (*vcpu_free)(struct kvm_vcpu *vcpu); 644 void (*vcpu_free)(struct kvm_vcpu *vcpu);
646 int (*vcpu_reset)(struct kvm_vcpu *vcpu); 645 void (*vcpu_reset)(struct kvm_vcpu *vcpu);
647 646
648 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 647 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
649 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 648 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -696,14 +695,16 @@ struct kvm_x86_ops {
696 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 695 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
697 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 696 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
698 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); 697 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 698 int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
700 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 699 int (*enable_irq_window)(struct kvm_vcpu *vcpu);
701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 700 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
702 int (*vm_has_apicv)(struct kvm *kvm); 701 int (*vm_has_apicv)(struct kvm *kvm);
703 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 702 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
704 void (*hwapic_isr_update)(struct kvm *kvm, int isr); 703 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
705 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 704 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
706 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 705 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
706 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
707 void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 708 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
708 int (*get_tdp_level)(void); 709 int (*get_tdp_level)(void);
709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 710 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -730,6 +731,7 @@ struct kvm_x86_ops {
730 int (*check_intercept)(struct kvm_vcpu *vcpu, 731 int (*check_intercept)(struct kvm_vcpu *vcpu,
731 struct x86_instruction_info *info, 732 struct x86_instruction_info *info,
732 enum x86_intercept_stage stage); 733 enum x86_intercept_stage stage);
734 void (*handle_external_intr)(struct kvm_vcpu *vcpu);
733}; 735};
734 736
735struct kvm_arch_async_pf { 737struct kvm_arch_async_pf {
@@ -767,6 +769,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
767 struct kvm_memory_slot *slot, 769 struct kvm_memory_slot *slot,
768 gfn_t gfn_offset, unsigned long mask); 770 gfn_t gfn_offset, unsigned long mask);
769void kvm_mmu_zap_all(struct kvm *kvm); 771void kvm_mmu_zap_all(struct kvm *kvm);
772void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
770unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 773unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
771void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 774void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
772 775
@@ -797,6 +800,7 @@ enum emulation_result {
797#define EMULTYPE_TRAP_UD (1 << 1) 800#define EMULTYPE_TRAP_UD (1 << 1)
798#define EMULTYPE_SKIP (1 << 2) 801#define EMULTYPE_SKIP (1 << 2)
799#define EMULTYPE_RETRY (1 << 3) 802#define EMULTYPE_RETRY (1 << 3)
803#define EMULTYPE_NO_REEXECUTE (1 << 4)
800int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, 804int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
801 int emulation_type, void *insn, int insn_len); 805 int emulation_type, void *insn, int insn_len);
802 806
@@ -807,6 +811,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
807} 811}
808 812
809void kvm_enable_efer_bits(u64); 813void kvm_enable_efer_bits(u64);
814bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
810int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 815int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
811int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 816int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
812 817
@@ -819,6 +824,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
819 824
820void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 825void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
821int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 826int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
827void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
822 828
823int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, 829int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
824 int reason, bool has_error_code, u32 error_code); 830 int reason, bool has_error_code, u32 error_code);
@@ -973,7 +979,6 @@ enum {
973 * Trap the fault and ignore the instruction if that happens. 979 * Trap the fault and ignore the instruction if that happens.
974 */ 980 */
975asmlinkage void kvm_spurious_fault(void); 981asmlinkage void kvm_spurious_fault(void);
976extern bool kvm_rebooting;
977 982
978#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ 983#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
979 "666: " insn "\n\t" \ 984 "666: " insn "\n\t" \
@@ -1002,6 +1007,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
1002int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1007int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
1003int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1008int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
1004int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1009int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
1010void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
1005 1011
1006void kvm_define_shared_msr(unsigned index, u32 msr); 1012void kvm_define_shared_msr(unsigned index, u32 msr);
1007void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 1013void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
@@ -1027,7 +1033,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu);
1027void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); 1033void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
1028bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); 1034bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
1029int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 1035int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
1030int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); 1036int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
1031int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); 1037int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
1032void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); 1038void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
1033void kvm_deliver_pmi(struct kvm_vcpu *vcpu); 1039void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf860e398..f3e01a2cbaa1 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -65,11 +65,16 @@
65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
68#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
68 69
69 70
70#define PIN_BASED_EXT_INTR_MASK 0x00000001 71#define PIN_BASED_EXT_INTR_MASK 0x00000001
71#define PIN_BASED_NMI_EXITING 0x00000008 72#define PIN_BASED_NMI_EXITING 0x00000008
72#define PIN_BASED_VIRTUAL_NMIS 0x00000020 73#define PIN_BASED_VIRTUAL_NMIS 0x00000020
74#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
75#define PIN_BASED_POSTED_INTR 0x00000080
76
77#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
73 78
74#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 79#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
75#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 80#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
@@ -81,6 +86,8 @@
81#define VM_EXIT_LOAD_IA32_EFER 0x00200000 86#define VM_EXIT_LOAD_IA32_EFER 0x00200000
82#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 87#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
83 88
89#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
90
84#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 91#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
85#define VM_ENTRY_IA32E_MODE 0x00000200 92#define VM_ENTRY_IA32E_MODE 0x00000200
86#define VM_ENTRY_SMM 0x00000400 93#define VM_ENTRY_SMM 0x00000400
@@ -89,9 +96,15 @@
89#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 96#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
90#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 97#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
91 98
99#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
100
101#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
102#define VMX_MISC_SAVE_EFER_LMA 0x00000020
103
92/* VMCS Encodings */ 104/* VMCS Encodings */
93enum vmcs_field { 105enum vmcs_field {
94 VIRTUAL_PROCESSOR_ID = 0x00000000, 106 VIRTUAL_PROCESSOR_ID = 0x00000000,
107 POSTED_INTR_NV = 0x00000002,
95 GUEST_ES_SELECTOR = 0x00000800, 108 GUEST_ES_SELECTOR = 0x00000800,
96 GUEST_CS_SELECTOR = 0x00000802, 109 GUEST_CS_SELECTOR = 0x00000802,
97 GUEST_SS_SELECTOR = 0x00000804, 110 GUEST_SS_SELECTOR = 0x00000804,
@@ -126,6 +139,8 @@ enum vmcs_field {
126 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, 139 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
127 APIC_ACCESS_ADDR = 0x00002014, 140 APIC_ACCESS_ADDR = 0x00002014,
128 APIC_ACCESS_ADDR_HIGH = 0x00002015, 141 APIC_ACCESS_ADDR_HIGH = 0x00002015,
142 POSTED_INTR_DESC_ADDR = 0x00002016,
143 POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
129 EPT_POINTER = 0x0000201a, 144 EPT_POINTER = 0x0000201a,
130 EPT_POINTER_HIGH = 0x0000201b, 145 EPT_POINTER_HIGH = 0x0000201b,
131 EOI_EXIT_BITMAP0 = 0x0000201c, 146 EOI_EXIT_BITMAP0 = 0x0000201c,
@@ -136,6 +151,8 @@ enum vmcs_field {
136 EOI_EXIT_BITMAP2_HIGH = 0x00002021, 151 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
137 EOI_EXIT_BITMAP3 = 0x00002022, 152 EOI_EXIT_BITMAP3 = 0x00002022,
138 EOI_EXIT_BITMAP3_HIGH = 0x00002023, 153 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
154 VMREAD_BITMAP = 0x00002026,
155 VMWRITE_BITMAP = 0x00002028,
139 GUEST_PHYSICAL_ADDRESS = 0x00002400, 156 GUEST_PHYSICAL_ADDRESS = 0x00002400,
140 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 157 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
141 VMCS_LINK_POINTER = 0x00002800, 158 VMCS_LINK_POINTER = 0x00002800,
@@ -209,6 +226,7 @@ enum vmcs_field {
209 GUEST_INTERRUPTIBILITY_INFO = 0x00004824, 226 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
210 GUEST_ACTIVITY_STATE = 0X00004826, 227 GUEST_ACTIVITY_STATE = 0X00004826,
211 GUEST_SYSENTER_CS = 0x0000482A, 228 GUEST_SYSENTER_CS = 0x0000482A,
229 VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
212 HOST_IA32_SYSENTER_CS = 0x00004c00, 230 HOST_IA32_SYSENTER_CS = 0x00004c00,
213 CR0_GUEST_HOST_MASK = 0x00006000, 231 CR0_GUEST_HOST_MASK = 0x00006000,
214 CR4_GUEST_HOST_MASK = 0x00006002, 232 CR4_GUEST_HOST_MASK = 0x00006002,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a65ec29e6ffb..5d9a3033b3d7 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -29,7 +29,6 @@
29#define __KVM_HAVE_PIT 29#define __KVM_HAVE_PIT
30#define __KVM_HAVE_IOAPIC 30#define __KVM_HAVE_IOAPIC
31#define __KVM_HAVE_IRQ_LINE 31#define __KVM_HAVE_IRQ_LINE
32#define __KVM_HAVE_DEVICE_ASSIGNMENT
33#define __KVM_HAVE_MSI 32#define __KVM_HAVE_MSI
34#define __KVM_HAVE_USER_NMI 33#define __KVM_HAVE_USER_NMI
35#define __KVM_HAVE_GUEST_DEBUG 34#define __KVM_HAVE_GUEST_DEBUG
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index b5757885d7a4..b3a4866661c5 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -528,6 +528,8 @@
528#define VMX_BASIC_MEM_TYPE_WB 6LLU 528#define VMX_BASIC_MEM_TYPE_WB 6LLU
529#define VMX_BASIC_INOUT 0x0040000000000000LLU 529#define VMX_BASIC_INOUT 0x0040000000000000LLU
530 530
531/* MSR_IA32_VMX_MISC bits */
532#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
531/* AMD-V MSRs */ 533/* AMD-V MSRs */
532 534
533#define MSR_VM_CR 0xc0010114 535#define MSR_VM_CR 0xc0010114
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fccfee68..d651082c7cf7 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
65#define EXIT_REASON_EOI_INDUCED 45 65#define EXIT_REASON_EOI_INDUCED 45
66#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
67#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
68#define EXIT_REASON_PREEMPTION_TIMER 52
68#define EXIT_REASON_WBINVD 54 69#define EXIT_REASON_WBINVD 54
69#define EXIT_REASON_XSETBV 55 70#define EXIT_REASON_XSETBV 55
70#define EXIT_REASON_APIC_WRITE 56 71#define EXIT_REASON_APIC_WRITE 56
@@ -110,7 +111,7 @@
110 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 111 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
111 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 112 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
112 { EXIT_REASON_INVD, "INVD" }, \ 113 { EXIT_REASON_INVD, "INVD" }, \
113 { EXIT_REASON_INVPCID, "INVPCID" } 114 { EXIT_REASON_INVPCID, "INVPCID" }, \
114 115 { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }
115 116
116#endif /* _UAPIVMX_H */ 117#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c1d01e6ca790..727208941030 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1166,6 +1166,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1166apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1166apicinterrupt X86_PLATFORM_IPI_VECTOR \
1167 x86_platform_ipi smp_x86_platform_ipi 1167 x86_platform_ipi smp_x86_platform_ipi
1168 1168
1169#ifdef CONFIG_HAVE_KVM
1170apicinterrupt POSTED_INTR_VECTOR \
1171 kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
1172#endif
1173
1169apicinterrupt THRESHOLD_APIC_VECTOR \ 1174apicinterrupt THRESHOLD_APIC_VECTOR \
1170 threshold_interrupt smp_threshold_interrupt 1175 threshold_interrupt smp_threshold_interrupt
1171apicinterrupt THERMAL_APIC_VECTOR \ 1176apicinterrupt THERMAL_APIC_VECTOR \
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 84b778962c66..ac0631d8996f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -224,6 +224,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
224 set_irq_regs(old_regs); 224 set_irq_regs(old_regs);
225} 225}
226 226
227#ifdef CONFIG_HAVE_KVM
228/*
229 * Handler for POSTED_INTERRUPT_VECTOR.
230 */
231void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
232{
233 struct pt_regs *old_regs = set_irq_regs(regs);
234
235 ack_APIC_irq();
236
237 irq_enter();
238
239 exit_idle();
240
241 inc_irq_stat(kvm_posted_intr_ipis);
242
243 irq_exit();
244
245 set_irq_regs(old_regs);
246}
247#endif
248
227EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 249EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
228 250
229#ifdef CONFIG_HOTPLUG_CPU 251#ifdef CONFIG_HOTPLUG_CPU
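The handler added here only acknowledges the IPI and bumps the kvm_posted_intr_ipis count; the posted vectors themselves are picked up later, when the posted-interrupt requests are synchronized into the guest's IRR (see sync_pir_to_irr and kvm_apic_update_irr further down in this series). As a rough illustration of that hand-off (a minimal user-space sketch with illustrative names, not kernel code), the post-and-drain pattern looks like this:

/*
 * Minimal user-space model of the posted-interrupt hand-off: a sender
 * sets a bit in a shared "posted interrupt request" (PIR) word, and the
 * receiver drains the PIR with an atomic exchange and merges it into a
 * local IRR, the same xchg-and-merge pattern kvm_apic_update_irr() uses.
 * All names here are illustrative; this is not the KVM implementation.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PIR_WORDS 8                      /* 8 x 32 bits = 256 vectors */

static _Atomic uint32_t pir[PIR_WORDS];  /* shared between sender and receiver */
static uint32_t irr[PIR_WORDS];          /* private to the receiver */

/* Sender side: post one vector; in KVM this is followed by the notify IPI. */
static void post_vector(unsigned int vec)
{
    atomic_fetch_or(&pir[(vec >> 5) & 7], 1u << (vec & 31));
}

/* Receiver side: drain the PIR into the IRR, clearing each word atomically. */
static void sync_pir_to_irr(void)
{
    for (int i = 0; i < PIR_WORDS; i++) {
        uint32_t val = atomic_exchange(&pir[i], 0);
        if (val)
            irr[i] |= val;
    }
}

int main(void)
{
    post_vector(0xf2);        /* e.g. the notification vector itself */
    post_vector(32);
    sync_pir_to_irr();
    for (int i = 0; i < PIR_WORDS; i++)
        if (irr[i])
            printf("irr[%d] = 0x%08x\n", i, (unsigned)irr[i]);
    return 0;
}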
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7dc4e459c2b3..a2a1fbc594ff 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -172,6 +172,10 @@ static void __init apic_intr_init(void)
172 172
173 /* IPI for X86 platform specific use */ 173 /* IPI for X86 platform specific use */
174 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); 174 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
175#ifdef CONFIG_HAVE_KVM
176 /* IPI for KVM to deliver posted interrupt */
177 alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
178#endif
175 179
176 /* IPI vectors for APIC spurious and error interrupts */ 180 /* IPI vectors for APIC spurious and error interrupts */
177 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 181 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0732f0089a3d..d2c381280e3c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,8 +160,12 @@ int kvm_register_clock(char *txt)
160{ 160{
161 int cpu = smp_processor_id(); 161 int cpu = smp_processor_id();
162 int low, high, ret; 162 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 struct pvclock_vcpu_time_info *src;
164
165 if (!hv_clock)
166 return 0;
164 167
168 src = &hv_clock[cpu].pvti;
165 low = (int)slow_virt_to_phys(src) | 1; 169 low = (int)slow_virt_to_phys(src) | 1;
166 high = ((u64)slow_virt_to_phys(src) >> 32); 170 high = ((u64)slow_virt_to_phys(src) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 171 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
@@ -276,6 +280,9 @@ int __init kvm_setup_vsyscall_timeinfo(void)
276 struct pvclock_vcpu_time_info *vcpu_time; 280 struct pvclock_vcpu_time_info *vcpu_time;
277 unsigned int size; 281 unsigned int size;
278 282
283 if (!hv_clock)
284 return 0;
285
279 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); 286 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
280 287
281 preempt_disable(); 288 preempt_disable();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 586f00059805..a47a3e54b964 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,14 +21,13 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 depends on HIGH_RES_TIMERS 23 depends on HIGH_RES_TIMERS
24 # for device assignment:
25 depends on PCI
26 # for TASKSTATS/TASK_DELAY_ACCT: 24 # for TASKSTATS/TASK_DELAY_ACCT:
27 depends on NET 25 depends on NET
28 select PREEMPT_NOTIFIERS 26 select PREEMPT_NOTIFIERS
29 select MMU_NOTIFIER 27 select MMU_NOTIFIER
30 select ANON_INODES 28 select ANON_INODES
31 select HAVE_KVM_IRQCHIP 29 select HAVE_KVM_IRQCHIP
30 select HAVE_KVM_IRQ_ROUTING
32 select HAVE_KVM_EVENTFD 31 select HAVE_KVM_EVENTFD
33 select KVM_APIC_ARCHITECTURE 32 select KVM_APIC_ARCHITECTURE
34 select KVM_ASYNC_PF 33 select KVM_ASYNC_PF
@@ -82,6 +81,17 @@ config KVM_MMU_AUDIT
82 This option adds a R/W KVM module parameter 'mmu_audit', which allows 81 This option adds a R/W KVM module parameter 'mmu_audit', which allows
83 auditing of the KVM MMU at runtime. 82 auditing of the KVM MMU at runtime.
84 83
84config KVM_DEVICE_ASSIGNMENT
85 bool "KVM legacy PCI device assignment support"
86 depends on KVM && PCI && IOMMU_API
87 default y
88 ---help---
89 Provide support for legacy PCI device assignment through KVM. The
90 kernel now also supports a full featured userspace device driver
91 framework through VFIO, which supersedes much of this support.
92
93 If unsure, say Y.
94
85# OK, it's a little counter-intuitive to do this, but it puts it neatly under 95# OK, it's a little counter-intuitive to do this, but it puts it neatly under
86# the virtualization menu. 96# the virtualization menu.
87source drivers/vhost/Kconfig 97source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 04d30401c5cb..d609e1d84048 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,9 @@ CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 irqchip.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(addprefix ../../../virt/kvm/, \
12 assigned-dev.o iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 13kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 14
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 15kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a335cc6cde72..8e517bba6a7c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -132,8 +132,9 @@
132#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 132#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
133#define No64 (1<<28) 133#define No64 (1<<28)
134#define PageTable (1 << 29) /* instruction used to write page table */ 134#define PageTable (1 << 29) /* instruction used to write page table */
135#define NotImpl (1 << 30) /* instruction is not implemented */
135/* Source 2 operand type */ 136/* Source 2 operand type */
136#define Src2Shift (30) 137#define Src2Shift (31)
137#define Src2None (OpNone << Src2Shift) 138#define Src2None (OpNone << Src2Shift)
138#define Src2CL (OpCL << Src2Shift) 139#define Src2CL (OpCL << Src2Shift)
139#define Src2ImmByte (OpImmByte << Src2Shift) 140#define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1578,12 +1579,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1578 1579
1579 memset(&seg_desc, 0, sizeof seg_desc); 1580 memset(&seg_desc, 0, sizeof seg_desc);
1580 1581
1581 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) 1582 if (ctxt->mode == X86EMUL_MODE_REAL) {
1582 || ctxt->mode == X86EMUL_MODE_REAL) { 1583 /* set real mode segment descriptor (keep limit etc. for
1583 /* set real mode segment descriptor */ 1584 * unreal mode) */
1584 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg); 1585 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
1585 set_desc_base(&seg_desc, selector << 4); 1586 set_desc_base(&seg_desc, selector << 4);
1586 goto load; 1587 goto load;
1588 } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
1589 /* VM86 needs a clean new segment descriptor */
1590 set_desc_base(&seg_desc, selector << 4);
1591 set_desc_limit(&seg_desc, 0xffff);
1592 seg_desc.type = 3;
1593 seg_desc.p = 1;
1594 seg_desc.s = 1;
1595 seg_desc.dpl = 3;
1596 goto load;
1587 } 1597 }
1588 1598
1589 rpl = selector & 3; 1599 rpl = selector & 3;
@@ -3615,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3615#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } 3625#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
3616#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ 3626#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
3617 .check_perm = (_p) } 3627 .check_perm = (_p) }
3618#define N D(0) 3628#define N D(NotImpl)
3619#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3629#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3620#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3630#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3621#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3631#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
@@ -3713,7 +3723,7 @@ static const struct opcode group5[] = {
3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3723 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3714 I(SrcMem | Stack, em_grp45), 3724 I(SrcMem | Stack, em_grp45),
3715 I(SrcMemFAddr | ImplicitOps, em_grp45), 3725 I(SrcMemFAddr | ImplicitOps, em_grp45),
3716 I(SrcMem | Stack, em_grp45), N, 3726 I(SrcMem | Stack, em_grp45), D(Undefined),
3717}; 3727};
3718 3728
3719static const struct opcode group6[] = { 3729static const struct opcode group6[] = {
@@ -4162,6 +4172,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4162 break; 4172 break;
4163 case OpMem8: 4173 case OpMem8:
4164 ctxt->memop.bytes = 1; 4174 ctxt->memop.bytes = 1;
4175 if (ctxt->memop.type == OP_REG) {
4176 ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
4177 fetch_register_operand(&ctxt->memop);
4178 }
4165 goto mem_common; 4179 goto mem_common;
4166 case OpMem16: 4180 case OpMem16:
4167 ctxt->memop.bytes = 2; 4181 ctxt->memop.bytes = 2;
@@ -4373,7 +4387,7 @@ done_prefixes:
4373 ctxt->intercept = opcode.intercept; 4387 ctxt->intercept = opcode.intercept;
4374 4388
4375 /* Unrecognised? */ 4389 /* Unrecognised? */
4376 if (ctxt->d == 0 || (ctxt->d & Undefined)) 4390 if (ctxt->d == 0 || (ctxt->d & NotImpl))
4377 return EMULATION_FAILED; 4391 return EMULATION_FAILED;
4378 4392
4379 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 4393 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
@@ -4511,7 +4525,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4511 4525
4512 ctxt->mem_read.pos = 0; 4526 ctxt->mem_read.pos = 0;
4513 4527
4514 if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { 4528 if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
4529 (ctxt->d & Undefined)) {
4515 rc = emulate_ud(ctxt); 4530 rc = emulate_ud(ctxt);
4516 goto done; 4531 goto done;
4517 } 4532 }
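The opcode-flag change above adds NotImpl at bit 30 and moves Src2Shift from 30 to 31, since a one-bit flag and the multi-bit Src2 field cannot share a bit position. A tiny stand-alone illustration of the aliasing that the shift avoids (simplified layout and field width, not the emulator's real flag word):

/*
 * Simplified illustration of why a single-bit flag and a multi-bit field
 * cannot start at the same bit.  The width and positions are made up for
 * the example; only the aliasing effect matters.
 */
#include <stdint.h>
#include <stdio.h>

#define NOT_IMPL        (1ull << 30)     /* single-bit flag                */
#define SRC2_SHIFT_OLD  30               /* would overlap NOT_IMPL         */
#define SRC2_SHIFT_NEW  31               /* moved up, no overlap           */

int main(void)
{
    uint64_t d = NOT_IMPL;               /* an entry that is only NotImpl  */

    /* With the old shift, a NotImpl-only entry decodes as Src2 == 1. */
    printf("old layout: src2 = %llu (bogus)\n",
           (unsigned long long)((d >> SRC2_SHIFT_OLD) & 0x1f));
    /* With the new shift, the same entry decodes as Src2 == 0 (none). */
    printf("new layout: src2 = %llu\n",
           (unsigned long long)((d >> SRC2_SHIFT_NEW) & 0x1f));
    return 0;
}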
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c1d30b2fc9bb..412a5aa0ef94 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work)
290 } 290 }
291 spin_unlock(&ps->inject_lock); 291 spin_unlock(&ps->inject_lock);
292 if (inject) { 292 if (inject) {
293 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 293 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
294 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 294 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
295 295
296 /* 296 /*
297 * Provides NMI watchdog support via Virtual Wire mode. 297 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f77df1c5de6e..e1adbb4aca75 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 94 return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
95} 95}
96 96
97bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
98{
99 struct kvm_lapic *apic = vcpu->arch.apic;
100
101 return apic_test_vector(vector, apic->regs + APIC_ISR) ||
102 apic_test_vector(vector, apic->regs + APIC_IRR);
103}
104
97static inline void apic_set_vector(int vec, void *bitmap) 105static inline void apic_set_vector(int vec, void *bitmap)
98{ 106{
99 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 107 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -145,53 +153,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 153 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
146} 154}
147 155
148void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
149 struct kvm_lapic_irq *irq,
150 u64 *eoi_exit_bitmap)
151{
152 struct kvm_lapic **dst;
153 struct kvm_apic_map *map;
154 unsigned long bitmap = 1;
155 int i;
156
157 rcu_read_lock();
158 map = rcu_dereference(vcpu->kvm->arch.apic_map);
159
160 if (unlikely(!map)) {
161 __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
162 goto out;
163 }
164
165 if (irq->dest_mode == 0) { /* physical mode */
166 if (irq->delivery_mode == APIC_DM_LOWEST ||
167 irq->dest_id == 0xff) {
168 __set_bit(irq->vector,
169 (unsigned long *)eoi_exit_bitmap);
170 goto out;
171 }
172 dst = &map->phys_map[irq->dest_id & 0xff];
173 } else {
174 u32 mda = irq->dest_id << (32 - map->ldr_bits);
175
176 dst = map->logical_map[apic_cluster_id(map, mda)];
177
178 bitmap = apic_logical_id(map, mda);
179 }
180
181 for_each_set_bit(i, &bitmap, 16) {
182 if (!dst[i])
183 continue;
184 if (dst[i]->vcpu == vcpu) {
185 __set_bit(irq->vector,
186 (unsigned long *)eoi_exit_bitmap);
187 break;
188 }
189 }
190
191out:
192 rcu_read_unlock();
193}
194
195static void recalculate_apic_map(struct kvm *kvm) 156static void recalculate_apic_map(struct kvm *kvm)
196{ 157{
197 struct kvm_apic_map *new, *old = NULL; 158 struct kvm_apic_map *new, *old = NULL;
@@ -256,7 +217,7 @@ out:
256 if (old) 217 if (old)
257 kfree_rcu(old, rcu); 218 kfree_rcu(old, rcu);
258 219
259 kvm_ioapic_make_eoibitmap_request(kvm); 220 kvm_vcpu_request_scan_ioapic(kvm);
260} 221}
261 222
262static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) 223static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -357,6 +318,19 @@ static u8 count_vectors(void *bitmap)
357 return count; 318 return count;
358} 319}
359 320
321void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
322{
323 u32 i, pir_val;
324 struct kvm_lapic *apic = vcpu->arch.apic;
325
326 for (i = 0; i <= 7; i++) {
327 pir_val = xchg(&pir[i], 0);
328 if (pir_val)
329 *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
330 }
331}
332EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
333
360static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 334static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
361{ 335{
362 apic->irr_pending = true; 336 apic->irr_pending = true;
@@ -379,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
379 if (!apic->irr_pending) 353 if (!apic->irr_pending)
380 return -1; 354 return -1;
381 355
356 kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
382 result = apic_search_irr(apic); 357 result = apic_search_irr(apic);
383 ASSERT(result == -1 || result >= 16); 358 ASSERT(result == -1 || result >= 16);
384 359
@@ -431,14 +406,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
431} 406}
432 407
433static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 408static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
434 int vector, int level, int trig_mode); 409 int vector, int level, int trig_mode,
410 unsigned long *dest_map);
435 411
436int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) 412int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
413 unsigned long *dest_map)
437{ 414{
438 struct kvm_lapic *apic = vcpu->arch.apic; 415 struct kvm_lapic *apic = vcpu->arch.apic;
439 416
440 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, 417 return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
441 irq->level, irq->trig_mode); 418 irq->level, irq->trig_mode, dest_map);
442} 419}
443 420
444static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 421static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
@@ -505,6 +482,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
505 return result; 482 return result;
506} 483}
507 484
485void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
486{
487 struct kvm_lapic *apic = vcpu->arch.apic;
488 int i;
489
490 for (i = 0; i < 8; i++)
491 apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
492}
493
508static void apic_update_ppr(struct kvm_lapic *apic) 494static void apic_update_ppr(struct kvm_lapic *apic)
509{ 495{
510 u32 tpr, isrv, ppr, old_ppr; 496 u32 tpr, isrv, ppr, old_ppr;
@@ -611,7 +597,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
611} 597}
612 598
613bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 599bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
614 struct kvm_lapic_irq *irq, int *r) 600 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
615{ 601{
616 struct kvm_apic_map *map; 602 struct kvm_apic_map *map;
617 unsigned long bitmap = 1; 603 unsigned long bitmap = 1;
@@ -622,7 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
622 *r = -1; 608 *r = -1;
623 609
624 if (irq->shorthand == APIC_DEST_SELF) { 610 if (irq->shorthand == APIC_DEST_SELF) {
625 *r = kvm_apic_set_irq(src->vcpu, irq); 611 *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
626 return true; 612 return true;
627 } 613 }
628 614
@@ -667,7 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
667 continue; 653 continue;
668 if (*r < 0) 654 if (*r < 0)
669 *r = 0; 655 *r = 0;
670 *r += kvm_apic_set_irq(dst[i]->vcpu, irq); 656 *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
671 } 657 }
672 658
673 ret = true; 659 ret = true;
@@ -681,7 +667,8 @@ out:
681 * Return 1 if successfully added and 0 if discarded. 667 * Return 1 if successfully added and 0 if discarded.
682 */ 668 */
683static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 669static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
684 int vector, int level, int trig_mode) 670 int vector, int level, int trig_mode,
671 unsigned long *dest_map)
685{ 672{
686 int result = 0; 673 int result = 0;
687 struct kvm_vcpu *vcpu = apic->vcpu; 674 struct kvm_vcpu *vcpu = apic->vcpu;
@@ -694,24 +681,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
694 if (unlikely(!apic_enabled(apic))) 681 if (unlikely(!apic_enabled(apic)))
695 break; 682 break;
696 683
697 if (trig_mode) { 684 if (dest_map)
698 apic_debug("level trig mode for vector %d", vector); 685 __set_bit(vcpu->vcpu_id, dest_map);
699 apic_set_vector(vector, apic->regs + APIC_TMR);
700 } else
701 apic_clear_vector(vector, apic->regs + APIC_TMR);
702 686
703 result = !apic_test_and_set_irr(vector, apic); 687 if (kvm_x86_ops->deliver_posted_interrupt) {
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 688 result = 1;
705 trig_mode, vector, !result); 689 kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
706 if (!result) { 690 } else {
707 if (trig_mode) 691 result = !apic_test_and_set_irr(vector, apic);
708 apic_debug("level trig mode repeatedly for "
709 "vector %d", vector);
710 break;
711 }
712 692
713 kvm_make_request(KVM_REQ_EVENT, vcpu); 693 if (!result) {
714 kvm_vcpu_kick(vcpu); 694 if (trig_mode)
695 apic_debug("level trig mode repeatedly "
696 "for vector %d", vector);
697 goto out;
698 }
699
700 kvm_make_request(KVM_REQ_EVENT, vcpu);
701 kvm_vcpu_kick(vcpu);
702 }
703out:
704 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
705 trig_mode, vector, !result);
715 break; 706 break;
716 707
717 case APIC_DM_REMRD: 708 case APIC_DM_REMRD:
@@ -731,7 +722,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
731 case APIC_DM_INIT: 722 case APIC_DM_INIT:
732 if (!trig_mode || level) { 723 if (!trig_mode || level) {
733 result = 1; 724 result = 1;
734 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 725 /* assumes that there are only KVM_APIC_INIT/SIPI */
726 apic->pending_events = (1UL << KVM_APIC_INIT);
727 /* make sure pending_events is visible before sending
728 * the request */
729 smp_wmb();
735 kvm_make_request(KVM_REQ_EVENT, vcpu); 730 kvm_make_request(KVM_REQ_EVENT, vcpu);
736 kvm_vcpu_kick(vcpu); 731 kvm_vcpu_kick(vcpu);
737 } else { 732 } else {
@@ -743,13 +738,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
743 case APIC_DM_STARTUP: 738 case APIC_DM_STARTUP:
744 apic_debug("SIPI to vcpu %d vector 0x%02x\n", 739 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
745 vcpu->vcpu_id, vector); 740 vcpu->vcpu_id, vector);
746 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 741 result = 1;
747 result = 1; 742 apic->sipi_vector = vector;
748 vcpu->arch.sipi_vector = vector; 743 /* make sure sipi_vector is visible for the receiver */
749 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 744 smp_wmb();
750 kvm_make_request(KVM_REQ_EVENT, vcpu); 745 set_bit(KVM_APIC_SIPI, &apic->pending_events);
751 kvm_vcpu_kick(vcpu); 746 kvm_make_request(KVM_REQ_EVENT, vcpu);
752 } 747 kvm_vcpu_kick(vcpu);
753 break; 748 break;
754 749
755 case APIC_DM_EXTINT: 750 case APIC_DM_EXTINT:
@@ -782,7 +777,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
782 trigger_mode = IOAPIC_LEVEL_TRIG; 777 trigger_mode = IOAPIC_LEVEL_TRIG;
783 else 778 else
784 trigger_mode = IOAPIC_EDGE_TRIG; 779 trigger_mode = IOAPIC_EDGE_TRIG;
785 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 780 kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
786 } 781 }
787} 782}
788 783
@@ -848,7 +843,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
848 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 843 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
849 irq.vector); 844 irq.vector);
850 845
851 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 846 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
852} 847}
853 848
854static u32 apic_get_tmcct(struct kvm_lapic *apic) 849static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1484,7 +1479,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1484 vector = reg & APIC_VECTOR_MASK; 1479 vector = reg & APIC_VECTOR_MASK;
1485 mode = reg & APIC_MODE_MASK; 1480 mode = reg & APIC_MODE_MASK;
1486 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1481 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
1487 return __apic_accept_irq(apic, mode, vector, 1, trig_mode); 1482 return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
1483 NULL);
1488 } 1484 }
1489 return 0; 1485 return 0;
1490} 1486}
@@ -1654,6 +1650,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1654 apic->highest_isr_cache = -1; 1650 apic->highest_isr_cache = -1;
1655 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); 1651 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
1656 kvm_make_request(KVM_REQ_EVENT, vcpu); 1652 kvm_make_request(KVM_REQ_EVENT, vcpu);
1653 kvm_rtc_eoi_tracking_restore_one(vcpu);
1657} 1654}
1658 1655
1659void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1656void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1860,6 +1857,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1860 addr, sizeof(u8)); 1857 addr, sizeof(u8));
1861} 1858}
1862 1859
1860void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
1861{
1862 struct kvm_lapic *apic = vcpu->arch.apic;
1863 unsigned int sipi_vector;
1864
1865 if (!kvm_vcpu_has_lapic(vcpu))
1866 return;
1867
1868 if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
1869 kvm_lapic_reset(vcpu);
1870 kvm_vcpu_reset(vcpu);
1871 if (kvm_vcpu_is_bsp(apic->vcpu))
1872 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1873 else
1874 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
1875 }
1876 if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
1877 vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
1878 /* evaluate pending_events before reading the vector */
1879 smp_rmb();
1880 sipi_vector = apic->sipi_vector;
1881 pr_debug("vcpu %d received sipi with vector # %x\n",
1882 vcpu->vcpu_id, sipi_vector);
1883 kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
1884 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1885 }
1886}
1887
1863void kvm_lapic_init(void) 1888void kvm_lapic_init(void)
1864{ 1889{
1865 /* do not patch jump label more than once per second */ 1890 /* do not patch jump label more than once per second */
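The INIT/SIPI rework above replaces the mp_state-based hand-off with a pending_events bitmap: the sender stores sipi_vector, issues smp_wmb(), then sets KVM_APIC_SIPI, and kvm_apic_accept_events() clears the bit, issues smp_rmb(), then reads the vector. A minimal user-space model of that ordering contract, with C11 release/acquire atomics standing in for the kernel barriers (illustrative names, not kernel code):

/*
 * Minimal user-space model of the INIT/SIPI hand-off: the sender must
 * publish sipi_vector before the SIPI bit becomes visible, and the
 * receiver must observe the bit before it reads the vector.
 */
#include <stdatomic.h>
#include <stdio.h>

#define APIC_SIPI 1

struct lapic_model {
    _Atomic unsigned long pending_events;
    unsigned int sipi_vector;
};

/* Sender: models the APIC_DM_STARTUP branch in __apic_accept_irq(). */
static void send_sipi(struct lapic_model *apic, unsigned int vector)
{
    apic->sipi_vector = vector;
    /* release: publish sipi_vector before the SIPI bit */
    atomic_fetch_or_explicit(&apic->pending_events, 1ul << APIC_SIPI,
                             memory_order_release);
}

/* Receiver: models kvm_apic_accept_events(). */
static void accept_events(struct lapic_model *apic)
{
    /* acquire: clear the bit, then it is safe to read the vector */
    unsigned long ev = atomic_fetch_and_explicit(&apic->pending_events,
                                                 ~(1ul << APIC_SIPI),
                                                 memory_order_acquire);
    if (ev & (1ul << APIC_SIPI))
        printf("deliver SIPI, vector 0x%02x\n", apic->sipi_vector);
}

int main(void)
{
    struct lapic_model apic = { 0 };

    send_sipi(&apic, 0x9f);
    accept_events(&apic);
    return 0;
}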
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1676d34ddb4e..c730ac9fe801 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -5,6 +5,9 @@
5 5
6#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
7 7
8#define KVM_APIC_INIT 0
9#define KVM_APIC_SIPI 1
10
8struct kvm_timer { 11struct kvm_timer {
9 struct hrtimer timer; 12 struct hrtimer timer;
10 s64 period; /* unit: ns */ 13 s64 period; /* unit: ns */
@@ -32,6 +35,8 @@ struct kvm_lapic {
32 void *regs; 35 void *regs;
33 gpa_t vapic_addr; 36 gpa_t vapic_addr;
34 struct page *vapic_page; 37 struct page *vapic_page;
38 unsigned long pending_events;
39 unsigned int sipi_vector;
35}; 40};
36int kvm_create_lapic(struct kvm_vcpu *vcpu); 41int kvm_create_lapic(struct kvm_vcpu *vcpu);
37void kvm_free_lapic(struct kvm_vcpu *vcpu); 42void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
39int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); 44int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
40int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); 45int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
41int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); 46int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
47void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
42void kvm_lapic_reset(struct kvm_vcpu *vcpu); 48void kvm_lapic_reset(struct kvm_vcpu *vcpu);
43u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 49u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
44void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 50void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -47,13 +53,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
47u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 53u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
48void kvm_apic_set_version(struct kvm_vcpu *vcpu); 54void kvm_apic_set_version(struct kvm_vcpu *vcpu);
49 55
56void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
57void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
50int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 58int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
51int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 59int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
52int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 60int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
61 unsigned long *dest_map);
53int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 62int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
54 63
55bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 64bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
56 struct kvm_lapic_irq *irq, int *r); 65 struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
57 66
58u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 67u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
59void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 68void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
@@ -154,8 +163,11 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
154 return ldr & map->lid_mask; 163 return ldr & map->lid_mask;
155} 164}
156 165
157void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, 166static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
158 struct kvm_lapic_irq *irq, 167{
159 u64 *eoi_bitmap); 168 return vcpu->arch.apic->pending_events;
169}
170
171bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
160 172
161#endif 173#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca358108a..004cc87b781c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,8 +199,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
199 199
200static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) 200static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
201{ 201{
202 struct kvm_mmu_page *sp = page_header(__pa(sptep));
203
202 access &= ACC_WRITE_MASK | ACC_USER_MASK; 204 access &= ACC_WRITE_MASK | ACC_USER_MASK;
203 205
206 sp->mmio_cached = true;
204 trace_mark_mmio_spte(sptep, gfn, access); 207 trace_mark_mmio_spte(sptep, gfn, access);
205 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); 208 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
206} 209}
@@ -1502,6 +1505,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1502 u64 *parent_pte, int direct) 1505 u64 *parent_pte, int direct)
1503{ 1506{
1504 struct kvm_mmu_page *sp; 1507 struct kvm_mmu_page *sp;
1508
1505 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1509 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
1506 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1510 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1507 if (!direct) 1511 if (!direct)
@@ -1644,16 +1648,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1644static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1648static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1645 struct list_head *invalid_list); 1649 struct list_head *invalid_list);
1646 1650
1647#define for_each_gfn_sp(kvm, sp, gfn) \ 1651#define for_each_gfn_sp(_kvm, _sp, _gfn) \
1648 hlist_for_each_entry(sp, \ 1652 hlist_for_each_entry(_sp, \
1649 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1653 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
1650 if ((sp)->gfn != (gfn)) {} else 1654 if ((_sp)->gfn != (_gfn)) {} else
1651 1655
1652#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ 1656#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
1653 hlist_for_each_entry(sp, \ 1657 for_each_gfn_sp(_kvm, _sp, _gfn) \
1654 &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1658 if ((_sp)->role.direct || (_sp)->role.invalid) {} else
1655 if ((sp)->gfn != (gfn) || (sp)->role.direct || \
1656 (sp)->role.invalid) {} else
1657 1659
1658/* @sp->gfn should be write-protected at the call site */ 1660/* @sp->gfn should be write-protected at the call site */
1659static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1661static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2089,7 +2091,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2089static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2091static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2090 struct list_head *invalid_list) 2092 struct list_head *invalid_list)
2091{ 2093{
2092 struct kvm_mmu_page *sp; 2094 struct kvm_mmu_page *sp, *nsp;
2093 2095
2094 if (list_empty(invalid_list)) 2096 if (list_empty(invalid_list))
2095 return; 2097 return;
@@ -2106,11 +2108,25 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2106 */ 2108 */
2107 kvm_flush_remote_tlbs(kvm); 2109 kvm_flush_remote_tlbs(kvm);
2108 2110
2109 do { 2111 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2110 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
2111 WARN_ON(!sp->role.invalid || sp->root_count); 2112 WARN_ON(!sp->role.invalid || sp->root_count);
2112 kvm_mmu_free_page(sp); 2113 kvm_mmu_free_page(sp);
2113 } while (!list_empty(invalid_list)); 2114 }
2115}
2116
2117static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2118 struct list_head *invalid_list)
2119{
2120 struct kvm_mmu_page *sp;
2121
2122 if (list_empty(&kvm->arch.active_mmu_pages))
2123 return false;
2124
2125 sp = list_entry(kvm->arch.active_mmu_pages.prev,
2126 struct kvm_mmu_page, link);
2127 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2128
2129 return true;
2114} 2130}
2115 2131
2116/* 2132/*
@@ -2120,23 +2136,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2120void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) 2136void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2121{ 2137{
2122 LIST_HEAD(invalid_list); 2138 LIST_HEAD(invalid_list);
2123 /*
2124 * If we set the number of mmu pages to be smaller be than the
2125 * number of actived pages , we must to free some mmu pages before we
2126 * change the value
2127 */
2128 2139
2129 spin_lock(&kvm->mmu_lock); 2140 spin_lock(&kvm->mmu_lock);
2130 2141
2131 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2142 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2132 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && 2143 /* Need to free some mmu pages to achieve the goal. */
2133 !list_empty(&kvm->arch.active_mmu_pages)) { 2144 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2134 struct kvm_mmu_page *page; 2145 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2146 break;
2135 2147
2136 page = container_of(kvm->arch.active_mmu_pages.prev,
2137 struct kvm_mmu_page, link);
2138 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
2139 }
2140 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2148 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2141 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2149 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2142 } 2150 }
@@ -2794,6 +2802,7 @@ exit:
2794 2802
2795static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2803static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2796 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2804 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2805static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
2797 2806
2798static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2807static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2799 gfn_t gfn, bool prefault) 2808 gfn_t gfn, bool prefault)
@@ -2835,7 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
2835 spin_lock(&vcpu->kvm->mmu_lock); 2844 spin_lock(&vcpu->kvm->mmu_lock);
2836 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 2845 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
2837 goto out_unlock; 2846 goto out_unlock;
2838 kvm_mmu_free_some_pages(vcpu); 2847 make_mmu_pages_available(vcpu);
2839 if (likely(!force_pt_level)) 2848 if (likely(!force_pt_level))
2840 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 2849 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2841 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 2850 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2913,7 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2913 2922
2914 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2923 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2915 spin_lock(&vcpu->kvm->mmu_lock); 2924 spin_lock(&vcpu->kvm->mmu_lock);
2916 kvm_mmu_free_some_pages(vcpu); 2925 make_mmu_pages_available(vcpu);
2917 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 2926 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2918 1, ACC_ALL, NULL); 2927 1, ACC_ALL, NULL);
2919 ++sp->root_count; 2928 ++sp->root_count;
@@ -2925,7 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2925 2934
2926 ASSERT(!VALID_PAGE(root)); 2935 ASSERT(!VALID_PAGE(root));
2927 spin_lock(&vcpu->kvm->mmu_lock); 2936 spin_lock(&vcpu->kvm->mmu_lock);
2928 kvm_mmu_free_some_pages(vcpu); 2937 make_mmu_pages_available(vcpu);
2929 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 2938 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2930 i << 30, 2939 i << 30,
2931 PT32_ROOT_LEVEL, 1, ACC_ALL, 2940 PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2964,7 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2964 ASSERT(!VALID_PAGE(root)); 2973 ASSERT(!VALID_PAGE(root));
2965 2974
2966 spin_lock(&vcpu->kvm->mmu_lock); 2975 spin_lock(&vcpu->kvm->mmu_lock);
2967 kvm_mmu_free_some_pages(vcpu); 2976 make_mmu_pages_available(vcpu);
2968 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 2977 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2969 0, ACC_ALL, NULL); 2978 0, ACC_ALL, NULL);
2970 root = __pa(sp->spt); 2979 root = __pa(sp->spt);
@@ -2998,7 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2998 return 1; 3007 return 1;
2999 } 3008 }
3000 spin_lock(&vcpu->kvm->mmu_lock); 3009 spin_lock(&vcpu->kvm->mmu_lock);
3001 kvm_mmu_free_some_pages(vcpu); 3010 make_mmu_pages_available(vcpu);
3002 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 3011 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
3003 PT32_ROOT_LEVEL, 0, 3012 PT32_ROOT_LEVEL, 0,
3004 ACC_ALL, NULL); 3013 ACC_ALL, NULL);
@@ -3304,7 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
3304 spin_lock(&vcpu->kvm->mmu_lock); 3313 spin_lock(&vcpu->kvm->mmu_lock);
3305 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3314 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3306 goto out_unlock; 3315 goto out_unlock;
3307 kvm_mmu_free_some_pages(vcpu); 3316 make_mmu_pages_available(vcpu);
3308 if (likely(!force_pt_level)) 3317 if (likely(!force_pt_level))
3309 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3318 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
3310 r = __direct_map(vcpu, gpa, write, map_writable, 3319 r = __direct_map(vcpu, gpa, write, map_writable,
@@ -4006,17 +4015,17 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
4006} 4015}
4007EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 4016EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
4008 4017
4009void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 4018static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
4010{ 4019{
4011 LIST_HEAD(invalid_list); 4020 LIST_HEAD(invalid_list);
4012 4021
4013 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && 4022 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
4014 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 4023 return;
4015 struct kvm_mmu_page *sp; 4024
4025 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
4026 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
4027 break;
4016 4028
4017 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
4018 struct kvm_mmu_page, link);
4019 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
4020 ++vcpu->kvm->stat.mmu_recycled; 4029 ++vcpu->kvm->stat.mmu_recycled;
4021 } 4030 }
4022 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 4031 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -4185,17 +4194,22 @@ restart:
4185 spin_unlock(&kvm->mmu_lock); 4194 spin_unlock(&kvm->mmu_lock);
4186} 4195}
4187 4196
4188static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 4197void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
4189 struct list_head *invalid_list)
4190{ 4198{
4191 struct kvm_mmu_page *page; 4199 struct kvm_mmu_page *sp, *node;
4200 LIST_HEAD(invalid_list);
4192 4201
4193 if (list_empty(&kvm->arch.active_mmu_pages)) 4202 spin_lock(&kvm->mmu_lock);
4194 return; 4203restart:
4204 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
4205 if (!sp->mmio_cached)
4206 continue;
4207 if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
4208 goto restart;
4209 }
4195 4210
4196 page = container_of(kvm->arch.active_mmu_pages.prev, 4211 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4197 struct kvm_mmu_page, link); 4212 spin_unlock(&kvm->mmu_lock);
4198 kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
4199} 4213}
4200 4214
4201static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) 4215static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -4232,7 +4246,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
4232 idx = srcu_read_lock(&kvm->srcu); 4246 idx = srcu_read_lock(&kvm->srcu);
4233 spin_lock(&kvm->mmu_lock); 4247 spin_lock(&kvm->mmu_lock);
4234 4248
4235 kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); 4249 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
4236 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4250 kvm_mmu_commit_zap_page(kvm, &invalid_list);
4237 4251
4238 spin_unlock(&kvm->mmu_lock); 4252 spin_unlock(&kvm->mmu_lock);
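prepare_zap_oldest_mmu_page() and make_mmu_pages_available() above fold the old open-coded loops into one helper that zaps from the tail of active_mmu_pages until the free-page budget is met. A stripped-down user-space sketch of that eviction pattern (plain doubly-linked list, made-up constants, not the KVM implementation):

/*
 * Stripped-down model of the zap-oldest-until-threshold pattern: pages
 * sit on a list in rough age order (head = newest, tail = oldest) and
 * the oldest entries are freed until a refill target is reached.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_PAGES    8
#define REFILL_PAGES 3

struct page {
    int id;
    struct page *prev, *next;
};

static struct page *head, *tail;
static int used_pages;

static int available_pages(void)
{
    return used_pages < MAX_PAGES ? MAX_PAGES - used_pages : 0;
}

static bool zap_oldest(void)
{
    struct page *victim = tail;

    if (!victim)
        return false;
    tail = victim->prev;
    if (tail)
        tail->next = NULL;
    else
        head = NULL;
    used_pages--;
    printf("zapped page %d\n", victim->id);
    free(victim);
    return true;
}

static void make_pages_available(void)
{
    while (available_pages() < REFILL_PAGES)
        if (!zap_oldest())
            break;
}

int main(void)
{
    for (int i = 0; i < MAX_PAGES; i++) {
        struct page *p = calloc(1, sizeof(*p));
        p->id = i;
        p->next = head;
        if (head)
            head->prev = p;
        head = p;
        if (!tail)
            tail = p;
        used_pages++;
    }
    make_pages_available();   /* frees the oldest pages: 0, 1, 2 */
    return 0;
}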
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 69871080e866..2adcbc2cac6d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,14 +57,11 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
57 57
58static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 58static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
59{ 59{
60 return kvm->arch.n_max_mmu_pages - 60 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
61 kvm->arch.n_used_mmu_pages; 61 return kvm->arch.n_max_mmu_pages -
62} 62 kvm->arch.n_used_mmu_pages;
63 63
64static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 64 return 0;
65{
66 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
67 __kvm_mmu_free_some_pages(vcpu);
68} 65}
69 66
70static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) 67static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
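The rewritten kvm_mmu_available_pages() clamps at zero instead of returning the raw difference; presumably this protects the new make_mmu_pages_available() path when n_used_mmu_pages transiently exceeds n_max_mmu_pages, where the old unsigned subtraction would wrap to a huge count. A small stand-alone sketch of the difference (not kernel code):

#include <stdio.h>

static unsigned int avail_old(unsigned int max, unsigned int used)
{
	return max - used;                   /* wraps when used > max */
}

static unsigned int avail_new(unsigned int max, unsigned int used)
{
	return max > used ? max - used : 0;  /* clamps at zero instead */
}

int main(void)
{
	/* e.g. the per-VM page limit was pushed below the current usage */
	printf("%u vs %u\n", avail_old(100, 120), avail_new(100, 120));
	return 0;
}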
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5bd550e..da20860b457a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -627,7 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
627 goto out_unlock; 627 goto out_unlock;
628 628
629 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 629 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
630 kvm_mmu_free_some_pages(vcpu); 630 make_mmu_pages_available(vcpu);
631 if (!force_pt_level) 631 if (!force_pt_level)
632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault, 633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cfc258a6bf97..c53e797e7369 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
360 return 1; 360 return 1;
361} 361}
362 362
363int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 363int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
364{ 364{
365 struct kvm_pmu *pmu = &vcpu->arch.pmu; 365 struct kvm_pmu *pmu = &vcpu->arch.pmu;
366 struct kvm_pmc *pmc; 366 struct kvm_pmc *pmc;
367 u32 index = msr_info->index;
368 u64 data = msr_info->data;
367 369
368 switch (index) { 370 switch (index) {
369 case MSR_CORE_PERF_FIXED_CTR_CTRL: 371 case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
375 } 377 }
376 break; 378 break;
377 case MSR_CORE_PERF_GLOBAL_STATUS: 379 case MSR_CORE_PERF_GLOBAL_STATUS:
380 if (msr_info->host_initiated) {
381 pmu->global_status = data;
382 return 0;
383 }
378 break; /* RO MSR */ 384 break; /* RO MSR */
379 case MSR_CORE_PERF_GLOBAL_CTRL: 385 case MSR_CORE_PERF_GLOBAL_CTRL:
380 if (pmu->global_ctrl == data) 386 if (pmu->global_ctrl == data)
@@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
386 break; 392 break;
387 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 393 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
388 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { 394 if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
389 pmu->global_status &= ~data; 395 if (!msr_info->host_initiated)
396 pmu->global_status &= ~data;
390 pmu->global_ovf_ctrl = data; 397 pmu->global_ovf_ctrl = data;
391 return 0; 398 return 0;
392 } 399 }
@@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
394 default: 401 default:
395 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || 402 if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
396 (pmc = get_fixed_pmc(pmu, index))) { 403 (pmc = get_fixed_pmc(pmu, index))) {
397 data = (s64)(s32)data; 404 if (!msr_info->host_initiated)
405 data = (s64)(s32)data;
398 pmc->counter += data - read_pmc(pmc); 406 pmc->counter += data - read_pmc(pmc);
399 return 0; 407 return 0;
400 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { 408 } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
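The kvm_pmu_set_msr() hunks above thread struct msr_data through so that host-initiated writes (e.g. state restore via KVM_SET_MSRS) can be told apart from guest WRMSR: the host may write the otherwise read-only MSR_CORE_PERF_GLOBAL_STATUS, its counter writes are not sign-extended from 32 bits, and its writes to GLOBAL_OVF_CTRL do not clear status bits. A stand-alone sketch of the first of those cases (simplified, not the kernel's code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for struct msr_data as used on the x86 KVM MSR paths. */
struct msr_data {
	bool host_initiated;   /* true for ioctl-driven writes, false for guest WRMSR */
	uint32_t index;
	uint64_t data;
};

static uint64_t global_status;

/* Only the host may write the architecturally read-only status MSR. */
static int set_global_status(const struct msr_data *msr)
{
	if (!msr->host_initiated)
		return 1;              /* guest write: treated as an error (RO MSR) */
	global_status = msr->data;     /* host restore path: accept verbatim */
	return 0;
}

int main(void)
{
	struct msr_data guest = { .host_initiated = false, .data = 1 };
	struct msr_data host  = { .host_initiated = true,  .data = 1 };

	printf("guest write -> %d, host write -> %d\n",
	       set_global_status(&guest), set_global_status(&host));
	return 0;
}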
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7d39d70647e3..a14a6eaf871d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1131,17 +1131,11 @@ static void init_vmcb(struct vcpu_svm *svm)
1131 init_seg(&save->gs); 1131 init_seg(&save->gs);
1132 1132
1133 save->cs.selector = 0xf000; 1133 save->cs.selector = 0xf000;
1134 save->cs.base = 0xffff0000;
1134 /* Executable/Readable Code Segment */ 1135 /* Executable/Readable Code Segment */
1135 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1136 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1136 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1137 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1137 save->cs.limit = 0xffff; 1138 save->cs.limit = 0xffff;
1138 /*
1139 * cs.base should really be 0xffff0000, but vmx can't handle that, so
1140 * be consistent with it.
1141 *
1142 * Replace when we have real mode working for vmx.
1143 */
1144 save->cs.base = 0xf0000;
1145 1139
1146 save->gdtr.limit = 0xffff; 1140 save->gdtr.limit = 0xffff;
1147 save->idtr.limit = 0xffff; 1141 save->idtr.limit = 0xffff;
@@ -1191,7 +1185,7 @@ static void init_vmcb(struct vcpu_svm *svm)
1191 enable_gif(svm); 1185 enable_gif(svm);
1192} 1186}
1193 1187
1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1188static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
1195{ 1189{
1196 struct vcpu_svm *svm = to_svm(vcpu); 1190 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy; 1191 u32 dummy;
@@ -1199,16 +1193,8 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1199 1193
1200 init_vmcb(svm); 1194 init_vmcb(svm);
1201 1195
1202 if (!kvm_vcpu_is_bsp(vcpu)) {
1203 kvm_rip_write(vcpu, 0);
1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1206 }
1207
1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1196 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1197 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1210
1211 return 0;
1212} 1198}
1213 1199
1214static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 1200static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -3487,7 +3473,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3487 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3473 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3488 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3474 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3489 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3475 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3490 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3476 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3491 "exit_code 0x%x\n", 3477 "exit_code 0x%x\n",
3492 __func__, svm->vmcb->control.exit_int_info, 3478 __func__, svm->vmcb->control.exit_int_info,
3493 exit_code); 3479 exit_code);
@@ -3591,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
3591 return; 3577 return;
3592} 3578}
3593 3579
3580static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
3581{
3582 return;
3583}
3584
3594static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3585static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3595{ 3586{
3596 struct vcpu_svm *svm = to_svm(vcpu); 3587 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3641,7 +3632,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3641 return ret; 3632 return ret;
3642} 3633}
3643 3634
3644static void enable_irq_window(struct kvm_vcpu *vcpu) 3635static int enable_irq_window(struct kvm_vcpu *vcpu)
3645{ 3636{
3646 struct vcpu_svm *svm = to_svm(vcpu); 3637 struct vcpu_svm *svm = to_svm(vcpu);
3647 3638
@@ -3655,15 +3646,16 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
3655 svm_set_vintr(svm); 3646 svm_set_vintr(svm);
3656 svm_inject_irq(svm, 0x0); 3647 svm_inject_irq(svm, 0x0);
3657 } 3648 }
3649 return 0;
3658} 3650}
3659 3651
3660static void enable_nmi_window(struct kvm_vcpu *vcpu) 3652static int enable_nmi_window(struct kvm_vcpu *vcpu)
3661{ 3653{
3662 struct vcpu_svm *svm = to_svm(vcpu); 3654 struct vcpu_svm *svm = to_svm(vcpu);
3663 3655
3664 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) 3656 if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3665 == HF_NMI_MASK) 3657 == HF_NMI_MASK)
3666 return; /* IRET will cause a vm exit */ 3658 return 0; /* IRET will cause a vm exit */
3667 3659
3668 /* 3660 /*
3669 * Something prevents NMI from been injected. Single step over possible 3661 * Something prevents NMI from been injected. Single step over possible
@@ -3672,6 +3664,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3672 svm->nmi_singlestep = true; 3664 svm->nmi_singlestep = true;
3673 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3665 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3674 update_db_bp_intercept(vcpu); 3666 update_db_bp_intercept(vcpu);
3667 return 0;
3675} 3668}
3676 3669
3677static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3670static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
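enable_irq_window()/enable_nmi_window() now return int on both SVM and VMX: 0 means the window request was armed, and a negative value (the VMX version later in this patch returns -EBUSY while an L2 guest still has to run) tells common code to force an immediate exit itself, replacing the KVM_REQ_IMMEDIATE_EXIT request previously made from inside the backend. A hedged sketch of how a caller might consume that contract (hypothetical wrappers, not the actual vcpu_enter_guest() code):

/* Hypothetical wrappers: a non-zero return from the backend means
 * "busy, force an immediate exit and retry after it". */
static bool open_irq_window_or_force_exit(struct kvm_vcpu *vcpu)
{
	return kvm_x86_ops->enable_irq_window(vcpu) != 0;
}

static bool open_nmi_window_or_force_exit(struct kvm_vcpu *vcpu)
{
	return kvm_x86_ops->enable_nmi_window(vcpu) != 0;
}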
@@ -4247,6 +4240,11 @@ out:
4247 return ret; 4240 return ret;
4248} 4241}
4249 4242
4243static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4244{
4245 local_irq_enable();
4246}
4247
4250static struct kvm_x86_ops svm_x86_ops = { 4248static struct kvm_x86_ops svm_x86_ops = {
4251 .cpu_has_kvm_support = has_svm, 4249 .cpu_has_kvm_support = has_svm,
4252 .disabled_by_bios = is_disabled, 4250 .disabled_by_bios = is_disabled,
@@ -4314,6 +4312,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4314 .vm_has_apicv = svm_vm_has_apicv, 4312 .vm_has_apicv = svm_vm_has_apicv,
4315 .load_eoi_exitmap = svm_load_eoi_exitmap, 4313 .load_eoi_exitmap = svm_load_eoi_exitmap,
4316 .hwapic_isr_update = svm_hwapic_isr_update, 4314 .hwapic_isr_update = svm_hwapic_isr_update,
4315 .sync_pir_to_irr = svm_sync_pir_to_irr,
4317 4316
4318 .set_tss_addr = svm_set_tss_addr, 4317 .set_tss_addr = svm_set_tss_addr,
4319 .get_tdp_level = get_npt_level, 4318 .get_tdp_level = get_npt_level,
@@ -4342,6 +4341,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4342 .set_tdp_cr3 = set_tdp_cr3, 4341 .set_tdp_cr3 = set_tdp_cr3,
4343 4342
4344 .check_intercept = svm_check_intercept, 4343 .check_intercept = svm_check_intercept,
4344 .handle_external_intr = svm_handle_external_intr,
4345}; 4345};
4346 4346
4347static int __init svm_init(void) 4347static int __init svm_init(void)
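The two new kvm_x86_ops entries wired up above deserve a word: svm_handle_external_intr() only re-enables interrupts because on SVM the host can take the pending interrupt through its normal IDT path, and svm_sync_pir_to_irr() is an empty stub because SVM has no posted-interrupt descriptor; supplying stubs lets common x86 code call both hooks unconditionally. Restated with comments (same bodies as the hunks above):

static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	/* No posted-interrupt descriptor on SVM: nothing to merge into IRR. */
}

static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
{
	/* The exit left the interrupt pending; enabling IRQs lets the host's
	 * normal IDT handler service it. */
	local_irq_enable();
}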
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 867b81037f96..25a791ed21c8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,8 +84,11 @@ module_param(vmm_exclusive, bool, S_IRUGO);
84static bool __read_mostly fasteoi = 1; 84static bool __read_mostly fasteoi = 1;
85module_param(fasteoi, bool, S_IRUGO); 85module_param(fasteoi, bool, S_IRUGO);
86 86
87static bool __read_mostly enable_apicv_reg_vid; 87static bool __read_mostly enable_apicv = 1;
88module_param(enable_apicv, bool, S_IRUGO);
88 89
90static bool __read_mostly enable_shadow_vmcs = 1;
91module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
89/* 92/*
90 * If nested=1, nested virtualization is supported, i.e., guests may use 93 * If nested=1, nested virtualization is supported, i.e., guests may use
91 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 94 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -298,7 +301,8 @@ struct __packed vmcs12 {
298 u32 guest_activity_state; 301 u32 guest_activity_state;
299 u32 guest_sysenter_cs; 302 u32 guest_sysenter_cs;
300 u32 host_ia32_sysenter_cs; 303 u32 host_ia32_sysenter_cs;
301 u32 padding32[8]; /* room for future expansion */ 304 u32 vmx_preemption_timer_value;
305 u32 padding32[7]; /* room for future expansion */
302 u16 virtual_processor_id; 306 u16 virtual_processor_id;
303 u16 guest_es_selector; 307 u16 guest_es_selector;
304 u16 guest_cs_selector; 308 u16 guest_cs_selector;
@@ -351,6 +355,12 @@ struct nested_vmx {
351 /* The host-usable pointer to the above */ 355 /* The host-usable pointer to the above */
352 struct page *current_vmcs12_page; 356 struct page *current_vmcs12_page;
353 struct vmcs12 *current_vmcs12; 357 struct vmcs12 *current_vmcs12;
358 struct vmcs *current_shadow_vmcs;
359 /*
360 * Indicates if the shadow vmcs must be updated with the
 361 * data held by vmcs12
362 */
363 bool sync_shadow_vmcs;
354 364
355 /* vmcs02_list cache of VMCSs recently used to run L2 guests */ 365 /* vmcs02_list cache of VMCSs recently used to run L2 guests */
356 struct list_head vmcs02_pool; 366 struct list_head vmcs02_pool;
@@ -365,6 +375,31 @@ struct nested_vmx {
365 struct page *apic_access_page; 375 struct page *apic_access_page;
366}; 376};
367 377
378#define POSTED_INTR_ON 0
379/* Posted-Interrupt Descriptor */
380struct pi_desc {
381 u32 pir[8]; /* Posted interrupt requested */
382 u32 control; /* bit 0 of control is outstanding notification bit */
383 u32 rsvd[7];
384} __aligned(64);
385
386static bool pi_test_and_set_on(struct pi_desc *pi_desc)
387{
388 return test_and_set_bit(POSTED_INTR_ON,
389 (unsigned long *)&pi_desc->control);
390}
391
392static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
393{
394 return test_and_clear_bit(POSTED_INTR_ON,
395 (unsigned long *)&pi_desc->control);
396}
397
398static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
399{
400 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
401}
402
368struct vcpu_vmx { 403struct vcpu_vmx {
369 struct kvm_vcpu vcpu; 404 struct kvm_vcpu vcpu;
370 unsigned long host_rsp; 405 unsigned long host_rsp;
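struct pi_desc added above mirrors the hardware posted-interrupt descriptor: a 64-byte-aligned block where pir[8] is a 256-bit bitmap with one bit per interrupt vector, and bit 0 of control is the outstanding-notification (ON) bit; the helpers just apply the generic atomic bitops to it. A stand-alone sketch of the vector-to-word/bit mapping behind pi_test_and_set_pir() (non-atomic here for brevity):

#include <stdint.h>
#include <stdio.h>

/* 256 vectors map onto 8 x 32-bit words, the same layout idea as pi_desc.pir[8]. */
static void pir_set(uint32_t pir[8], unsigned int vector)
{
	pir[vector / 32] |= 1u << (vector % 32);   /* the kernel uses atomic set_bit */
}

int main(void)
{
	uint32_t pir[8] = { 0 };

	pir_set(pir, 0x20);                 /* vector 32 lands in word 1, bit 0 */
	printf("pir[1] = %#x\n", pir[1]);   /* prints 0x1 */
	return 0;
}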
@@ -377,6 +412,7 @@ struct vcpu_vmx {
377 struct shared_msr_entry *guest_msrs; 412 struct shared_msr_entry *guest_msrs;
378 int nmsrs; 413 int nmsrs;
379 int save_nmsrs; 414 int save_nmsrs;
415 unsigned long host_idt_base;
380#ifdef CONFIG_X86_64 416#ifdef CONFIG_X86_64
381 u64 msr_host_kernel_gs_base; 417 u64 msr_host_kernel_gs_base;
382 u64 msr_guest_kernel_gs_base; 418 u64 msr_guest_kernel_gs_base;
@@ -428,6 +464,9 @@ struct vcpu_vmx {
428 464
429 bool rdtscp_enabled; 465 bool rdtscp_enabled;
430 466
467 /* Posted interrupt descriptor */
468 struct pi_desc pi_desc;
469
431 /* Support for a guest hypervisor (nested VMX) */ 470 /* Support for a guest hypervisor (nested VMX) */
432 struct nested_vmx nested; 471 struct nested_vmx nested;
433}; 472};
@@ -451,6 +490,64 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
451#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 490#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
452 [number##_HIGH] = VMCS12_OFFSET(name)+4 491 [number##_HIGH] = VMCS12_OFFSET(name)+4
453 492
493
494static const unsigned long shadow_read_only_fields[] = {
495 /*
496 * We do NOT shadow fields that are modified when L0
497 * traps and emulates any vmx instruction (e.g. VMPTRLD,
498 * VMXON...) executed by L1.
499 * For example, VM_INSTRUCTION_ERROR is read
500 * by L1 if a vmx instruction fails (part of the error path).
501 * Note the code assumes this logic. If for some reason
502 * we start shadowing these fields then we need to
503 * force a shadow sync when L0 emulates vmx instructions
504 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
505 * by nested_vmx_failValid)
506 */
507 VM_EXIT_REASON,
508 VM_EXIT_INTR_INFO,
509 VM_EXIT_INSTRUCTION_LEN,
510 IDT_VECTORING_INFO_FIELD,
511 IDT_VECTORING_ERROR_CODE,
512 VM_EXIT_INTR_ERROR_CODE,
513 EXIT_QUALIFICATION,
514 GUEST_LINEAR_ADDRESS,
515 GUEST_PHYSICAL_ADDRESS
516};
517static const int max_shadow_read_only_fields =
518 ARRAY_SIZE(shadow_read_only_fields);
519
520static const unsigned long shadow_read_write_fields[] = {
521 GUEST_RIP,
522 GUEST_RSP,
523 GUEST_CR0,
524 GUEST_CR3,
525 GUEST_CR4,
526 GUEST_INTERRUPTIBILITY_INFO,
527 GUEST_RFLAGS,
528 GUEST_CS_SELECTOR,
529 GUEST_CS_AR_BYTES,
530 GUEST_CS_LIMIT,
531 GUEST_CS_BASE,
532 GUEST_ES_BASE,
533 CR0_GUEST_HOST_MASK,
534 CR0_READ_SHADOW,
535 CR4_READ_SHADOW,
536 TSC_OFFSET,
537 EXCEPTION_BITMAP,
538 CPU_BASED_VM_EXEC_CONTROL,
539 VM_ENTRY_EXCEPTION_ERROR_CODE,
540 VM_ENTRY_INTR_INFO_FIELD,
541 VM_ENTRY_INSTRUCTION_LEN,
542 VM_ENTRY_EXCEPTION_ERROR_CODE,
543 HOST_FS_BASE,
544 HOST_GS_BASE,
545 HOST_FS_SELECTOR,
546 HOST_GS_SELECTOR
547};
548static const int max_shadow_read_write_fields =
549 ARRAY_SIZE(shadow_read_write_fields);
550
454static const unsigned short vmcs_field_to_offset_table[] = { 551static const unsigned short vmcs_field_to_offset_table[] = {
455 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 552 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
456 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 553 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
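The two field tables above drive shadow-VMCS syncing: the read-only exit-information fields are only ever copied from the software vmcs12 into the shadow VMCS (so L1 can VMREAD them without an exit), while the read-write fields are synced in both directions; copy_shadow_to_vmcs12(), whose start appears at the end of this diff, walks the read-write table in the shadow-to-vmcs12 direction. A rough sketch of that table-driven walk, assuming the shadow VMCS is the currently loaded one and glossing over field widths:

/* Sketch only: vmcs_read64() stands in for the width-aware vmcs accessors
 * the real code selects via vmcs_field_type(). */
static void sync_rw_fields_to_vmcs12(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		unsigned long field = shadow_read_write_fields[i];
		u64 value = vmcs_read64(field);       /* read from the shadow VMCS */

		vmcs12_write_any(vcpu, field, value); /* mirror into vmcs12 */
	}
}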
@@ -537,6 +634,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
537 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), 634 FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
538 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), 635 FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
539 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), 636 FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
637 FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
540 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), 638 FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
541 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), 639 FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
542 FIELD(CR0_READ_SHADOW, cr0_read_shadow), 640 FIELD(CR0_READ_SHADOW, cr0_read_shadow),
@@ -624,6 +722,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg); 722 struct kvm_segment *var, int seg);
625static bool guest_state_valid(struct kvm_vcpu *vcpu); 723static bool guest_state_valid(struct kvm_vcpu *vcpu);
626static u32 vmx_segment_access_rights(struct kvm_segment *var); 724static u32 vmx_segment_access_rights(struct kvm_segment *var);
725static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
726static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
727static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
627 728
628static DEFINE_PER_CPU(struct vmcs *, vmxarea); 729static DEFINE_PER_CPU(struct vmcs *, vmxarea);
629static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 730static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -640,6 +741,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
640static unsigned long *vmx_msr_bitmap_longmode; 741static unsigned long *vmx_msr_bitmap_longmode;
641static unsigned long *vmx_msr_bitmap_legacy_x2apic; 742static unsigned long *vmx_msr_bitmap_legacy_x2apic;
642static unsigned long *vmx_msr_bitmap_longmode_x2apic; 743static unsigned long *vmx_msr_bitmap_longmode_x2apic;
744static unsigned long *vmx_vmread_bitmap;
745static unsigned long *vmx_vmwrite_bitmap;
643 746
644static bool cpu_has_load_ia32_efer; 747static bool cpu_has_load_ia32_efer;
645static bool cpu_has_load_perf_global_ctrl; 748static bool cpu_has_load_perf_global_ctrl;
@@ -782,6 +885,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
782 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 885 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
783} 886}
784 887
888static inline bool cpu_has_vmx_posted_intr(void)
889{
890 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
891}
892
893static inline bool cpu_has_vmx_apicv(void)
894{
895 return cpu_has_vmx_apic_register_virt() &&
896 cpu_has_vmx_virtual_intr_delivery() &&
897 cpu_has_vmx_posted_intr();
898}
899
785static inline bool cpu_has_vmx_flexpriority(void) 900static inline bool cpu_has_vmx_flexpriority(void)
786{ 901{
787 return cpu_has_vmx_tpr_shadow() && 902 return cpu_has_vmx_tpr_shadow() &&
@@ -895,6 +1010,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void)
895 SECONDARY_EXEC_WBINVD_EXITING; 1010 SECONDARY_EXEC_WBINVD_EXITING;
896} 1011}
897 1012
1013static inline bool cpu_has_vmx_shadow_vmcs(void)
1014{
1015 u64 vmx_msr;
1016 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1017 /* check if the cpu supports writing r/o exit information fields */
1018 if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1019 return false;
1020
1021 return vmcs_config.cpu_based_2nd_exec_ctrl &
1022 SECONDARY_EXEC_SHADOW_VMCS;
1023}
1024
898static inline bool report_flexpriority(void) 1025static inline bool report_flexpriority(void)
899{ 1026{
900 return flexpriority_enabled; 1027 return flexpriority_enabled;
@@ -1790,7 +1917,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1790 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1917 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1791 1918
1792 if (nr == PF_VECTOR && is_guest_mode(vcpu) && 1919 if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
1793 nested_pf_handled(vcpu)) 1920 !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
1794 return; 1921 return;
1795 1922
1796 if (has_error_code) { 1923 if (has_error_code) {
@@ -2022,6 +2149,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
2022static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; 2149static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
2023static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; 2150static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
2024static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; 2151static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
2152static u32 nested_vmx_misc_low, nested_vmx_misc_high;
2025static __init void nested_vmx_setup_ctls_msrs(void) 2153static __init void nested_vmx_setup_ctls_msrs(void)
2026{ 2154{
2027 /* 2155 /*
@@ -2040,30 +2168,40 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2040 */ 2168 */
2041 2169
2042 /* pin-based controls */ 2170 /* pin-based controls */
2171 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2172 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
2043 /* 2173 /*
2044 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is 2174 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
2045 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. 2175 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
2046 */ 2176 */
2047 nested_vmx_pinbased_ctls_low = 0x16 ; 2177 nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2048 nested_vmx_pinbased_ctls_high = 0x16 | 2178 nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
2049 PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | 2179 PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
2050 PIN_BASED_VIRTUAL_NMIS; 2180 PIN_BASED_VMX_PREEMPTION_TIMER;
2181 nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2051 2182
2052 /* exit controls */ 2183 /*
2053 nested_vmx_exit_ctls_low = 0; 2184 * Exit controls
2185 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
2186 * 17 must be 1.
2187 */
2188 nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2054 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ 2189 /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
2055#ifdef CONFIG_X86_64 2190#ifdef CONFIG_X86_64
2056 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; 2191 nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
2057#else 2192#else
2058 nested_vmx_exit_ctls_high = 0; 2193 nested_vmx_exit_ctls_high = 0;
2059#endif 2194#endif
2195 nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2060 2196
2061 /* entry controls */ 2197 /* entry controls */
2062 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2198 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2063 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); 2199 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
2064 nested_vmx_entry_ctls_low = 0; 2200 /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
2201 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2065 nested_vmx_entry_ctls_high &= 2202 nested_vmx_entry_ctls_high &=
2066 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; 2203 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
2204 nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2067 2205
2068 /* cpu-based controls */ 2206 /* cpu-based controls */
2069 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2207 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2080,6 +2218,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2080 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2218 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2081 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2219 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2082 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2220 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2221 CPU_BASED_PAUSE_EXITING |
2083 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2222 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2084 /* 2223 /*
2085 * We can allow some features even when not supported by the 2224 * We can allow some features even when not supported by the
@@ -2094,7 +2233,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2094 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); 2233 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
2095 nested_vmx_secondary_ctls_low = 0; 2234 nested_vmx_secondary_ctls_low = 0;
2096 nested_vmx_secondary_ctls_high &= 2235 nested_vmx_secondary_ctls_high &=
2097 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2236 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2237 SECONDARY_EXEC_WBINVD_EXITING;
2238
2239 /* miscellaneous data */
2240 rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
2241 nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
2242 VMX_MISC_SAVE_EFER_LMA;
2243 nested_vmx_misc_high = 0;
2098} 2244}
2099 2245
2100static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2246static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
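Background for the nested_vmx_setup_ctls_msrs() changes above: each capability MSR exposed to L1 is a (low, high) pair where low holds the allowed-0 settings (bits L1 must set) and high the allowed-1 settings (bits L1 may set), and vmx_control_verify() in the unchanged context line checks a value against such a pair. A stand-alone sketch of the packing and the check:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Pack allowed-0 (low) and allowed-1 (high) settings the way the VMX
 * capability MSRs report them: low word = must-be-one, high word = may-be-one. */
static uint64_t control_msr(uint32_t low, uint32_t high)
{
	return ((uint64_t)high << 32) | low;
}

/* A control value is valid iff every must-be-one bit is set and no bit lies
 * outside the may-be-one mask. */
static bool control_verify(uint32_t control, uint32_t low, uint32_t high)
{
	return ((control & high) | low) == control;
}

int main(void)
{
	uint32_t low = 0x16, high = 0x401e;

	printf("msr=%#llx ok=%d bad=%d\n",
	       (unsigned long long)control_msr(low, high),
	       control_verify(0x1e, low, high),    /* within bounds */
	       control_verify(0x01, low, high));   /* missing a must-be-one bit */
	return 0;
}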
@@ -2165,7 +2311,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2165 nested_vmx_entry_ctls_high); 2311 nested_vmx_entry_ctls_high);
2166 break; 2312 break;
2167 case MSR_IA32_VMX_MISC: 2313 case MSR_IA32_VMX_MISC:
2168 *pdata = 0; 2314 *pdata = vmx_control_msr(nested_vmx_misc_low,
2315 nested_vmx_misc_high);
2169 break; 2316 break;
2170 /* 2317 /*
2171 * These MSRs specify bits which the guest must keep fixed (on or off) 2318 * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2529,12 +2676,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2529 u32 _vmexit_control = 0; 2676 u32 _vmexit_control = 0;
2530 u32 _vmentry_control = 0; 2677 u32 _vmentry_control = 0;
2531 2678
2532 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2533 opt = PIN_BASED_VIRTUAL_NMIS;
2534 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2535 &_pin_based_exec_control) < 0)
2536 return -EIO;
2537
2538 min = CPU_BASED_HLT_EXITING | 2679 min = CPU_BASED_HLT_EXITING |
2539#ifdef CONFIG_X86_64 2680#ifdef CONFIG_X86_64
2540 CPU_BASED_CR8_LOAD_EXITING | 2681 CPU_BASED_CR8_LOAD_EXITING |
@@ -2573,7 +2714,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2573 SECONDARY_EXEC_RDTSCP | 2714 SECONDARY_EXEC_RDTSCP |
2574 SECONDARY_EXEC_ENABLE_INVPCID | 2715 SECONDARY_EXEC_ENABLE_INVPCID |
2575 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2716 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2576 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; 2717 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2718 SECONDARY_EXEC_SHADOW_VMCS;
2577 if (adjust_vmx_controls(min2, opt2, 2719 if (adjust_vmx_controls(min2, opt2,
2578 MSR_IA32_VMX_PROCBASED_CTLS2, 2720 MSR_IA32_VMX_PROCBASED_CTLS2,
2579 &_cpu_based_2nd_exec_control) < 0) 2721 &_cpu_based_2nd_exec_control) < 0)
@@ -2605,11 +2747,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2605#ifdef CONFIG_X86_64 2747#ifdef CONFIG_X86_64
2606 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2748 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2607#endif 2749#endif
2608 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; 2750 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
2751 VM_EXIT_ACK_INTR_ON_EXIT;
2609 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2752 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2610 &_vmexit_control) < 0) 2753 &_vmexit_control) < 0)
2611 return -EIO; 2754 return -EIO;
2612 2755
2756 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2757 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
2758 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2759 &_pin_based_exec_control) < 0)
2760 return -EIO;
2761
2762 if (!(_cpu_based_2nd_exec_control &
2763 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
2764 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
2765 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2766
2613 min = 0; 2767 min = 0;
2614 opt = VM_ENTRY_LOAD_IA32_PAT; 2768 opt = VM_ENTRY_LOAD_IA32_PAT;
2615 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2769 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
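In the setup_vmcs_config() hunk above, the pin-based controls are now adjusted after the exit controls so that PIN_BASED_POSTED_INTR can be dropped again when virtual-interrupt delivery or VM_EXIT_ACK_INTR_ON_EXIT did not survive. Each adjust_vmx_controls() call follows the same allowed-0/allowed-1 recipe; a stand-alone sketch of that step (not the kernel's function):

#include <stdint.h>
#include <stdio.h>

/* allowed0: bits that must be 1; allowed1: bits that may be 1 (the low/high
 * halves of an IA32_VMX_*_CTLS capability MSR). */
static int adjust_controls(uint32_t min, uint32_t opt,
			   uint32_t allowed0, uint32_t allowed1,
			   uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= allowed1;    /* drop optional bits the CPU does not support */
	ctl |= allowed0;    /* force bits the CPU requires to be set */

	if ((ctl & min) != min)   /* a required bit is unsupported: give up */
		return -1;

	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t ctl;
	int err = adjust_controls(0x9 /* min */, 0x60 /* opt */,
				  0x16 /* allowed0 */, 0x3f /* allowed1 */, &ctl);

	printf("err=%d ctl=%#x\n", err, ctl);   /* optional bit 0x40 was dropped */
	return 0;
}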
@@ -2762,6 +2916,8 @@ static __init int hardware_setup(void)
2762 2916
2763 if (!cpu_has_vmx_vpid()) 2917 if (!cpu_has_vmx_vpid())
2764 enable_vpid = 0; 2918 enable_vpid = 0;
2919 if (!cpu_has_vmx_shadow_vmcs())
2920 enable_shadow_vmcs = 0;
2765 2921
2766 if (!cpu_has_vmx_ept() || 2922 if (!cpu_has_vmx_ept() ||
2767 !cpu_has_vmx_ept_4levels()) { 2923 !cpu_has_vmx_ept_4levels()) {
@@ -2788,14 +2944,16 @@ static __init int hardware_setup(void)
2788 if (!cpu_has_vmx_ple()) 2944 if (!cpu_has_vmx_ple())
2789 ple_gap = 0; 2945 ple_gap = 0;
2790 2946
2791 if (!cpu_has_vmx_apic_register_virt() || 2947 if (!cpu_has_vmx_apicv())
2792 !cpu_has_vmx_virtual_intr_delivery()) 2948 enable_apicv = 0;
2793 enable_apicv_reg_vid = 0;
2794 2949
2795 if (enable_apicv_reg_vid) 2950 if (enable_apicv)
2796 kvm_x86_ops->update_cr8_intercept = NULL; 2951 kvm_x86_ops->update_cr8_intercept = NULL;
2797 else 2952 else {
2798 kvm_x86_ops->hwapic_irr_update = NULL; 2953 kvm_x86_ops->hwapic_irr_update = NULL;
2954 kvm_x86_ops->deliver_posted_interrupt = NULL;
2955 kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
2956 }
2799 2957
2800 if (nested) 2958 if (nested)
2801 nested_vmx_setup_ctls_msrs(); 2959 nested_vmx_setup_ctls_msrs();
@@ -2876,22 +3034,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2876 vmx->cpl = 0; 3034 vmx->cpl = 0;
2877} 3035}
2878 3036
2879static gva_t rmode_tss_base(struct kvm *kvm)
2880{
2881 if (!kvm->arch.tss_addr) {
2882 struct kvm_memslots *slots;
2883 struct kvm_memory_slot *slot;
2884 gfn_t base_gfn;
2885
2886 slots = kvm_memslots(kvm);
2887 slot = id_to_memslot(slots, 0);
2888 base_gfn = slot->base_gfn + slot->npages - 3;
2889
2890 return base_gfn << PAGE_SHIFT;
2891 }
2892 return kvm->arch.tss_addr;
2893}
2894
2895static void fix_rmode_seg(int seg, struct kvm_segment *save) 3037static void fix_rmode_seg(int seg, struct kvm_segment *save)
2896{ 3038{
2897 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3039 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -2942,19 +3084,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2942 3084
2943 /* 3085 /*
2944 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3086 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2945 * vcpu. Call it here with phys address pointing 16M below 4G. 3087 * vcpu. Warn the user that an update is overdue.
2946 */ 3088 */
2947 if (!vcpu->kvm->arch.tss_addr) { 3089 if (!vcpu->kvm->arch.tss_addr)
2948 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3090 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
2949 "called before entering vcpu\n"); 3091 "called before entering vcpu\n");
2950 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
2951 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
2952 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2953 }
2954 3092
2955 vmx_segment_cache_clear(vmx); 3093 vmx_segment_cache_clear(vmx);
2956 3094
2957 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 3095 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
2958 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3096 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2959 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3097 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2960 3098
@@ -3214,7 +3352,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3214 */ 3352 */
3215 if (!nested_vmx_allowed(vcpu)) 3353 if (!nested_vmx_allowed(vcpu))
3216 return 1; 3354 return 1;
3217 } else if (to_vmx(vcpu)->nested.vmxon) 3355 }
3356 if (to_vmx(vcpu)->nested.vmxon &&
3357 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
3218 return 1; 3358 return 1;
3219 3359
3220 vcpu->arch.cr4 = cr4; 3360 vcpu->arch.cr4 = cr4;
@@ -3550,7 +3690,7 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
3550 return true; 3690 return true;
3551 3691
3552 /* real mode guest state checks */ 3692 /* real mode guest state checks */
3553 if (!is_protmode(vcpu)) { 3693 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3694 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3555 return false; 3695 return false;
3556 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3696 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -3599,7 +3739,7 @@ static int init_rmode_tss(struct kvm *kvm)
3599 int r, idx, ret = 0; 3739 int r, idx, ret = 0;
3600 3740
3601 idx = srcu_read_lock(&kvm->srcu); 3741 idx = srcu_read_lock(&kvm->srcu);
3602 fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 3742 fn = kvm->arch.tss_addr >> PAGE_SHIFT;
3603 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3743 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3604 if (r < 0) 3744 if (r < 0)
3605 goto out; 3745 goto out;
@@ -3692,7 +3832,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
3692 kvm_userspace_mem.flags = 0; 3832 kvm_userspace_mem.flags = 0;
3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3833 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3694 kvm_userspace_mem.memory_size = PAGE_SIZE; 3834 kvm_userspace_mem.memory_size = PAGE_SIZE;
3695 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3835 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3696 if (r) 3836 if (r)
3697 goto out; 3837 goto out;
3698 3838
@@ -3722,7 +3862,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
3722 kvm_userspace_mem.guest_phys_addr = 3862 kvm_userspace_mem.guest_phys_addr =
3723 kvm->arch.ept_identity_map_addr; 3863 kvm->arch.ept_identity_map_addr;
3724 kvm_userspace_mem.memory_size = PAGE_SIZE; 3864 kvm_userspace_mem.memory_size = PAGE_SIZE;
3725 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); 3865 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
3726 if (r) 3866 if (r)
3727 goto out; 3867 goto out;
3728 3868
@@ -3869,13 +4009,59 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
3869 msr, MSR_TYPE_W); 4009 msr, MSR_TYPE_W);
3870} 4010}
3871 4011
4012static int vmx_vm_has_apicv(struct kvm *kvm)
4013{
4014 return enable_apicv && irqchip_in_kernel(kvm);
4015}
4016
4017/*
 4018 * Send an interrupt to a vcpu via the posted-interrupt mechanism:
 4019 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
 4020 * notification and the hardware will sync PIR to vIRR atomically.
 4021 * 2. If the target vcpu isn't running (root mode), kick it to pick up the
 4022 * interrupt from PIR on the next vmentry.

4023 */
4024static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4025{
4026 struct vcpu_vmx *vmx = to_vmx(vcpu);
4027 int r;
4028
4029 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4030 return;
4031
4032 r = pi_test_and_set_on(&vmx->pi_desc);
4033 kvm_make_request(KVM_REQ_EVENT, vcpu);
4034#ifdef CONFIG_SMP
4035 if (!r && (vcpu->mode == IN_GUEST_MODE))
4036 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4037 POSTED_INTR_VECTOR);
4038 else
4039#endif
4040 kvm_vcpu_kick(vcpu);
4041}
4042
4043static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4044{
4045 struct vcpu_vmx *vmx = to_vmx(vcpu);
4046
4047 if (!pi_test_and_clear_on(&vmx->pi_desc))
4048 return;
4049
4050 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
4051}
4052
4053static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
4054{
4055 return;
4056}
4057
3872/* 4058/*
3873 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4059 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3874 * will not change in the lifetime of the guest. 4060 * will not change in the lifetime of the guest.
3875 * Note that host-state that does change is set elsewhere. E.g., host-state 4061 * Note that host-state that does change is set elsewhere. E.g., host-state
3876 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4062 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3877 */ 4063 */
3878static void vmx_set_constant_host_state(void) 4064static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
3879{ 4065{
3880 u32 low32, high32; 4066 u32 low32, high32;
3881 unsigned long tmpl; 4067 unsigned long tmpl;
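vmx_deliver_posted_interrupt()/vmx_sync_pir_to_irr() above split delivery in two: the sender sets the vector bit in PIR plus the ON bit and either sends the posted-interrupt IPI or kicks the vcpu, and the receiver clears ON and folds PIR into the virtual APIC's IRR via kvm_apic_update_irr(). The fold has to drain each PIR word atomically so bits set concurrently are not lost; a stand-alone sketch of that drain-and-merge (simplified, with a user-space atomic standing in for the kernel's xchg()):

#include <stdint.h>
#include <stdio.h>

/* Atomically take each PIR word and fold it into the matching IRR word.
 * __sync_lock_test_and_set() stands in for the kernel's xchg(). */
static void pir_to_irr(uint32_t pir[8], uint32_t irr[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		uint32_t val = __sync_lock_test_and_set(&pir[i], 0);

		if (val)
			irr[i] |= val;   /* the real code ORs into the vAPIC IRR regs */
	}
}

int main(void)
{
	uint32_t pir[8] = { [1] = 0x1 }, irr[8] = { 0 };

	pir_to_irr(pir, irr);
	printf("irr[1]=%#x pir[1]=%#x\n", irr[1], pir[1]);   /* 0x1, 0 */
	return 0;
}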
@@ -3903,6 +4089,7 @@ static void vmx_set_constant_host_state(void)
3903 4089
3904 native_store_idt(&dt); 4090 native_store_idt(&dt);
3905 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4091 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
4092 vmx->host_idt_base = dt.address;
3906 4093
3907 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4094 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3908 4095
@@ -3928,6 +4115,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
3928 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4115 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3929} 4116}
3930 4117
4118static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4119{
4120 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4121
4122 if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4123 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4124 return pin_based_exec_ctrl;
4125}
4126
3931static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4127static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3932{ 4128{
3933 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4129 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3945,11 +4141,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3945 return exec_control; 4141 return exec_control;
3946} 4142}
3947 4143
3948static int vmx_vm_has_apicv(struct kvm *kvm)
3949{
3950 return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
3951}
3952
3953static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4144static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3954{ 4145{
3955 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4146 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3971,6 +4162,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4162 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4163 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4164 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4165 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4166 (handle_vmptrld).
 4167 We can NOT enable shadow_vmcs here because we don't yet have
4168 a current VMCS12
4169 */
4170 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
3974 return exec_control; 4171 return exec_control;
3975} 4172}
3976 4173
@@ -3999,14 +4196,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3999 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); 4196 vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
4000 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); 4197 vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
4001 4198
4199 if (enable_shadow_vmcs) {
4200 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
4201 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
4202 }
4002 if (cpu_has_vmx_msr_bitmap()) 4203 if (cpu_has_vmx_msr_bitmap())
4003 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); 4204 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
4004 4205
4005 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4206 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4006 4207
4007 /* Control */ 4208 /* Control */
4008 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 4209 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4009 vmcs_config.pin_based_exec_ctrl);
4010 4210
4011 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); 4211 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4012 4212
@@ -4015,13 +4215,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4015 vmx_secondary_exec_control(vmx)); 4215 vmx_secondary_exec_control(vmx));
4016 } 4216 }
4017 4217
4018 if (enable_apicv_reg_vid) { 4218 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
4019 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4219 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4020 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4220 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4021 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4221 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4022 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4222 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4023 4223
4024 vmcs_write16(GUEST_INTR_STATUS, 0); 4224 vmcs_write16(GUEST_INTR_STATUS, 0);
4225
4226 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4227 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4025 } 4228 }
4026 4229
4027 if (ple_gap) { 4230 if (ple_gap) {
@@ -4035,7 +4238,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4035 4238
4036 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4239 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4037 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4240 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
4038 vmx_set_constant_host_state(); 4241 vmx_set_constant_host_state(vmx);
4039#ifdef CONFIG_X86_64 4242#ifdef CONFIG_X86_64
4040 rdmsrl(MSR_FS_BASE, a); 4243 rdmsrl(MSR_FS_BASE, a);
4041 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ 4244 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -4089,11 +4292,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4089 return 0; 4292 return 0;
4090} 4293}
4091 4294
4092static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4295static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4093{ 4296{
4094 struct vcpu_vmx *vmx = to_vmx(vcpu); 4297 struct vcpu_vmx *vmx = to_vmx(vcpu);
4095 u64 msr; 4298 u64 msr;
4096 int ret;
4097 4299
4098 vmx->rmode.vm86_active = 0; 4300 vmx->rmode.vm86_active = 0;
4099 4301
@@ -4109,12 +4311,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4109 vmx_segment_cache_clear(vmx); 4311 vmx_segment_cache_clear(vmx);
4110 4312
4111 seg_setup(VCPU_SREG_CS); 4313 seg_setup(VCPU_SREG_CS);
4112 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4314 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4113 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4315 vmcs_write32(GUEST_CS_BASE, 0xffff0000);
4114 else {
4115 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
4116 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
4117 }
4118 4316
4119 seg_setup(VCPU_SREG_DS); 4317 seg_setup(VCPU_SREG_DS);
4120 seg_setup(VCPU_SREG_ES); 4318 seg_setup(VCPU_SREG_ES);
@@ -4137,10 +4335,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4137 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4335 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4138 4336
4139 vmcs_writel(GUEST_RFLAGS, 0x02); 4337 vmcs_writel(GUEST_RFLAGS, 0x02);
4140 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 4338 kvm_rip_write(vcpu, 0xfff0);
4141 kvm_rip_write(vcpu, 0xfff0);
4142 else
4143 kvm_rip_write(vcpu, 0);
4144 4339
4145 vmcs_writel(GUEST_GDTR_BASE, 0); 4340 vmcs_writel(GUEST_GDTR_BASE, 0);
4146 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4341 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4171,23 +4366,20 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4171 vmcs_write64(APIC_ACCESS_ADDR, 4366 vmcs_write64(APIC_ACCESS_ADDR,
4172 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); 4367 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
4173 4368
4369 if (vmx_vm_has_apicv(vcpu->kvm))
4370 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4371
4174 if (vmx->vpid != 0) 4372 if (vmx->vpid != 0)
4175 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4373 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4176 4374
4177 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4375 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4178 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4179 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4376 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4180 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4181 vmx_set_cr4(&vmx->vcpu, 0); 4377 vmx_set_cr4(&vmx->vcpu, 0);
4182 vmx_set_efer(&vmx->vcpu, 0); 4378 vmx_set_efer(&vmx->vcpu, 0);
4183 vmx_fpu_activate(&vmx->vcpu); 4379 vmx_fpu_activate(&vmx->vcpu);
4184 update_exception_bitmap(&vmx->vcpu); 4380 update_exception_bitmap(&vmx->vcpu);
4185 4381
4186 vpid_sync_context(vmx); 4382 vpid_sync_context(vmx);
4187
4188 ret = 0;
4189
4190 return ret;
4191} 4383}
4192 4384
4193/* 4385/*
@@ -4200,40 +4392,45 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4200 PIN_BASED_EXT_INTR_MASK; 4392 PIN_BASED_EXT_INTR_MASK;
4201} 4393}
4202 4394
4203static void enable_irq_window(struct kvm_vcpu *vcpu) 4395static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4396{
4397 return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4398 PIN_BASED_NMI_EXITING;
4399}
4400
4401static int enable_irq_window(struct kvm_vcpu *vcpu)
4204{ 4402{
4205 u32 cpu_based_vm_exec_control; 4403 u32 cpu_based_vm_exec_control;
4206 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4404
4405 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4207 /* 4406 /*
4208 * We get here if vmx_interrupt_allowed() said we can't 4407 * We get here if vmx_interrupt_allowed() said we can't
4209 * inject to L1 now because L2 must run. Ask L2 to exit 4408 * inject to L1 now because L2 must run. The caller will have
4210 * right after entry, so we can inject to L1 more promptly. 4409 * to make L2 exit right after entry, so we can inject to L1
4410 * more promptly.
4211 */ 4411 */
4212 kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); 4412 return -EBUSY;
4213 return;
4214 }
4215 4413
4216 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4414 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4217 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4415 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4218 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4416 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4417 return 0;
4219} 4418}
4220 4419
4221static void enable_nmi_window(struct kvm_vcpu *vcpu) 4420static int enable_nmi_window(struct kvm_vcpu *vcpu)
4222{ 4421{
4223 u32 cpu_based_vm_exec_control; 4422 u32 cpu_based_vm_exec_control;
4224 4423
4225 if (!cpu_has_virtual_nmis()) { 4424 if (!cpu_has_virtual_nmis())
4226 enable_irq_window(vcpu); 4425 return enable_irq_window(vcpu);
4227 return; 4426
4228 } 4427 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
4428 return enable_irq_window(vcpu);
4229 4429
4230 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4231 enable_irq_window(vcpu);
4232 return;
4233 }
4234 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4430 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4235 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 4431 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4236 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4433 return 0;
4237} 4434}
4238 4435
4239static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4436static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4294,16 +4491,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4294 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4491 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4295} 4492}
4296 4493
4297static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4298{
4299 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4300 return 0;
4301
4302 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4303 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4304 | GUEST_INTR_STATE_NMI));
4305}
4306
4307static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4494static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4308{ 4495{
4309 if (!cpu_has_virtual_nmis()) 4496 if (!cpu_has_virtual_nmis())
@@ -4333,18 +4520,52 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4333 } 4520 }
4334} 4521}
4335 4522
4523static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4524{
4525 if (is_guest_mode(vcpu)) {
4526 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4527
4528 if (to_vmx(vcpu)->nested.nested_run_pending)
4529 return 0;
4530 if (nested_exit_on_nmi(vcpu)) {
4531 nested_vmx_vmexit(vcpu);
4532 vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
4533 vmcs12->vm_exit_intr_info = NMI_VECTOR |
4534 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
4535 /*
4536 * The NMI-triggered VM exit counts as injection:
4537 * clear this one and block further NMIs.
4538 */
4539 vcpu->arch.nmi_pending = 0;
4540 vmx_set_nmi_mask(vcpu, true);
4541 return 0;
4542 }
4543 }
4544
4545 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4546 return 0;
4547
4548 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4549 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4550 | GUEST_INTR_STATE_NMI));
4551}
4552
4336static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4553static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4337{ 4554{
4338 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { 4555 if (is_guest_mode(vcpu)) {
4339 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4556 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4340 if (to_vmx(vcpu)->nested.nested_run_pending || 4557
4341 (vmcs12->idt_vectoring_info_field & 4558 if (to_vmx(vcpu)->nested.nested_run_pending)
4342 VECTORING_INFO_VALID_MASK))
4343 return 0; 4559 return 0;
4344 nested_vmx_vmexit(vcpu); 4560 if (nested_exit_on_intr(vcpu)) {
4345 vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; 4561 nested_vmx_vmexit(vcpu);
4346 vmcs12->vm_exit_intr_info = 0; 4562 vmcs12->vm_exit_reason =
4347 /* fall through to normal code, but now in L1, not L2 */ 4563 EXIT_REASON_EXTERNAL_INTERRUPT;
4564 vmcs12->vm_exit_intr_info = 0;
4565 /*
4566 * fall through to normal code, but now in L1, not L2
4567 */
4568 }
4348 } 4569 }
4349 4570
4350 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4571 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -4362,7 +4583,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4362 .flags = 0, 4583 .flags = 0,
4363 }; 4584 };
4364 4585
4365 ret = kvm_set_memory_region(kvm, &tss_mem, false); 4586 ret = kvm_set_memory_region(kvm, &tss_mem);
4366 if (ret) 4587 if (ret)
4367 return ret; 4588 return ret;
4368 kvm->arch.tss_addr = addr; 4589 kvm->arch.tss_addr = addr;
@@ -4603,34 +4824,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4603/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 4824/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4604static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4825static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4605{ 4826{
4606 if (to_vmx(vcpu)->nested.vmxon &&
4607 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4608 return 1;
4609
4610 if (is_guest_mode(vcpu)) { 4827 if (is_guest_mode(vcpu)) {
4828 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4829 unsigned long orig_val = val;
4830
4611 /* 4831 /*
4612 * We get here when L2 changed cr0 in a way that did not change 4832 * We get here when L2 changed cr0 in a way that did not change
4613 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 4833 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4614 * but did change L0 shadowed bits. This can currently happen 4834 * but did change L0 shadowed bits. So we first calculate the
4615 * with the TS bit: L0 may want to leave TS on (for lazy fpu 4835 * effective cr0 value that L1 would like to write into the
4616 * loading) while pretending to allow the guest to change it. 4836 * hardware. It consists of the L2-owned bits from the new
4837 * value combined with the L1-owned bits from L1's guest_cr0.
4617 */ 4838 */
4618 if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | 4839 val = (val & ~vmcs12->cr0_guest_host_mask) |
4619 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) 4840 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4841
4842 /* TODO: will have to take unrestricted guest mode into
4843 * account */
4844 if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
4620 return 1; 4845 return 1;
4621 vmcs_writel(CR0_READ_SHADOW, val); 4846
4847 if (kvm_set_cr0(vcpu, val))
4848 return 1;
4849 vmcs_writel(CR0_READ_SHADOW, orig_val);
4622 return 0; 4850 return 0;
4623 } else 4851 } else {
4852 if (to_vmx(vcpu)->nested.vmxon &&
4853 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
4854 return 1;
4624 return kvm_set_cr0(vcpu, val); 4855 return kvm_set_cr0(vcpu, val);
4856 }
4625} 4857}
4626 4858
4627static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 4859static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4628{ 4860{
4629 if (is_guest_mode(vcpu)) { 4861 if (is_guest_mode(vcpu)) {
4630 if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | 4862 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4631 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) 4863 unsigned long orig_val = val;
4864
4865 /* analogously to handle_set_cr0 */
4866 val = (val & ~vmcs12->cr4_guest_host_mask) |
4867 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4868 if (kvm_set_cr4(vcpu, val))
4632 return 1; 4869 return 1;
4633 vmcs_writel(CR4_READ_SHADOW, val); 4870 vmcs_writel(CR4_READ_SHADOW, orig_val);
4634 return 0; 4871 return 0;
4635 } else 4872 } else
4636 return kvm_set_cr4(vcpu, val); 4873 return kvm_set_cr4(vcpu, val);
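The rewritten handle_set_cr0()/handle_set_cr4() first reconstruct the CR value L1 expects to see: bits set in vmcs12's cr0/cr4_guest_host_mask are owned by L1 and taken from vmcs12->guest_cr0/4, the remaining bits come from L2's write; the VMXON fixed-bit check and kvm_set_cr0() then operate on that combined value, while the read shadow keeps L2's original value. The bit merge in isolation (stand-alone sketch):

#include <stdint.h>
#include <stdio.h>

/* Bits set in 'mask' are owned by L1 (take them from L1's guest_cr0);
 * bits clear in 'mask' are owned by L2 (take them from the new write). */
static uint64_t effective_cr0(uint64_t l2_val, uint64_t guest_cr0, uint64_t mask)
{
	return (l2_val & ~mask) | (guest_cr0 & mask);
}

int main(void)
{
	uint64_t mask = 0x8;   /* e.g. L1 shadows only CR0.TS */
	uint64_t val = effective_cr0(0x33 /* L2 write */, 0x2b /* L1 view */, mask);

	printf("effective cr0 = %#llx\n", (unsigned long long)val);   /* 0x3b */
	return 0;
}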
@@ -5183,7 +5420,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5183 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5420 if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5184 return 1; 5421 return 1;
5185 5422
5186 err = emulate_instruction(vcpu, 0); 5423 err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5187 5424
5188 if (err == EMULATE_DO_MMIO) { 5425 if (err == EMULATE_DO_MMIO) {
5189 ret = 0; 5426 ret = 0;
@@ -5259,8 +5496,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
5259 } 5496 }
5260 5497
5261 /* Create a new VMCS */ 5498 /* Create a new VMCS */
5262 item = (struct vmcs02_list *) 5499 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5263 kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
5264 if (!item) 5500 if (!item)
5265 return NULL; 5501 return NULL;
5266 item->vmcs02.vmcs = alloc_vmcs(); 5502 item->vmcs02.vmcs = alloc_vmcs();
@@ -5309,6 +5545,9 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
5309 free_loaded_vmcs(&vmx->vmcs01); 5545 free_loaded_vmcs(&vmx->vmcs01);
5310} 5546}
5311 5547
5548static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5549 u32 vm_instruction_error);
5550
5312/* 5551/*
5313 * Emulate the VMXON instruction. 5552 * Emulate the VMXON instruction.
5314 * Currently, we just remember that VMX is active, and do not save or even 5553 * Currently, we just remember that VMX is active, and do not save or even
@@ -5321,6 +5560,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5321{ 5560{
5322 struct kvm_segment cs; 5561 struct kvm_segment cs;
5323 struct vcpu_vmx *vmx = to_vmx(vcpu); 5562 struct vcpu_vmx *vmx = to_vmx(vcpu);
5563 struct vmcs *shadow_vmcs;
5324 5564
5325 /* The Intel VMX Instruction Reference lists a bunch of bits that 5565 /* The Intel VMX Instruction Reference lists a bunch of bits that
5326 * are prerequisite to running VMXON, most notably cr4.VMXE must be 5566 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5344,6 +5584,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
5344 kvm_inject_gp(vcpu, 0); 5584 kvm_inject_gp(vcpu, 0);
5345 return 1; 5585 return 1;
5346 } 5586 }
5587 if (vmx->nested.vmxon) {
5588 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
5589 skip_emulated_instruction(vcpu);
5590 return 1;
5591 }
5592 if (enable_shadow_vmcs) {
5593 shadow_vmcs = alloc_vmcs();
5594 if (!shadow_vmcs)
5595 return -ENOMEM;
5596 /* mark vmcs as shadow */
5597 shadow_vmcs->revision_id |= (1u << 31);
5598 /* init shadow vmcs */
5599 vmcs_clear(shadow_vmcs);
5600 vmx->nested.current_shadow_vmcs = shadow_vmcs;
5601 }
5347 5602
5348 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 5603 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
5349 vmx->nested.vmcs02_num = 0; 5604 vmx->nested.vmcs02_num = 0;
@@ -5384,6 +5639,25 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
5384 return 1; 5639 return 1;
5385} 5640}
5386 5641
5642static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
5643{
5644 u32 exec_control;
5645 if (enable_shadow_vmcs) {
5646 if (vmx->nested.current_vmcs12 != NULL) {
5647 /* copy to memory all shadowed fields in case
5648 they were modified */
5649 copy_shadow_to_vmcs12(vmx);
5650 vmx->nested.sync_shadow_vmcs = false;
5651 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
5652 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
5653 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
5654 vmcs_write64(VMCS_LINK_POINTER, -1ull);
5655 }
5656 }
5657 kunmap(vmx->nested.current_vmcs12_page);
5658 nested_release_page(vmx->nested.current_vmcs12_page);
5659}
5660
5387/* 5661/*
5388 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 5662 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
5389 * just stops using VMX. 5663 * just stops using VMX.
@@ -5394,11 +5668,12 @@ static void free_nested(struct vcpu_vmx *vmx)
5394 return; 5668 return;
5395 vmx->nested.vmxon = false; 5669 vmx->nested.vmxon = false;
5396 if (vmx->nested.current_vmptr != -1ull) { 5670 if (vmx->nested.current_vmptr != -1ull) {
5397 kunmap(vmx->nested.current_vmcs12_page); 5671 nested_release_vmcs12(vmx);
5398 nested_release_page(vmx->nested.current_vmcs12_page);
5399 vmx->nested.current_vmptr = -1ull; 5672 vmx->nested.current_vmptr = -1ull;
5400 vmx->nested.current_vmcs12 = NULL; 5673 vmx->nested.current_vmcs12 = NULL;
5401 } 5674 }
5675 if (enable_shadow_vmcs)
5676 free_vmcs(vmx->nested.current_shadow_vmcs);
5402 /* Unpin physical memory we referred to in current vmcs02 */ 5677 /* Unpin physical memory we referred to in current vmcs02 */
5403 if (vmx->nested.apic_access_page) { 5678 if (vmx->nested.apic_access_page) {
5404 nested_release_page(vmx->nested.apic_access_page); 5679 nested_release_page(vmx->nested.apic_access_page);
@@ -5507,6 +5782,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
5507 X86_EFLAGS_SF | X86_EFLAGS_OF)) 5782 X86_EFLAGS_SF | X86_EFLAGS_OF))
5508 | X86_EFLAGS_ZF); 5783 | X86_EFLAGS_ZF);
5509 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 5784 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
5785 /*
5786 * We don't need to force a shadow sync because
5787 * VM_INSTRUCTION_ERROR is not shadowed
5788 */
5510} 5789}
5511 5790
5512/* Emulate the VMCLEAR instruction */ 5791/* Emulate the VMCLEAR instruction */
@@ -5539,8 +5818,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
5539 } 5818 }
5540 5819
5541 if (vmptr == vmx->nested.current_vmptr) { 5820 if (vmptr == vmx->nested.current_vmptr) {
5542 kunmap(vmx->nested.current_vmcs12_page); 5821 nested_release_vmcs12(vmx);
5543 nested_release_page(vmx->nested.current_vmcs12_page);
5544 vmx->nested.current_vmptr = -1ull; 5822 vmx->nested.current_vmptr = -1ull;
5545 vmx->nested.current_vmcs12 = NULL; 5823 vmx->nested.current_vmcs12 = NULL;
5546 } 5824 }
@@ -5639,6 +5917,111 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
5639 } 5917 }
5640} 5918}
5641 5919
5920
5921static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
5922 unsigned long field, u64 field_value){
5923 short offset = vmcs_field_to_offset(field);
5924 char *p = ((char *) get_vmcs12(vcpu)) + offset;
5925 if (offset < 0)
5926 return false;
5927
5928 switch (vmcs_field_type(field)) {
5929 case VMCS_FIELD_TYPE_U16:
5930 *(u16 *)p = field_value;
5931 return true;
5932 case VMCS_FIELD_TYPE_U32:
5933 *(u32 *)p = field_value;
5934 return true;
5935 case VMCS_FIELD_TYPE_U64:
5936 *(u64 *)p = field_value;
5937 return true;
5938 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5939 *(natural_width *)p = field_value;
5940 return true;
5941 default:
5942 return false; /* can never happen. */
5943 }
5944
5945}
5946
5947static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
5948{
5949 int i;
5950 unsigned long field;
5951 u64 field_value;
5952 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5953 unsigned long *fields = (unsigned long *)shadow_read_write_fields;
5954 int num_fields = max_shadow_read_write_fields;
5955
5956 vmcs_load(shadow_vmcs);
5957
5958 for (i = 0; i < num_fields; i++) {
5959 field = fields[i];
5960 switch (vmcs_field_type(field)) {
5961 case VMCS_FIELD_TYPE_U16:
5962 field_value = vmcs_read16(field);
5963 break;
5964 case VMCS_FIELD_TYPE_U32:
5965 field_value = vmcs_read32(field);
5966 break;
5967 case VMCS_FIELD_TYPE_U64:
5968 field_value = vmcs_read64(field);
5969 break;
5970 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5971 field_value = vmcs_readl(field);
5972 break;
5973 }
5974 vmcs12_write_any(&vmx->vcpu, field, field_value);
5975 }
5976
5977 vmcs_clear(shadow_vmcs);
5978 vmcs_load(vmx->loaded_vmcs->vmcs);
5979}
5980
5981static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
5982{
5983 unsigned long *fields[] = {
5984 (unsigned long *)shadow_read_write_fields,
5985 (unsigned long *)shadow_read_only_fields
5986 };
5987 int num_lists = ARRAY_SIZE(fields);
5988 int max_fields[] = {
5989 max_shadow_read_write_fields,
5990 max_shadow_read_only_fields
5991 };
5992 int i, q;
5993 unsigned long field;
5994 u64 field_value = 0;
5995 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
5996
5997 vmcs_load(shadow_vmcs);
5998
5999 for (q = 0; q < num_lists; q++) {
6000 for (i = 0; i < max_fields[q]; i++) {
6001 field = fields[q][i];
6002 vmcs12_read_any(&vmx->vcpu, field, &field_value);
6003
6004 switch (vmcs_field_type(field)) {
6005 case VMCS_FIELD_TYPE_U16:
6006 vmcs_write16(field, (u16)field_value);
6007 break;
6008 case VMCS_FIELD_TYPE_U32:
6009 vmcs_write32(field, (u32)field_value);
6010 break;
6011 case VMCS_FIELD_TYPE_U64:
6012 vmcs_write64(field, (u64)field_value);
6013 break;
6014 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6015 vmcs_writel(field, (long)field_value);
6016 break;
6017 }
6018 }
6019 }
6020
6021 vmcs_clear(shadow_vmcs);
6022 vmcs_load(vmx->loaded_vmcs->vmcs);
6023}
6024
5642/* 6025/*
5643 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 6026 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
5644 * used before) all generate the same failure when it is missing. 6027 * used before) all generate the same failure when it is missing.
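The vmcs12_write_any() added above mirrors the existing vmcs12_read_any(): it translates a VMCS field encoding into an offset inside the software vmcs12 image and stores the value at the width implied by the field type; copy_shadow_to_vmcs12() then walks the shadowed read/write field table, reads each field from the loaded shadow VMCS with the matching vmcs_read*, and stores it, while copy_vmcs12_to_shadow() runs the inverse direction with vmcs_write*. A self-contained sketch of the width dispatch, using made-up field metadata in place of vmcs_field_to_offset() and vmcs_field_type():

#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum field_width { W_U16, W_U32, W_U64 };	/* natural width folded into U64 here */

struct field_meta {				/* hypothetical per-field metadata */
	size_t offset;
	enum field_width width;
};

static int store_field(void *image, struct field_meta m, uint64_t val)
{
	char *p = (char *)image + m.offset;

	switch (m.width) {
	case W_U16: { uint16_t v = (uint16_t)val; memcpy(p, &v, sizeof(v)); return 1; }
	case W_U32: { uint32_t v = (uint32_t)val; memcpy(p, &v, sizeof(v)); return 1; }
	case W_U64: memcpy(p, &val, sizeof(val)); return 1;
	default: return 0;			/* unknown field type */
	}
}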
@@ -5703,8 +6086,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
5703 gva_t gva; 6086 gva_t gva;
5704 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 6087 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5705 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6088 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5706 char *p;
5707 short offset;
5708 /* The value to write might be 32 or 64 bits, depending on L1's long 6089 /* The value to write might be 32 or 64 bits, depending on L1's long
5709 * mode, and eventually we need to write that into a field of several 6090 * mode, and eventually we need to write that into a field of several
5710 * possible lengths. The code below first zero-extends the value to 64 6091 * possible lengths. The code below first zero-extends the value to 64
@@ -5741,28 +6122,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
5741 return 1; 6122 return 1;
5742 } 6123 }
5743 6124
5744 offset = vmcs_field_to_offset(field); 6125 if (!vmcs12_write_any(vcpu, field, field_value)) {
5745 if (offset < 0) {
5746 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5747 skip_emulated_instruction(vcpu);
5748 return 1;
5749 }
5750 p = ((char *) get_vmcs12(vcpu)) + offset;
5751
5752 switch (vmcs_field_type(field)) {
5753 case VMCS_FIELD_TYPE_U16:
5754 *(u16 *)p = field_value;
5755 break;
5756 case VMCS_FIELD_TYPE_U32:
5757 *(u32 *)p = field_value;
5758 break;
5759 case VMCS_FIELD_TYPE_U64:
5760 *(u64 *)p = field_value;
5761 break;
5762 case VMCS_FIELD_TYPE_NATURAL_WIDTH:
5763 *(natural_width *)p = field_value;
5764 break;
5765 default:
5766 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 6126 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5767 skip_emulated_instruction(vcpu); 6127 skip_emulated_instruction(vcpu);
5768 return 1; 6128 return 1;
@@ -5780,6 +6140,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
5780 gva_t gva; 6140 gva_t gva;
5781 gpa_t vmptr; 6141 gpa_t vmptr;
5782 struct x86_exception e; 6142 struct x86_exception e;
6143 u32 exec_control;
5783 6144
5784 if (!nested_vmx_check_permission(vcpu)) 6145 if (!nested_vmx_check_permission(vcpu))
5785 return 1; 6146 return 1;
@@ -5818,14 +6179,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
5818 skip_emulated_instruction(vcpu); 6179 skip_emulated_instruction(vcpu);
5819 return 1; 6180 return 1;
5820 } 6181 }
5821 if (vmx->nested.current_vmptr != -1ull) { 6182 if (vmx->nested.current_vmptr != -1ull)
5822 kunmap(vmx->nested.current_vmcs12_page); 6183 nested_release_vmcs12(vmx);
5823 nested_release_page(vmx->nested.current_vmcs12_page);
5824 }
5825 6184
5826 vmx->nested.current_vmptr = vmptr; 6185 vmx->nested.current_vmptr = vmptr;
5827 vmx->nested.current_vmcs12 = new_vmcs12; 6186 vmx->nested.current_vmcs12 = new_vmcs12;
5828 vmx->nested.current_vmcs12_page = page; 6187 vmx->nested.current_vmcs12_page = page;
6188 if (enable_shadow_vmcs) {
6189 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6190 exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
6191 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6192 vmcs_write64(VMCS_LINK_POINTER,
6193 __pa(vmx->nested.current_shadow_vmcs));
6194 vmx->nested.sync_shadow_vmcs = true;
6195 }
5829 } 6196 }
5830 6197
5831 nested_vmx_succeed(vcpu); 6198 nested_vmx_succeed(vcpu);
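On a successful VMPTRLD with shadow VMCS support enabled, the hunk above switches shadowing on: SECONDARY_EXEC_SHADOW_VMCS is set in the secondary execution controls and VMCS_LINK_POINTER is pointed at the physical address of the shadow VMCS, while nested_release_vmcs12() performs the mirror-image teardown (clear the bit, write -1ull). A tiny sketch of that paired enable/disable on plain variables, with hypothetical names and an illustrative bit value:

#include <stdint.h>

#define SEC_EXEC_SHADOW_VMCS	(1u << 14)	/* illustrative bit position */

struct shadow_ctl {				/* hypothetical control state */
	uint32_t secondary_exec;
	uint64_t link_pointer;
};

static void shadow_enable(struct shadow_ctl *c, uint64_t shadow_vmcs_pa)
{
	c->secondary_exec |= SEC_EXEC_SHADOW_VMCS;
	c->link_pointer = shadow_vmcs_pa;
}

static void shadow_disable(struct shadow_ctl *c)
{
	c->secondary_exec &= ~SEC_EXEC_SHADOW_VMCS;
	c->link_pointer = ~0ull;		/* -1ull means no linked shadow */
}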
@@ -5908,6 +6275,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5908static const int kvm_vmx_max_exit_handlers = 6275static const int kvm_vmx_max_exit_handlers =
5909 ARRAY_SIZE(kvm_vmx_exit_handlers); 6276 ARRAY_SIZE(kvm_vmx_exit_handlers);
5910 6277
6278static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
6279 struct vmcs12 *vmcs12)
6280{
6281 unsigned long exit_qualification;
6282 gpa_t bitmap, last_bitmap;
6283 unsigned int port;
6284 int size;
6285 u8 b;
6286
6287 if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
6288 return 1;
6289
6290 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
6291 return 0;
6292
6293 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6294
6295 port = exit_qualification >> 16;
6296 size = (exit_qualification & 7) + 1;
6297
6298 last_bitmap = (gpa_t)-1;
6299 b = -1;
6300
6301 while (size > 0) {
6302 if (port < 0x8000)
6303 bitmap = vmcs12->io_bitmap_a;
6304 else if (port < 0x10000)
6305 bitmap = vmcs12->io_bitmap_b;
6306 else
6307 return 1;
6308 bitmap += (port & 0x7fff) / 8;
6309
6310 if (last_bitmap != bitmap)
6311 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
6312 return 1;
6313 if (b & (1 << (port & 7)))
6314 return 1;
6315
6316 port++;
6317 size--;
6318 last_bitmap = bitmap;
6319 }
6320
6321 return 0;
6322}
6323
5911/* 6324/*
5912 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6325 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5913 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6326 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
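The new nested_vmx_exit_handled_io() reflects an I/O exit back to L1 when L1 asked for it: unconditional I/O exiting always goes to L1; otherwise the I/O bitmaps in vmcs12 are consulted, with ports 0x0000-0x7fff covered by bitmap A, 0x8000-0xffff by bitmap B, one bit per port, and multi-byte accesses checked port by port. A standalone model of the lookup that operates on in-memory byte arrays instead of guest physical memory (function and parameter names are hypothetical):

#include <stdbool.h>
#include <stdint.h>

/* bitmap_a covers ports 0x0000-0x7fff, bitmap_b covers 0x8000-0xffff */
static bool io_intercepted(const uint8_t *bitmap_a, const uint8_t *bitmap_b,
			   unsigned int port, int size)
{
	while (size > 0) {
		const uint8_t *bm;

		if (port < 0x8000)
			bm = bitmap_a;
		else if (port < 0x10000)
			bm = bitmap_b;
		else
			return true;		/* out of range: hand it to L1 */

		if (bm[(port & 0x7fff) / 8] & (1u << (port & 7)))
			return true;		/* L1 intercepts this port */
		port++;
		size--;
	}
	return false;
}

For example, a two-byte access to port 0x3f8 tests bits 0 and 1 of byte 0x7f in bitmap A, matching the (port & 0x7fff) / 8 and port & 7 arithmetic above.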
@@ -5939,7 +6352,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5939 /* Then read the msr_index'th bit from this bitmap: */ 6352 /* Then read the msr_index'th bit from this bitmap: */
5940 if (msr_index < 1024*8) { 6353 if (msr_index < 1024*8) {
5941 unsigned char b; 6354 unsigned char b;
5942 kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); 6355 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
6356 return 1;
5943 return 1 & (b >> (msr_index & 7)); 6357 return 1 & (b >> (msr_index & 7));
5944 } else 6358 } else
5945 return 1; /* let L1 handle the wrong parameter */ 6359 return 1; /* let L1 handle the wrong parameter */
@@ -6033,10 +6447,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
6033 */ 6447 */
6034static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 6448static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6035{ 6449{
6036 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
6037 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6450 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6038 struct vcpu_vmx *vmx = to_vmx(vcpu); 6451 struct vcpu_vmx *vmx = to_vmx(vcpu);
6039 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6452 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6453 u32 exit_reason = vmx->exit_reason;
6040 6454
6041 if (vmx->nested.nested_run_pending) 6455 if (vmx->nested.nested_run_pending)
6042 return 0; 6456 return 0;
@@ -6060,14 +6474,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6060 case EXIT_REASON_TRIPLE_FAULT: 6474 case EXIT_REASON_TRIPLE_FAULT:
6061 return 1; 6475 return 1;
6062 case EXIT_REASON_PENDING_INTERRUPT: 6476 case EXIT_REASON_PENDING_INTERRUPT:
6477 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
6063 case EXIT_REASON_NMI_WINDOW: 6478 case EXIT_REASON_NMI_WINDOW:
6064 /* 6479 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
6065 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
6066 * (aka Interrupt Window Exiting) only when L1 turned it on,
6067 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
6068 * Same for NMI Window Exiting.
6069 */
6070 return 1;
6071 case EXIT_REASON_TASK_SWITCH: 6480 case EXIT_REASON_TASK_SWITCH:
6072 return 1; 6481 return 1;
6073 case EXIT_REASON_CPUID: 6482 case EXIT_REASON_CPUID:
@@ -6097,8 +6506,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6097 case EXIT_REASON_DR_ACCESS: 6506 case EXIT_REASON_DR_ACCESS:
6098 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6507 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
6099 case EXIT_REASON_IO_INSTRUCTION: 6508 case EXIT_REASON_IO_INSTRUCTION:
6100 /* TODO: support IO bitmaps */ 6509 return nested_vmx_exit_handled_io(vcpu, vmcs12);
6101 return 1;
6102 case EXIT_REASON_MSR_READ: 6510 case EXIT_REASON_MSR_READ:
6103 case EXIT_REASON_MSR_WRITE: 6511 case EXIT_REASON_MSR_WRITE:
6104 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6512 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -6122,6 +6530,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6122 case EXIT_REASON_EPT_VIOLATION: 6530 case EXIT_REASON_EPT_VIOLATION:
6123 case EXIT_REASON_EPT_MISCONFIG: 6531 case EXIT_REASON_EPT_MISCONFIG:
6124 return 0; 6532 return 0;
6533 case EXIT_REASON_PREEMPTION_TIMER:
6534 return vmcs12->pin_based_vm_exec_control &
6535 PIN_BASED_VMX_PREEMPTION_TIMER;
6125 case EXIT_REASON_WBINVD: 6536 case EXIT_REASON_WBINVD:
6126 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6537 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6127 case EXIT_REASON_XSETBV: 6538 case EXIT_REASON_XSETBV:
@@ -6316,6 +6727,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6316 6727
6317static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6728static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6318{ 6729{
6730 if (!vmx_vm_has_apicv(vcpu->kvm))
6731 return;
6732
6319 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6733 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6320 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6734 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6321 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6735 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
@@ -6346,6 +6760,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6346 } 6760 }
6347} 6761}
6348 6762
6763static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
6764{
6765 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6766
6767 /*
 6768 * If an external interrupt exists, the IF bit is set in rflags/eflags on
 6769 * the interrupt stack frame, so interrupts will be re-enabled on return
 6770 * from the interrupt handler.
6771 */
6772 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
6773 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
6774 unsigned int vector;
6775 unsigned long entry;
6776 gate_desc *desc;
6777 struct vcpu_vmx *vmx = to_vmx(vcpu);
6778#ifdef CONFIG_X86_64
6779 unsigned long tmp;
6780#endif
6781
6782 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6783 desc = (gate_desc *)vmx->host_idt_base + vector;
6784 entry = gate_offset(*desc);
6785 asm volatile(
6786#ifdef CONFIG_X86_64
6787 "mov %%" _ASM_SP ", %[sp]\n\t"
6788 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
6789 "push $%c[ss]\n\t"
6790 "push %[sp]\n\t"
6791#endif
6792 "pushf\n\t"
6793 "orl $0x200, (%%" _ASM_SP ")\n\t"
6794 __ASM_SIZE(push) " $%c[cs]\n\t"
6795 "call *%[entry]\n\t"
6796 :
6797#ifdef CONFIG_X86_64
6798 [sp]"=&r"(tmp)
6799#endif
6800 :
6801 [entry]"r"(entry),
6802 [ss]"i"(__KERNEL_DS),
6803 [cs]"i"(__KERNEL_CS)
6804 );
6805 } else
6806 local_irq_enable();
6807}
6808
6349static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6809static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6350{ 6810{
6351 u32 exit_intr_info; 6811 u32 exit_intr_info;
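vmx_handle_external_intr() re-dispatches the host interrupt that forced the VM exit: if the exit interruption information names a valid external interrupt, the vector is looked up in the host IDT and the handler is called through a hand-built interrupt frame (SS and RSP on 64-bit, then RFLAGS with IF forced on, then CS, with the call instruction supplying the return RIP); otherwise interrupts are simply re-enabled. The handler address comes from gate_offset(), which reassembles the split offset fields of the gate descriptor. A hedged sketch of that reassembly with an illustrative descriptor layout (the field names mirror the x86-64 gate descriptor, but this struct is not the kernel's definition):

#include <stdint.h>

struct idt_gate64_demo {			/* illustrative 16-byte gate layout */
	uint16_t offset_low;
	uint16_t segment;
	uint16_t flags;				/* ist, type, dpl, present */
	uint16_t offset_middle;
	uint32_t offset_high;
	uint32_t reserved;
};

static uint64_t gate_entry_point(const struct idt_gate64_demo *g)
{
	return (uint64_t)g->offset_low |
	       ((uint64_t)g->offset_middle << 16) |
	       ((uint64_t)g->offset_high << 32);
}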
@@ -6388,7 +6848,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6388 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 6848 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
6389} 6849}
6390 6850
6391static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, 6851static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6392 u32 idt_vectoring_info, 6852 u32 idt_vectoring_info,
6393 int instr_len_field, 6853 int instr_len_field,
6394 int error_code_field) 6854 int error_code_field)
@@ -6399,46 +6859,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
6399 6859
6400 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6860 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
6401 6861
6402 vmx->vcpu.arch.nmi_injected = false; 6862 vcpu->arch.nmi_injected = false;
6403 kvm_clear_exception_queue(&vmx->vcpu); 6863 kvm_clear_exception_queue(vcpu);
6404 kvm_clear_interrupt_queue(&vmx->vcpu); 6864 kvm_clear_interrupt_queue(vcpu);
6405 6865
6406 if (!idtv_info_valid) 6866 if (!idtv_info_valid)
6407 return; 6867 return;
6408 6868
6409 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6869 kvm_make_request(KVM_REQ_EVENT, vcpu);
6410 6870
6411 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6871 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6412 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6872 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
6413 6873
6414 switch (type) { 6874 switch (type) {
6415 case INTR_TYPE_NMI_INTR: 6875 case INTR_TYPE_NMI_INTR:
6416 vmx->vcpu.arch.nmi_injected = true; 6876 vcpu->arch.nmi_injected = true;
6417 /* 6877 /*
6418 * SDM 3: 27.7.1.2 (September 2008) 6878 * SDM 3: 27.7.1.2 (September 2008)
6419 * Clear bit "block by NMI" before VM entry if a NMI 6879 * Clear bit "block by NMI" before VM entry if a NMI
6420 * delivery faulted. 6880 * delivery faulted.
6421 */ 6881 */
6422 vmx_set_nmi_mask(&vmx->vcpu, false); 6882 vmx_set_nmi_mask(vcpu, false);
6423 break; 6883 break;
6424 case INTR_TYPE_SOFT_EXCEPTION: 6884 case INTR_TYPE_SOFT_EXCEPTION:
6425 vmx->vcpu.arch.event_exit_inst_len = 6885 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6426 vmcs_read32(instr_len_field);
6427 /* fall through */ 6886 /* fall through */
6428 case INTR_TYPE_HARD_EXCEPTION: 6887 case INTR_TYPE_HARD_EXCEPTION:
6429 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6888 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6430 u32 err = vmcs_read32(error_code_field); 6889 u32 err = vmcs_read32(error_code_field);
6431 kvm_queue_exception_e(&vmx->vcpu, vector, err); 6890 kvm_queue_exception_e(vcpu, vector, err);
6432 } else 6891 } else
6433 kvm_queue_exception(&vmx->vcpu, vector); 6892 kvm_queue_exception(vcpu, vector);
6434 break; 6893 break;
6435 case INTR_TYPE_SOFT_INTR: 6894 case INTR_TYPE_SOFT_INTR:
6436 vmx->vcpu.arch.event_exit_inst_len = 6895 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6437 vmcs_read32(instr_len_field);
6438 /* fall through */ 6896 /* fall through */
6439 case INTR_TYPE_EXT_INTR: 6897 case INTR_TYPE_EXT_INTR:
6440 kvm_queue_interrupt(&vmx->vcpu, vector, 6898 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6441 type == INTR_TYPE_SOFT_INTR);
6442 break; 6899 break;
6443 default: 6900 default:
6444 break; 6901 break;
@@ -6447,18 +6904,14 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
6447 6904
6448static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6905static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
6449{ 6906{
6450 if (is_guest_mode(&vmx->vcpu)) 6907 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6451 return;
6452 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
6453 VM_EXIT_INSTRUCTION_LEN, 6908 VM_EXIT_INSTRUCTION_LEN,
6454 IDT_VECTORING_ERROR_CODE); 6909 IDT_VECTORING_ERROR_CODE);
6455} 6910}
6456 6911
6457static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6912static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
6458{ 6913{
6459 if (is_guest_mode(vcpu)) 6914 __vmx_complete_interrupts(vcpu,
6460 return;
6461 __vmx_complete_interrupts(to_vmx(vcpu),
6462 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6915 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6463 VM_ENTRY_INSTRUCTION_LEN, 6916 VM_ENTRY_INSTRUCTION_LEN,
6464 VM_ENTRY_EXCEPTION_ERROR_CODE); 6917 VM_ENTRY_EXCEPTION_ERROR_CODE);
@@ -6489,21 +6942,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6489 struct vcpu_vmx *vmx = to_vmx(vcpu); 6942 struct vcpu_vmx *vmx = to_vmx(vcpu);
6490 unsigned long debugctlmsr; 6943 unsigned long debugctlmsr;
6491 6944
6492 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6493 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6494 if (vmcs12->idt_vectoring_info_field &
6495 VECTORING_INFO_VALID_MASK) {
6496 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
6497 vmcs12->idt_vectoring_info_field);
6498 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
6499 vmcs12->vm_exit_instruction_len);
6500 if (vmcs12->idt_vectoring_info_field &
6501 VECTORING_INFO_DELIVER_CODE_MASK)
6502 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
6503 vmcs12->idt_vectoring_error_code);
6504 }
6505 }
6506
6507 /* Record the guest's net vcpu time for enforced NMI injections. */ 6945 /* Record the guest's net vcpu time for enforced NMI injections. */
6508 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 6946 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
6509 vmx->entry_time = ktime_get(); 6947 vmx->entry_time = ktime_get();
@@ -6513,6 +6951,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6513 if (vmx->emulation_required) 6951 if (vmx->emulation_required)
6514 return; 6952 return;
6515 6953
6954 if (vmx->nested.sync_shadow_vmcs) {
6955 copy_vmcs12_to_shadow(vmx);
6956 vmx->nested.sync_shadow_vmcs = false;
6957 }
6958
6516 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6959 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
6517 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 6960 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
6518 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 6961 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
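The new block above flushes pending vmcs12 changes into the shadow VMCS right before VM entry: sync_shadow_vmcs is raised whenever software modifies vmcs12 behind L1's back (nested VM exit, emulated entry failure, VMPTRLD) and is consumed here so the hardware shadow never goes stale. A minimal sketch of that dirty-flag pattern, with hypothetical types:

#include <stdbool.h>

struct nested_demo {				/* hypothetical nested state */
	bool sync_shadow_vmcs;			/* vmcs12 changed since last flush */
};

static void flush_shadow_if_dirty(struct nested_demo *n,
				  void (*copy_vmcs12_to_shadow_fn)(void))
{
	if (n->sync_shadow_vmcs) {
		copy_vmcs12_to_shadow_fn();	/* push software state to hardware */
		n->sync_shadow_vmcs = false;
	}
}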
@@ -6662,17 +7105,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6662 7105
6663 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7106 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
6664 7107
6665 if (is_guest_mode(vcpu)) {
6666 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6667 vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
6668 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
6669 vmcs12->idt_vectoring_error_code =
6670 vmcs_read32(IDT_VECTORING_ERROR_CODE);
6671 vmcs12->vm_exit_instruction_len =
6672 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6673 }
6674 }
6675
6676 vmx->loaded_vmcs->launched = 1; 7108 vmx->loaded_vmcs->launched = 1;
6677 7109
6678 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 7110 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6734,10 +7166,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
6734 put_cpu(); 7166 put_cpu();
6735 if (err) 7167 if (err)
6736 goto free_vmcs; 7168 goto free_vmcs;
6737 if (vm_need_virtualize_apic_accesses(kvm)) 7169 if (vm_need_virtualize_apic_accesses(kvm)) {
6738 err = alloc_apic_access_page(kvm); 7170 err = alloc_apic_access_page(kvm);
6739 if (err) 7171 if (err)
6740 goto free_vmcs; 7172 goto free_vmcs;
7173 }
6741 7174
6742 if (enable_ept) { 7175 if (enable_ept) {
6743 if (!kvm->arch.ept_identity_map_addr) 7176 if (!kvm->arch.ept_identity_map_addr)
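The braces added above make the error check part of the guarded block: previously the indentation suggested both statements were conditional on vm_need_virtualize_apic_accesses(), but only the assignment was, so the if (err) test ran on every path. A compact illustration of the intended shape, with hypothetical helpers standing in for the real setup calls:

#include <stdbool.h>

static bool need_feature(int flags)  { return flags & 1; }	/* hypothetical */
static int  setup_feature(int flags) { return (flags & 2) ? -1 : 0; }

static int init_vcpu_demo(int flags)
{
	int err = 0;

	if (need_feature(flags)) {
		err = setup_feature(flags);
		if (err)			/* only checked when setup ran */
			return err;
	}
	return 0;
}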
@@ -6931,9 +7364,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6931 vmcs12->vm_entry_instruction_len); 7364 vmcs12->vm_entry_instruction_len);
6932 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 7365 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
6933 vmcs12->guest_interruptibility_info); 7366 vmcs12->guest_interruptibility_info);
6934 vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
6935 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 7367 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
6936 vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); 7368 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
6937 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); 7369 vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
6938 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 7370 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
6939 vmcs12->guest_pending_dbg_exceptions); 7371 vmcs12->guest_pending_dbg_exceptions);
@@ -6946,6 +7378,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6946 (vmcs_config.pin_based_exec_ctrl | 7378 (vmcs_config.pin_based_exec_ctrl |
6947 vmcs12->pin_based_vm_exec_control)); 7379 vmcs12->pin_based_vm_exec_control));
6948 7380
7381 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
7382 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
7383 vmcs12->vmx_preemption_timer_value);
7384
6949 /* 7385 /*
6950 * Whether page-faults are trapped is determined by a combination of 7386 * Whether page-faults are trapped is determined by a combination of
6951 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 7387 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
@@ -7016,7 +7452,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7016 * Other fields are different per CPU, and will be set later when 7452 * Other fields are different per CPU, and will be set later when
7017 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 7453 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
7018 */ 7454 */
7019 vmx_set_constant_host_state(); 7455 vmx_set_constant_host_state(vmx);
7020 7456
7021 /* 7457 /*
7022 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 7458 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7082,7 +7518,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7082 7518
7083 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 7519 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
7084 vcpu->arch.efer = vmcs12->guest_ia32_efer; 7520 vcpu->arch.efer = vmcs12->guest_ia32_efer;
7085 if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 7521 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
7086 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7522 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
7087 else 7523 else
7088 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 7524 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
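The if-to-else-if change above fixes how L2's EFER is computed in prepare_vmcs02(): when the "load IA32_EFER" entry control is set, the value comes verbatim from vmcs12->guest_ia32_efer and must not be overwritten by the IA-32e-mode branch; only when that control is clear are LMA and LME derived from VM_ENTRY_IA32E_MODE. A hedged standalone restatement of the decision:

#include <stdbool.h>
#include <stdint.h>

#define EFER_LME	(1ull << 8)
#define EFER_LMA	(1ull << 10)

/* Effective EFER for L2 given the vmcs12 entry controls (sketch). */
static uint64_t l2_efer(uint64_t cur_efer, bool load_efer_ctl,
			uint64_t guest_efer_field, bool ia32e_mode_ctl)
{
	if (load_efer_ctl)
		return guest_efer_field;	/* take the field as-is */
	if (ia32e_mode_ctl)
		return cur_efer | EFER_LMA | EFER_LME;
	return cur_efer & ~(EFER_LMA | EFER_LME);
}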
@@ -7121,6 +7557,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7121 struct vcpu_vmx *vmx = to_vmx(vcpu); 7557 struct vcpu_vmx *vmx = to_vmx(vcpu);
7122 int cpu; 7558 int cpu;
7123 struct loaded_vmcs *vmcs02; 7559 struct loaded_vmcs *vmcs02;
7560 bool ia32e;
7124 7561
7125 if (!nested_vmx_check_permission(vcpu) || 7562 if (!nested_vmx_check_permission(vcpu) ||
7126 !nested_vmx_check_vmcs12(vcpu)) 7563 !nested_vmx_check_vmcs12(vcpu))
@@ -7129,6 +7566,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7129 skip_emulated_instruction(vcpu); 7566 skip_emulated_instruction(vcpu);
7130 vmcs12 = get_vmcs12(vcpu); 7567 vmcs12 = get_vmcs12(vcpu);
7131 7568
7569 if (enable_shadow_vmcs)
7570 copy_shadow_to_vmcs12(vmx);
7571
7132 /* 7572 /*
7133 * The nested entry process starts with enforcing various prerequisites 7573 * The nested entry process starts with enforcing various prerequisites
7134 * on vmcs12 as required by the Intel SDM, and act appropriately when 7574 * on vmcs12 as required by the Intel SDM, and act appropriately when
@@ -7146,6 +7586,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7146 return 1; 7586 return 1;
7147 } 7587 }
7148 7588
7589 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
7590 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
7591 return 1;
7592 }
7593
7149 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && 7594 if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
7150 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { 7595 !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
7151 /*TODO: Also verify bits beyond physical address width are 0*/ 7596 /*TODO: Also verify bits beyond physical address width are 0*/
@@ -7204,6 +7649,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7204 } 7649 }
7205 7650
7206 /* 7651 /*
7652 * If the load IA32_EFER VM-entry control is 1, the following checks
7653 * are performed on the field for the IA32_EFER MSR:
7654 * - Bits reserved in the IA32_EFER MSR must be 0.
7655 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
 7656 * the "IA-32e mode guest" VM-entry control. It must also be identical
7657 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
7658 * CR0.PG) is 1.
7659 */
7660 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
7661 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
7662 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
7663 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
7664 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
7665 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
7666 nested_vmx_entry_failure(vcpu, vmcs12,
7667 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
7668 return 1;
7669 }
7670 }
7671
7672 /*
7673 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
7674 * IA32_EFER MSR must be 0 in the field for that register. In addition,
7675 * the values of the LMA and LME bits in the field must each be that of
7676 * the host address-space size VM-exit control.
7677 */
7678 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
7679 ia32e = (vmcs12->vm_exit_controls &
7680 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
7681 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
7682 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
7683 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
7684 nested_vmx_entry_failure(vcpu, vmcs12,
7685 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
7686 return 1;
7687 }
7688 }
7689
7690 /*
7207 * We're finally done with prerequisite checking, and can start with 7691 * We're finally done with prerequisite checking, and can start with
7208 * the nested entry. 7692 * the nested entry.
7209 */ 7693 */
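The two new checks encode the SDM consistency rules quoted in the comments: with the load-IA32_EFER entry control set, reserved EFER bits must be clear, EFER.LMA must match the IA-32e-mode guest entry control, and EFER.LME must match it as well whenever CR0.PG is set; the exit-side variant requires LMA and LME to both equal the host-address-space-size exit control. The guest-side predicate as a standalone function (kvm_valid_efer() handles the reserved-bit part in the real code):

#include <stdbool.h>
#include <stdint.h>

#define EFER_LME	(1ull << 8)
#define EFER_LMA	(1ull << 10)
#define CR0_PG		(1ull << 31)

static bool guest_efer_consistent(uint64_t guest_efer, uint64_t guest_cr0,
				  bool ia32e_mode_entry_ctl)
{
	if (ia32e_mode_entry_ctl != !!(guest_efer & EFER_LMA))
		return false;
	if ((guest_cr0 & CR0_PG) &&
	    ia32e_mode_entry_ctl != !!(guest_efer & EFER_LME))
		return false;
	return true;
}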
@@ -7223,6 +7707,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7223 vcpu->cpu = cpu; 7707 vcpu->cpu = cpu;
7224 put_cpu(); 7708 put_cpu();
7225 7709
7710 vmx_segment_cache_clear(vmx);
7711
7226 vmcs12->launch_state = 1; 7712 vmcs12->launch_state = 1;
7227 7713
7228 prepare_vmcs02(vcpu, vmcs12); 7714 prepare_vmcs02(vcpu, vmcs12);
@@ -7273,6 +7759,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7273 vcpu->arch.cr4_guest_owned_bits)); 7759 vcpu->arch.cr4_guest_owned_bits));
7274} 7760}
7275 7761
7762static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
7763 struct vmcs12 *vmcs12)
7764{
7765 u32 idt_vectoring;
7766 unsigned int nr;
7767
7768 if (vcpu->arch.exception.pending) {
7769 nr = vcpu->arch.exception.nr;
7770 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
7771
7772 if (kvm_exception_is_soft(nr)) {
7773 vmcs12->vm_exit_instruction_len =
7774 vcpu->arch.event_exit_inst_len;
7775 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
7776 } else
7777 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
7778
7779 if (vcpu->arch.exception.has_error_code) {
7780 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
7781 vmcs12->idt_vectoring_error_code =
7782 vcpu->arch.exception.error_code;
7783 }
7784
7785 vmcs12->idt_vectoring_info_field = idt_vectoring;
7786 } else if (vcpu->arch.nmi_pending) {
7787 vmcs12->idt_vectoring_info_field =
7788 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
7789 } else if (vcpu->arch.interrupt.pending) {
7790 nr = vcpu->arch.interrupt.nr;
7791 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
7792
7793 if (vcpu->arch.interrupt.soft) {
7794 idt_vectoring |= INTR_TYPE_SOFT_INTR;
7795 vmcs12->vm_entry_instruction_len =
7796 vcpu->arch.event_exit_inst_len;
7797 } else
7798 idt_vectoring |= INTR_TYPE_EXT_INTR;
7799
7800 vmcs12->idt_vectoring_info_field = idt_vectoring;
7801 }
7802}
7803
7276/* 7804/*
7277 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 7805 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
7278 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 7806 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
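vmcs12_save_pending_event() re-encodes whatever event L0 or L1 still had queued for L2 into the IDT-vectoring-information format that L1 reads after a VM exit: vector in bits 7:0, event type in bits 10:8, a deliver-error-code flag in bit 11, and a valid flag in bit 31, with the instruction length or error code stored in their companion fields. A standalone encoder for that layout (the mask values are written out explicitly here rather than taken from vmx.h):

#include <stdbool.h>
#include <stdint.h>

#define IDT_VEC_VALID		(1u << 31)
#define IDT_VEC_ERRCODE		(1u << 11)
#define IDT_VEC_TYPE_SHIFT	8

enum vmx_event_type {				/* VMX event-type encodings */
	EV_EXT_INTR = 0, EV_NMI = 2, EV_HARD_EXC = 3,
	EV_SOFT_INTR = 4, EV_SOFT_EXC = 6,
};

static uint32_t encode_idt_vectoring(unsigned int vector, enum vmx_event_type t,
				     bool has_error_code)
{
	uint32_t info = (vector & 0xff) |
			((uint32_t)t << IDT_VEC_TYPE_SHIFT) |
			IDT_VEC_VALID;

	if (has_error_code)
		info |= IDT_VEC_ERRCODE;
	return info;
}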
@@ -7284,7 +7812,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7284 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 7812 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
7285 * which already writes to vmcs12 directly. 7813 * which already writes to vmcs12 directly.
7286 */ 7814 */
7287void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7815static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7288{ 7816{
7289 /* update guest state fields: */ 7817 /* update guest state fields: */
7290 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 7818 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -7332,16 +7860,19 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7332 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 7860 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
7333 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 7861 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
7334 7862
7335 vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
7336 vmcs12->guest_interruptibility_info = 7863 vmcs12->guest_interruptibility_info =
7337 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 7864 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
7338 vmcs12->guest_pending_dbg_exceptions = 7865 vmcs12->guest_pending_dbg_exceptions =
7339 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 7866 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
7340 7867
7868 vmcs12->vm_entry_controls =
7869 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
7870 (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
7871
7341 /* TODO: These cannot have changed unless we have MSR bitmaps and 7872 /* TODO: These cannot have changed unless we have MSR bitmaps and
7342 * the relevant bit asks not to trap the change */ 7873 * the relevant bit asks not to trap the change */
7343 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 7874 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
7344 if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) 7875 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
7345 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 7876 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
7346 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 7877 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
7347 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 7878 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
@@ -7349,21 +7880,38 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7349 7880
7350 /* update exit information fields: */ 7881 /* update exit information fields: */
7351 7882
7352 vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); 7883 vmcs12->vm_exit_reason = to_vmx(vcpu)->exit_reason;
7353 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7884 vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7354 7885
7355 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7886 vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7356 vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 7887 if ((vmcs12->vm_exit_intr_info &
7357 vmcs12->idt_vectoring_info_field = 7888 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
7358 vmcs_read32(IDT_VECTORING_INFO_FIELD); 7889 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
7359 vmcs12->idt_vectoring_error_code = 7890 vmcs12->vm_exit_intr_error_code =
7360 vmcs_read32(IDT_VECTORING_ERROR_CODE); 7891 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
7892 vmcs12->idt_vectoring_info_field = 0;
7361 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 7893 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
7362 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7894 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7363 7895
7364 /* clear vm-entry fields which are to be cleared on exit */ 7896 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
7365 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 7897 /* vm_entry_intr_info_field is cleared on exit. Emulate this
7898 * instead of reading the real value. */
7366 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 7899 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
7900
7901 /*
 7902 * Transfer the event that L0 or L1 may have wanted to inject into
7903 * L2 to IDT_VECTORING_INFO_FIELD.
7904 */
7905 vmcs12_save_pending_event(vcpu, vmcs12);
7906 }
7907
7908 /*
7909 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
7910 * preserved above and would only end up incorrectly in L1.
7911 */
7912 vcpu->arch.nmi_injected = false;
7913 kvm_clear_exception_queue(vcpu);
7914 kvm_clear_interrupt_queue(vcpu);
7367} 7915}
7368 7916
7369/* 7917/*
@@ -7375,11 +7923,12 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7375 * Failures During or After Loading Guest State"). 7923 * Failures During or After Loading Guest State").
7376 * This function should be called when the active VMCS is L1's (vmcs01). 7924 * This function should be called when the active VMCS is L1's (vmcs01).
7377 */ 7925 */
7378void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 7926static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
7927 struct vmcs12 *vmcs12)
7379{ 7928{
7380 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 7929 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
7381 vcpu->arch.efer = vmcs12->host_ia32_efer; 7930 vcpu->arch.efer = vmcs12->host_ia32_efer;
7382 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 7931 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
7383 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 7932 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
7384 else 7933 else
7385 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 7934 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7387,6 +7936,7 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7387 7936
7388 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 7937 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
7389 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 7938 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
7939 vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
7390 /* 7940 /*
7391 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 7941 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
7392 * actually changed, because it depends on the current state of 7942 * actually changed, because it depends on the current state of
@@ -7445,6 +7995,9 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7445 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 7995 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
7446 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 7996 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
7447 vmcs12->host_ia32_perf_global_ctrl); 7997 vmcs12->host_ia32_perf_global_ctrl);
7998
7999 kvm_set_dr(vcpu, 7, 0x400);
8000 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
7448} 8001}
7449 8002
7450/* 8003/*
@@ -7458,6 +8011,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7458 int cpu; 8011 int cpu;
7459 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8012 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7460 8013
8014 /* trying to cancel vmlaunch/vmresume is a bug */
8015 WARN_ON_ONCE(vmx->nested.nested_run_pending);
8016
7461 leave_guest_mode(vcpu); 8017 leave_guest_mode(vcpu);
7462 prepare_vmcs12(vcpu, vmcs12); 8018 prepare_vmcs12(vcpu, vmcs12);
7463 8019
@@ -7468,6 +8024,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7468 vcpu->cpu = cpu; 8024 vcpu->cpu = cpu;
7469 put_cpu(); 8025 put_cpu();
7470 8026
8027 vmx_segment_cache_clear(vmx);
8028
7471 /* if no vmcs02 cache requested, remove the one we used */ 8029 /* if no vmcs02 cache requested, remove the one we used */
7472 if (VMCS02_POOL_SIZE == 0) 8030 if (VMCS02_POOL_SIZE == 0)
7473 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 8031 nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
@@ -7496,6 +8054,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
7496 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 8054 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
7497 } else 8055 } else
7498 nested_vmx_succeed(vcpu); 8056 nested_vmx_succeed(vcpu);
8057 if (enable_shadow_vmcs)
8058 vmx->nested.sync_shadow_vmcs = true;
7499} 8059}
7500 8060
7501/* 8061/*
@@ -7513,6 +8073,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
7513 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 8073 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
7514 vmcs12->exit_qualification = qualification; 8074 vmcs12->exit_qualification = qualification;
7515 nested_vmx_succeed(vcpu); 8075 nested_vmx_succeed(vcpu);
8076 if (enable_shadow_vmcs)
8077 to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
7516} 8078}
7517 8079
7518static int vmx_check_intercept(struct kvm_vcpu *vcpu, 8080static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -7590,6 +8152,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
7590 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8152 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7591 .hwapic_irr_update = vmx_hwapic_irr_update, 8153 .hwapic_irr_update = vmx_hwapic_irr_update,
7592 .hwapic_isr_update = vmx_hwapic_isr_update, 8154 .hwapic_isr_update = vmx_hwapic_isr_update,
8155 .sync_pir_to_irr = vmx_sync_pir_to_irr,
8156 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
7593 8157
7594 .set_tss_addr = vmx_set_tss_addr, 8158 .set_tss_addr = vmx_set_tss_addr,
7595 .get_tdp_level = get_ept_level, 8159 .get_tdp_level = get_ept_level,
@@ -7618,6 +8182,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7618 .set_tdp_cr3 = vmx_set_cr3, 8182 .set_tdp_cr3 = vmx_set_cr3,
7619 8183
7620 .check_intercept = vmx_check_intercept, 8184 .check_intercept = vmx_check_intercept,
8185 .handle_external_intr = vmx_handle_external_intr,
7621}; 8186};
7622 8187
7623static int __init vmx_init(void) 8188static int __init vmx_init(void)
@@ -7656,6 +8221,24 @@ static int __init vmx_init(void)
7656 (unsigned long *)__get_free_page(GFP_KERNEL); 8221 (unsigned long *)__get_free_page(GFP_KERNEL);
7657 if (!vmx_msr_bitmap_longmode_x2apic) 8222 if (!vmx_msr_bitmap_longmode_x2apic)
7658 goto out4; 8223 goto out4;
8224 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
8225 if (!vmx_vmread_bitmap)
8226 goto out5;
8227
8228 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
8229 if (!vmx_vmwrite_bitmap)
8230 goto out6;
8231
8232 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
8233 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
8234 /* shadowed read/write fields */
8235 for (i = 0; i < max_shadow_read_write_fields; i++) {
8236 clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap);
8237 clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap);
8238 }
8239 /* shadowed read only fields */
8240 for (i = 0; i < max_shadow_read_only_fields; i++)
8241 clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);
7659 8242
7660 /* 8243 /*
7661 * Allow direct access to the PC debug port (it is often used for I/O 8244 * Allow direct access to the PC debug port (it is often used for I/O
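vmx_init() now also builds VMREAD and VMWRITE permission bitmaps: both start all-ones, so every field access still exits to L0, and then the bits for the shadowed fields are cleared so the CPU satisfies L1's VMREAD/VMWRITE of those fields from the shadow VMCS directly; read-only shadowed fields are cleared only in the VMREAD bitmap. A standalone model of that initialization with a caller-supplied field list (names are hypothetical):

#include <stdint.h>
#include <string.h>

#define BITMAP_BYTES	4096			/* one page, one bit per field encoding */

static void clear_field_bit(uint8_t *bm, unsigned long field)
{
	bm[field / 8] &= ~(1u << (field % 8));
}

static void init_shadow_bitmaps(uint8_t *vmread_bm, uint8_t *vmwrite_bm,
				const unsigned long *rw_fields, int n_rw,
				const unsigned long *ro_fields, int n_ro)
{
	int i;

	memset(vmread_bm, 0xff, BITMAP_BYTES);	/* default: trap every access */
	memset(vmwrite_bm, 0xff, BITMAP_BYTES);

	for (i = 0; i < n_rw; i++) {		/* read/write shadowed fields */
		clear_field_bit(vmread_bm, rw_fields[i]);
		clear_field_bit(vmwrite_bm, rw_fields[i]);
	}
	for (i = 0; i < n_ro; i++)		/* read-only shadowed fields */
		clear_field_bit(vmread_bm, ro_fields[i]);
}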
@@ -7674,7 +8257,7 @@ static int __init vmx_init(void)
7674 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 8257 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
7675 __alignof__(struct vcpu_vmx), THIS_MODULE); 8258 __alignof__(struct vcpu_vmx), THIS_MODULE);
7676 if (r) 8259 if (r)
7677 goto out3; 8260 goto out7;
7678 8261
7679#ifdef CONFIG_KEXEC 8262#ifdef CONFIG_KEXEC
7680 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 8263 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -7692,7 +8275,7 @@ static int __init vmx_init(void)
7692 memcpy(vmx_msr_bitmap_longmode_x2apic, 8275 memcpy(vmx_msr_bitmap_longmode_x2apic,
7693 vmx_msr_bitmap_longmode, PAGE_SIZE); 8276 vmx_msr_bitmap_longmode, PAGE_SIZE);
7694 8277
7695 if (enable_apicv_reg_vid) { 8278 if (enable_apicv) {
7696 for (msr = 0x800; msr <= 0x8ff; msr++) 8279 for (msr = 0x800; msr <= 0x8ff; msr++)
7697 vmx_disable_intercept_msr_read_x2apic(msr); 8280 vmx_disable_intercept_msr_read_x2apic(msr);
7698 8281
@@ -7722,6 +8305,12 @@ static int __init vmx_init(void)
7722 8305
7723 return 0; 8306 return 0;
7724 8307
8308out7:
8309 free_page((unsigned long)vmx_vmwrite_bitmap);
8310out6:
8311 free_page((unsigned long)vmx_vmread_bitmap);
8312out5:
8313 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7725out4: 8314out4:
7726 free_page((unsigned long)vmx_msr_bitmap_longmode); 8315 free_page((unsigned long)vmx_msr_bitmap_longmode);
7727out3: 8316out3:
@@ -7743,6 +8332,8 @@ static void __exit vmx_exit(void)
7743 free_page((unsigned long)vmx_msr_bitmap_longmode); 8332 free_page((unsigned long)vmx_msr_bitmap_longmode);
7744 free_page((unsigned long)vmx_io_bitmap_b); 8333 free_page((unsigned long)vmx_io_bitmap_b);
7745 free_page((unsigned long)vmx_io_bitmap_a); 8334 free_page((unsigned long)vmx_io_bitmap_a);
8335 free_page((unsigned long)vmx_vmwrite_bitmap);
8336 free_page((unsigned long)vmx_vmread_bitmap);
7746 8337
7747#ifdef CONFIG_KEXEC 8338#ifdef CONFIG_KEXEC
7748 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); 8339 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1721324c271..05a8b1a2300d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
162 162
163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164 164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
166
167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 165static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
168{ 166{
169 int i; 167 int i;
@@ -263,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
263} 261}
264EXPORT_SYMBOL_GPL(kvm_set_apic_base); 262EXPORT_SYMBOL_GPL(kvm_set_apic_base);
265 263
264asmlinkage void kvm_spurious_fault(void)
265{
266 /* Fault while not rebooting. We want the trace. */
267 BUG();
268}
269EXPORT_SYMBOL_GPL(kvm_spurious_fault);
270
266#define EXCPT_BENIGN 0 271#define EXCPT_BENIGN 0
267#define EXCPT_CONTRIBUTORY 1 272#define EXCPT_CONTRIBUTORY 1
268#define EXCPT_PF 2 273#define EXCPT_PF 2
@@ -840,23 +845,17 @@ static const u32 emulated_msrs[] = {
840 MSR_IA32_MCG_CTL, 845 MSR_IA32_MCG_CTL,
841}; 846};
842 847
843static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 848bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
844{ 849{
845 u64 old_efer = vcpu->arch.efer;
846
847 if (efer & efer_reserved_bits) 850 if (efer & efer_reserved_bits)
848 return 1; 851 return false;
849
850 if (is_paging(vcpu)
851 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
852 return 1;
853 852
854 if (efer & EFER_FFXSR) { 853 if (efer & EFER_FFXSR) {
855 struct kvm_cpuid_entry2 *feat; 854 struct kvm_cpuid_entry2 *feat;
856 855
857 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 856 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
858 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) 857 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
859 return 1; 858 return false;
860 } 859 }
861 860
862 if (efer & EFER_SVME) { 861 if (efer & EFER_SVME) {
@@ -864,9 +863,24 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
864 863
865 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 864 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
866 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) 865 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
867 return 1; 866 return false;
868 } 867 }
869 868
869 return true;
870}
871EXPORT_SYMBOL_GPL(kvm_valid_efer);
872
873static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
874{
875 u64 old_efer = vcpu->arch.efer;
876
877 if (!kvm_valid_efer(vcpu, efer))
878 return 1;
879
880 if (is_paging(vcpu)
881 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
882 return 1;
883
870 efer &= ~EFER_LMA; 884 efer &= ~EFER_LMA;
871 efer |= vcpu->arch.efer & EFER_LMA; 885 efer |= vcpu->arch.efer & EFER_LMA;
872 886
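set_efer() is split so the stateless validity checks (reserved bits, FFXSR and SVME CPUID support) live in the exported kvm_valid_efer(), which the nested-VMX entry checks call without touching vCPU state, while the state-dependent rule (no LME toggling while paging is on) and the actual commit stay in set_efer(). A small sketch of that validate-then-commit split on a toy register, with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

#define REG_RESERVED_BITS	0xffffffffffff0000ull	/* hypothetical reserved bits */

static bool reg_valid(uint64_t val)		/* stateless checks only */
{
	return (val & REG_RESERVED_BITS) == 0;
}

static int reg_set(uint64_t *reg, uint64_t val)
{
	if (!reg_valid(val))
		return 1;			/* reject, as set_efer() does */
	/* state-dependent transition checks would go here */
	*reg = val;
	return 0;
}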
@@ -1079,6 +1093,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1079 u32 thresh_lo, thresh_hi; 1093 u32 thresh_lo, thresh_hi;
1080 int use_scaling = 0; 1094 int use_scaling = 0;
1081 1095
1096 /* tsc_khz can be zero if TSC calibration fails */
1097 if (this_tsc_khz == 0)
1098 return;
1099
1082 /* Compute a scale to convert nanoseconds in TSC cycles */ 1100 /* Compute a scale to convert nanoseconds in TSC cycles */
1083 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1101 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1084 &vcpu->arch.virtual_tsc_shift, 1102 &vcpu->arch.virtual_tsc_shift,
@@ -1156,20 +1174,23 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1156 ns = get_kernel_ns(); 1174 ns = get_kernel_ns();
1157 elapsed = ns - kvm->arch.last_tsc_nsec; 1175 elapsed = ns - kvm->arch.last_tsc_nsec;
1158 1176
1159 /* n.b - signed multiplication and division required */ 1177 if (vcpu->arch.virtual_tsc_khz) {
1160 usdiff = data - kvm->arch.last_tsc_write; 1178 /* n.b - signed multiplication and division required */
1179 usdiff = data - kvm->arch.last_tsc_write;
1161#ifdef CONFIG_X86_64 1180#ifdef CONFIG_X86_64
1162 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; 1181 usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1163#else 1182#else
1164 /* do_div() only does unsigned */ 1183 /* do_div() only does unsigned */
1165 asm("idivl %2; xor %%edx, %%edx" 1184 asm("idivl %2; xor %%edx, %%edx"
1166 : "=A"(usdiff) 1185 : "=A"(usdiff)
1167 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); 1186 : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
1168#endif 1187#endif
1169 do_div(elapsed, 1000); 1188 do_div(elapsed, 1000);
1170 usdiff -= elapsed; 1189 usdiff -= elapsed;
1171 if (usdiff < 0) 1190 if (usdiff < 0)
1172 usdiff = -usdiff; 1191 usdiff = -usdiff;
1192 } else
1193 usdiff = USEC_PER_SEC; /* disable TSC match window below */
1173 1194
1174 /* 1195 /*
1175 * Special case: TSC write with a small delta (1 second) of virtual 1196 * Special case: TSC write with a small delta (1 second) of virtual
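The guarded block above converts the delta between the newly written TSC value and the previous write into microseconds using the vCPU's virtual TSC rate, then subtracts the elapsed wall-clock microseconds; with virtual_tsc_khz equal to zero (TSC calibration failed) that division would be undefined, so usdiff is forced to a full second, which defeats the small-delta matching window that follows. The conversion as a standalone helper (a sketch, not the kernel's 32-bit do_div() variant):

#include <stdint.h>

#define USEC_PER_SEC	1000000LL

static int64_t tsc_write_usdiff(uint64_t data, uint64_t last_tsc_write,
				uint32_t virtual_tsc_khz, int64_t elapsed_ns)
{
	int64_t usdiff;

	if (!virtual_tsc_khz)
		return USEC_PER_SEC;		/* disable the matching window */

	usdiff = (int64_t)(data - last_tsc_write);
	usdiff = (usdiff * 1000) / virtual_tsc_khz;	/* TSC cycles -> usec */
	usdiff -= elapsed_ns / 1000;			/* elapsed ns -> usec */
	return usdiff < 0 ? -usdiff : usdiff;
}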
@@ -2034,7 +2055,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2034 case MSR_P6_EVNTSEL0: 2055 case MSR_P6_EVNTSEL0:
2035 case MSR_P6_EVNTSEL1: 2056 case MSR_P6_EVNTSEL1:
2036 if (kvm_pmu_msr(vcpu, msr)) 2057 if (kvm_pmu_msr(vcpu, msr))
2037 return kvm_pmu_set_msr(vcpu, msr, data); 2058 return kvm_pmu_set_msr(vcpu, msr_info);
2038 2059
2039 if (pr || data != 0) 2060 if (pr || data != 0)
2040 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 2061 vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
@@ -2080,7 +2101,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2080 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 2101 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2081 return xen_hvm_config(vcpu, data); 2102 return xen_hvm_config(vcpu, data);
2082 if (kvm_pmu_msr(vcpu, msr)) 2103 if (kvm_pmu_msr(vcpu, msr))
2083 return kvm_pmu_set_msr(vcpu, msr, data); 2104 return kvm_pmu_set_msr(vcpu, msr_info);
2084 if (!ignore_msrs) { 2105 if (!ignore_msrs) {
2085 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 2106 vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2086 msr, data); 2107 msr, data);
@@ -2479,7 +2500,6 @@ int kvm_dev_ioctl_check_extension(long ext)
2479 case KVM_CAP_USER_NMI: 2500 case KVM_CAP_USER_NMI:
2480 case KVM_CAP_REINJECT_CONTROL: 2501 case KVM_CAP_REINJECT_CONTROL:
2481 case KVM_CAP_IRQ_INJECT_STATUS: 2502 case KVM_CAP_IRQ_INJECT_STATUS:
2482 case KVM_CAP_ASSIGN_DEV_IRQ:
2483 case KVM_CAP_IRQFD: 2503 case KVM_CAP_IRQFD:
2484 case KVM_CAP_IOEVENTFD: 2504 case KVM_CAP_IOEVENTFD:
2485 case KVM_CAP_PIT2: 2505 case KVM_CAP_PIT2:
@@ -2497,10 +2517,12 @@ int kvm_dev_ioctl_check_extension(long ext)
2497 case KVM_CAP_XSAVE: 2517 case KVM_CAP_XSAVE:
2498 case KVM_CAP_ASYNC_PF: 2518 case KVM_CAP_ASYNC_PF:
2499 case KVM_CAP_GET_TSC_KHZ: 2519 case KVM_CAP_GET_TSC_KHZ:
2500 case KVM_CAP_PCI_2_3:
2501 case KVM_CAP_KVMCLOCK_CTRL: 2520 case KVM_CAP_KVMCLOCK_CTRL:
2502 case KVM_CAP_READONLY_MEM: 2521 case KVM_CAP_READONLY_MEM:
2503 case KVM_CAP_IRQFD_RESAMPLE: 2522#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2523 case KVM_CAP_ASSIGN_DEV_IRQ:
2524 case KVM_CAP_PCI_2_3:
2525#endif
2504 r = 1; 2526 r = 1;
2505 break; 2527 break;
2506 case KVM_CAP_COALESCED_MMIO: 2528 case KVM_CAP_COALESCED_MMIO:
@@ -2521,9 +2543,11 @@ int kvm_dev_ioctl_check_extension(long ext)
2521 case KVM_CAP_PV_MMU: /* obsolete */ 2543 case KVM_CAP_PV_MMU: /* obsolete */
2522 r = 0; 2544 r = 0;
2523 break; 2545 break;
2546#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2524 case KVM_CAP_IOMMU: 2547 case KVM_CAP_IOMMU:
2525 r = iommu_present(&pci_bus_type); 2548 r = iommu_present(&pci_bus_type);
2526 break; 2549 break;
2550#endif
2527 case KVM_CAP_MCE: 2551 case KVM_CAP_MCE:
2528 r = KVM_MAX_MCE_BANKS; 2552 r = KVM_MAX_MCE_BANKS;
2529 break; 2553 break;
@@ -2679,6 +2703,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2679static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2703static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2680 struct kvm_lapic_state *s) 2704 struct kvm_lapic_state *s)
2681{ 2705{
2706 kvm_x86_ops->sync_pir_to_irr(vcpu);
2682 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2707 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2683 2708
2684 return 0; 2709 return 0;
@@ -2696,7 +2721,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2696static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2721static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2697 struct kvm_interrupt *irq) 2722 struct kvm_interrupt *irq)
2698{ 2723{
2699 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS) 2724 if (irq->irq >= KVM_NR_INTERRUPTS)
2700 return -EINVAL; 2725 return -EINVAL;
2701 if (irqchip_in_kernel(vcpu->kvm)) 2726 if (irqchip_in_kernel(vcpu->kvm))
2702 return -ENXIO; 2727 return -ENXIO;
@@ -2819,10 +2844,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
         events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
         events->nmi.pad = 0;
 
-        events->sipi_vector = vcpu->arch.sipi_vector;
+        events->sipi_vector = 0; /* never valid when reporting to user space */
 
         events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-                         | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                          | KVM_VCPUEVENT_VALID_SHADOW);
         memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -2853,8 +2877,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
         vcpu->arch.nmi_pending = events->nmi.pending;
         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
-        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
-                vcpu->arch.sipi_vector = events->sipi_vector;
+        if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+            kvm_vcpu_has_lapic(vcpu))
+                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
         kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -3478,13 +3503,15 @@ out:
         return r;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+                          bool line_status)
 {
         if (!irqchip_in_kernel(kvm))
                 return -ENXIO;
 
         irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                        irq_event->irq, irq_event->level);
+                                        irq_event->irq, irq_event->level,
+                                        line_status);
         return 0;
 }
 
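A sketch of the userspace path this hunk serves (illustration only, not part of the diff): both KVM_IRQ_LINE and KVM_IRQ_LINE_STATUS land in kvm_vm_ioctl_irq_line(), and the new line_status argument presumably records which of the two ioctls was used before being passed down to kvm_set_irq(). Assuming vm_fd is a KVM VM descriptor:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: assert a GSI and read back the delivery status. */
static int assert_gsi(int vm_fd, unsigned int gsi)
{
        struct kvm_irq_level irq = {
                .irq = gsi,
                .level = 1,
        };

        if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq) < 0)
                return -1;
        return irq.status;      /* irq and status share a union in kvm_irq_level */
}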
@@ -4752,11 +4779,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
-                                  bool write_fault_to_shadow_pgtable)
+                                  bool write_fault_to_shadow_pgtable,
+                                  int emulation_type)
 {
         gpa_t gpa = cr2;
         pfn_t pfn;
 
+        if (emulation_type & EMULTYPE_NO_REEXECUTE)
+                return false;
+
         if (!vcpu->arch.mmu.direct_map) {
                 /*
                  * Write permission should be allowed since only
@@ -4899,8 +4930,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
         if (r != EMULATION_OK)  {
                 if (emulation_type & EMULTYPE_TRAP_UD)
                         return EMULATE_FAIL;
-                if (reexecute_instruction(vcpu, cr2,
-                                          write_fault_to_spt))
+                if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+                                          emulation_type))
                         return EMULATE_DONE;
                 if (emulation_type & EMULTYPE_SKIP)
                         return EMULATE_FAIL;
@@ -4930,7 +4961,8 @@ restart:
                 return EMULATE_DONE;
 
         if (r == EMULATION_FAILED) {
-                if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
+                if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+                                          emulation_type))
                         return EMULATE_DONE;
 
                 return handle_emulation_failure(vcpu);
@@ -5641,14 +5673,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
         u64 eoi_exit_bitmap[4];
+        u32 tmr[8];
+
+        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+                return;
 
         memset(eoi_exit_bitmap, 0, 32);
+        memset(tmr, 0, 32);
 
-        kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+        kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
         kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+        kvm_apic_update_tmr(vcpu, tmr);
 }
 
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
@@ -5656,7 +5694,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         int r;
         bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                 vcpu->run->request_interrupt_window;
-        bool req_immediate_exit = 0;
+        bool req_immediate_exit = false;
 
         if (vcpu->requests) {
                 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5698,24 +5736,30 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                         record_steal_time(vcpu);
                 if (kvm_check_request(KVM_REQ_NMI, vcpu))
                         process_nmi(vcpu);
-                req_immediate_exit =
-                        kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
                 if (kvm_check_request(KVM_REQ_PMU, vcpu))
                         kvm_handle_pmu_event(vcpu);
                 if (kvm_check_request(KVM_REQ_PMI, vcpu))
                         kvm_deliver_pmi(vcpu);
-                if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
-                        update_eoi_exitmap(vcpu);
+                if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+                        vcpu_scan_ioapic(vcpu);
         }
 
         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+                kvm_apic_accept_events(vcpu);
+                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+                        r = 1;
+                        goto out;
+                }
+
                 inject_pending_event(vcpu);
 
                 /* enable NMI/IRQ window open exits if needed */
                 if (vcpu->arch.nmi_pending)
-                        kvm_x86_ops->enable_nmi_window(vcpu);
+                        req_immediate_exit =
+                                kvm_x86_ops->enable_nmi_window(vcpu) != 0;
                 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                        kvm_x86_ops->enable_irq_window(vcpu);
+                        req_immediate_exit =
+                                kvm_x86_ops->enable_irq_window(vcpu) != 0;
 
                 if (kvm_lapic_enabled(vcpu)) {
                         /*
@@ -5794,7 +5838,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
         vcpu->mode = OUTSIDE_GUEST_MODE;
         smp_wmb();
-        local_irq_enable();
+
+        /* Interrupt is enabled by handle_external_intr() */
+        kvm_x86_ops->handle_external_intr(vcpu);
 
         ++vcpu->stat.exits;
 
@@ -5843,16 +5889,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
         int r;
         struct kvm *kvm = vcpu->kvm;
 
-        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-                pr_debug("vcpu %d received sipi with vector # %x\n",
-                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
-                kvm_lapic_reset(vcpu);
-                r = kvm_vcpu_reset(vcpu);
-                if (r)
-                        return r;
-                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-        }
-
         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
         r = vapic_enter(vcpu);
         if (r) {
@@ -5869,8 +5905,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                         srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                         kvm_vcpu_block(vcpu);
                         vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-                        if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-                        {
+                        if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+                                kvm_apic_accept_events(vcpu);
                                 switch(vcpu->arch.mp_state) {
                                 case KVM_MP_STATE_HALTED:
                                         vcpu->arch.mp_state =
@@ -5878,7 +5914,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                                 case KVM_MP_STATE_RUNNABLE:
                                         vcpu->arch.apf.halted = false;
                                         break;
-                                case KVM_MP_STATE_SIPI_RECEIVED:
+                                case KVM_MP_STATE_INIT_RECEIVED:
+                                        break;
                                 default:
                                         r = -EINTR;
                                         break;
@@ -6013,6 +6050,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                 kvm_vcpu_block(vcpu);
+                kvm_apic_accept_events(vcpu);
                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                 r = -EAGAIN;
                 goto out;
@@ -6169,6 +6207,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
+        kvm_apic_accept_events(vcpu);
         mp_state->mp_state = vcpu->arch.mp_state;
         return 0;
 }
@@ -6176,7 +6215,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
-        vcpu->arch.mp_state = mp_state->mp_state;
+        if (!kvm_vcpu_has_lapic(vcpu) &&
+            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+                return -EINVAL;
+
+        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+        } else
+                vcpu->arch.mp_state = mp_state->mp_state;
         kvm_make_request(KVM_REQ_EVENT, vcpu);
         return 0;
 }
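The hunk above changes the userspace-visible contract of KVM_SET_MP_STATE: a vCPU without an in-kernel local APIC now only accepts RUNNABLE, and a legacy SIPI_RECEIVED value is converted into INIT_RECEIVED plus a pending SIPI event. A minimal restore-side sketch (illustration only; vcpu_fd is assumed to be a KVM vCPU descriptor):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper used when restoring a saved vCPU state. */
static int restore_mp_state(int vcpu_fd, __u32 saved_state)
{
        struct kvm_mp_state mp = { .mp_state = saved_state };

        /*
         * Old snapshots may still carry KVM_MP_STATE_SIPI_RECEIVED; with this
         * change the kernel accepts it and latches the SIPI internally, while
         * non-RUNNABLE states are rejected for vCPUs lacking a local APIC.
         */
        return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
}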
@@ -6475,9 +6522,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
         r = vcpu_load(vcpu);
         if (r)
                 return r;
-        r = kvm_vcpu_reset(vcpu);
-        if (r == 0)
-                r = kvm_mmu_setup(vcpu);
+        kvm_vcpu_reset(vcpu);
+        r = kvm_mmu_setup(vcpu);
         vcpu_put(vcpu);
 
         return r;
@@ -6514,7 +6560,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
         kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
         atomic_set(&vcpu->arch.nmi_queued, 0);
         vcpu->arch.nmi_pending = 0;
@@ -6541,7 +6587,18 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
         vcpu->arch.regs_avail = ~0;
         vcpu->arch.regs_dirty = ~0;
 
-        return kvm_x86_ops->vcpu_reset(vcpu);
+        kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+        struct kvm_segment cs;
+
+        kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+        cs.selector = vector << 8;
+        cs.base = vector << 12;
+        kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+        kvm_rip_write(vcpu, 0);
 }
 
 int kvm_arch_hardware_enable(void *garbage)
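A quick worked example of the mapping the new kvm_vcpu_deliver_sipi_vector() performs, matching the usual real-mode startup-IPI convention: for a SIPI vector of 0x9f, CS.selector becomes 0x9f << 8 = 0x9f00, CS.base becomes 0x9f << 12 = 0x9f000 and RIP is cleared, so the woken vCPU starts fetching at guest physical address 0x9f000 (CS.base + RIP).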
@@ -6706,8 +6763,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
         }
         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+        if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+                r = -ENOMEM;
                 goto fail_free_mce_banks;
+        }
 
         r = fx_init(vcpu);
         if (r)
@@ -6811,6 +6870,23 @@ void kvm_arch_sync_events(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+        if (current->mm == kvm->mm) {
+                /*
+                 * Free memory regions allocated on behalf of userspace,
+                 * unless the the memory map has changed due to process exit
+                 * or fd copying.
+                 */
+                struct kvm_userspace_memory_region mem;
+                memset(&mem, 0, sizeof(mem));
+                mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+                kvm_set_memory_region(kvm, &mem);
+
+                mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+                kvm_set_memory_region(kvm, &mem);
+
+                mem.slot = TSS_PRIVATE_MEMSLOT;
+                kvm_set_memory_region(kvm, &mem);
+        }
         kvm_iommu_unmap_guest(kvm);
         kfree(kvm->arch.vpic);
         kfree(kvm->arch.vioapic);
@@ -6903,24 +6979,21 @@ out_free:
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
-                                struct kvm_memory_slot old,
                                 struct kvm_userspace_memory_region *mem,
-                                bool user_alloc)
+                                enum kvm_mr_change change)
 {
-        int npages = memslot->npages;
-
         /*
          * Only private memory slots need to be mapped here since
          * KVM_SET_MEMORY_REGION ioctl is no longer supported.
          */
-        if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+        if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
                 unsigned long userspace_addr;
 
                 /*
                  * MAP_SHARED to prevent internal slot pages from being moved
                  * by fork()/COW.
                  */
-                userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+                userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
                                          PROT_READ | PROT_WRITE,
                                          MAP_SHARED | MAP_ANONYMOUS, 0);
 
@@ -6935,17 +7008,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                 struct kvm_userspace_memory_region *mem,
-                                struct kvm_memory_slot old,
-                                bool user_alloc)
+                                const struct kvm_memory_slot *old,
+                                enum kvm_mr_change change)
 {
 
-        int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
+        int nr_mmu_pages = 0;
 
-        if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
+        if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
                 int ret;
 
-                ret = vm_munmap(old.userspace_addr,
-                                old.npages * PAGE_SIZE);
+                ret = vm_munmap(old->userspace_addr,
+                                old->npages * PAGE_SIZE);
                 if (ret < 0)
                         printk(KERN_WARNING
                                "kvm_vm_ioctl_set_memory_region: "
@@ -6962,14 +7035,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
          * Existing largepage mappings are destroyed here and new ones will
          * not be created until the end of the logging.
          */
-        if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+        if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
                 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
         /*
          * If memory slot is created, or moved, we need to clear all
          * mmio sptes.
          */
-        if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
-                kvm_mmu_zap_all(kvm);
+        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+                kvm_mmu_zap_mmio_sptes(kvm);
                 kvm_reload_remote_mmus(kvm);
         }
 }
@@ -6991,7 +7064,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
         return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                 !vcpu->arch.apf.halted)
                 || !list_empty_careful(&vcpu->async_pf.done)
-                || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+                || kvm_apic_has_events(vcpu)
                 || atomic_read(&vcpu->arch.nmi_queued) ||
                 (kvm_arch_interrupt_allowed(vcpu) &&
                  kvm_cpu_has_interrupt(vcpu));