aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-24 16:07:18 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-24 16:07:18 -0500
commit89f883372fa60f604d136924baf3e89ff1870e9e (patch)
treecb69b0a14957945ba00d3d392bf9ccbbef56f3b8 /arch
parent9e2d59ad580d590134285f361a0e80f0e98c0207 (diff)
parent6b73a96065e89dc9fa75ba4f78b1aa3a3bbd0470 (diff)
Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti: "KVM updates for the 3.9 merge window, including x86 real mode emulation fixes, stronger memory slot interface restrictions, mmu_lock spinlock hold time reduction, improved handling of large page faults on shadow, initial APICv HW acceleration support, s390 channel IO based virtio, amongst others" * tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits) Revert "KVM: MMU: lazily drop large spte" x86: pvclock kvm: align allocation size to page size KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr x86 emulator: fix parity calculation for AAD instruction KVM: PPC: BookE: Handle alignment interrupts booke: Added DBCR4 SPR number KVM: PPC: booke: Allow multiple exception types KVM: PPC: booke: use vcpu reference from thread_struct KVM: Remove user_alloc from struct kvm_memory_slot KVM: VMX: disable apicv by default KVM: s390: Fix handling of iscs. KVM: MMU: cleanup __direct_map KVM: MMU: remove pt_access in mmu_set_spte KVM: MMU: cleanup mapping-level KVM: MMU: lazily drop large spte KVM: VMX: cleanup vmx_set_cr0(). KVM: VMX: add missing exit names to VMX_EXIT_REASONS array KVM: VMX: disable SMEP feature when guest is in non-paging mode KVM: Remove duplicate text in api.txt Revert "KVM: MMU: split kvm_mmu_free_page" ...
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/include/asm/kvm_host.h4
-rw-r--r--arch/ia64/kvm/kvm-ia64.c8
-rw-r--r--arch/ia64/kvm/lapic.h6
-rw-r--r--arch/powerpc/include/asm/kvm_host.h8
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h12
-rw-r--r--arch/powerpc/include/asm/reg.h2
-rw-r--r--arch/powerpc/include/asm/reg_booke.h1
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h6
-rw-r--r--arch/powerpc/kernel/asm-offsets.c2
-rw-r--r--arch/powerpc/kvm/Makefile9
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c30
-rw-r--r--arch/powerpc/kvm/book3s_hv.c2
-rw-r--r--arch/powerpc/kvm/book3s_pr.c5
-rw-r--r--arch/powerpc/kvm/booke.c70
-rw-r--r--arch/powerpc/kvm/booke.h1
-rw-r--r--arch/powerpc/kvm/booke_emulate.c3
-rw-r--r--arch/powerpc/kvm/booke_interrupts.S49
-rw-r--r--arch/powerpc/kvm/e500.c16
-rw-r--r--arch/powerpc/kvm/e500.h1
-rw-r--r--arch/powerpc/kvm/e500_mmu.c (renamed from arch/powerpc/kvm/e500_tlb.c)659
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c699
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.h18
-rw-r--r--arch/powerpc/kvm/emulate.c5
-rw-r--r--arch/powerpc/kvm/powerpc.c17
-rw-r--r--arch/s390/include/asm/irq.h1
-rw-r--r--arch/s390/include/asm/kvm_host.h15
-rw-r--r--arch/s390/kernel/irq.c1
-rw-r--r--arch/s390/kvm/intercept.c45
-rw-r--r--arch/s390/kvm/interrupt.c264
-rw-r--r--arch/s390/kvm/kvm-s390.c50
-rw-r--r--arch/s390/kvm/kvm-s390.h46
-rw-r--r--arch/s390/kvm/priv.c316
-rw-r--r--arch/s390/kvm/sigp.c10
-rw-r--r--arch/s390/kvm/trace-s390.h26
-rw-r--r--arch/x86/include/asm/kvm_host.h26
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/vmx.h18
-rw-r--r--arch/x86/include/uapi/asm/vmx.h9
-rw-r--r--arch/x86/kernel/kvmclock.c11
-rw-r--r--arch/x86/kvm/emulate.c673
-rw-r--r--arch/x86/kvm/i8254.c1
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/irq.c74
-rw-r--r--arch/x86/kvm/lapic.c140
-rw-r--r--arch/x86/kvm/lapic.h34
-rw-r--r--arch/x86/kvm/mmu.c168
-rw-r--r--arch/x86/kvm/mmutrace.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h106
-rw-r--r--arch/x86/kvm/svm.c24
-rw-r--r--arch/x86/kvm/vmx.c714
-rw-r--r--arch/x86/kvm/x86.c168
51 files changed, 3044 insertions, 1539 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 6d6a5ac48d85..cfa74983c675 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,9 +23,7 @@
23#ifndef __ASM_KVM_HOST_H 23#ifndef __ASM_KVM_HOST_H
24#define __ASM_KVM_HOST_H 24#define __ASM_KVM_HOST_H
25 25
26#define KVM_MEMORY_SLOTS 32 26#define KVM_USER_MEM_SLOTS 32
27/* memory slots that does not exposed to userspace */
28#define KVM_PRIVATE_MEM_SLOTS 4
29 27
30#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 28#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
31 29
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd1c51555038..ad3126a58644 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -955,7 +955,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
955 kvm_mem.guest_phys_addr; 955 kvm_mem.guest_phys_addr;
956 kvm_userspace_mem.memory_size = kvm_mem.memory_size; 956 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
957 r = kvm_vm_ioctl_set_memory_region(kvm, 957 r = kvm_vm_ioctl_set_memory_region(kvm,
958 &kvm_userspace_mem, 0); 958 &kvm_userspace_mem, false);
959 if (r) 959 if (r)
960 goto out; 960 goto out;
961 break; 961 break;
@@ -1580,7 +1580,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1580 struct kvm_memory_slot *memslot, 1580 struct kvm_memory_slot *memslot,
1581 struct kvm_memory_slot old, 1581 struct kvm_memory_slot old,
1582 struct kvm_userspace_memory_region *mem, 1582 struct kvm_userspace_memory_region *mem,
1583 int user_alloc) 1583 bool user_alloc)
1584{ 1584{
1585 unsigned long i; 1585 unsigned long i;
1586 unsigned long pfn; 1586 unsigned long pfn;
@@ -1611,7 +1611,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1611void kvm_arch_commit_memory_region(struct kvm *kvm, 1611void kvm_arch_commit_memory_region(struct kvm *kvm,
1612 struct kvm_userspace_memory_region *mem, 1612 struct kvm_userspace_memory_region *mem,
1613 struct kvm_memory_slot old, 1613 struct kvm_memory_slot old,
1614 int user_alloc) 1614 bool user_alloc)
1615{ 1615{
1616 return; 1616 return;
1617} 1617}
@@ -1834,7 +1834,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1834 mutex_lock(&kvm->slots_lock); 1834 mutex_lock(&kvm->slots_lock);
1835 1835
1836 r = -EINVAL; 1836 r = -EINVAL;
1837 if (log->slot >= KVM_MEMORY_SLOTS) 1837 if (log->slot >= KVM_USER_MEM_SLOTS)
1838 goto out; 1838 goto out;
1839 1839
1840 memslot = id_to_memslot(kvm->memslots, log->slot); 1840 memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index c5f92a926a9a..c3e2935b6db4 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -27,4 +27,10 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
27#define kvm_apic_present(x) (true) 27#define kvm_apic_present(x) (true)
28#define kvm_lapic_enabled(x) (true) 28#define kvm_lapic_enabled(x) (true)
29 29
30static inline bool kvm_apic_vid_enabled(void)
31{
32 /* IA64 has no apicv supporting, do nothing here */
33 return false;
34}
35
30#endif 36#endif
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 03d7beae89a0..d1bb86074721 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -37,10 +37,8 @@
37 37
38#define KVM_MAX_VCPUS NR_CPUS 38#define KVM_MAX_VCPUS NR_CPUS
39#define KVM_MAX_VCORES NR_CPUS 39#define KVM_MAX_VCORES NR_CPUS
40#define KVM_MEMORY_SLOTS 32 40#define KVM_USER_MEM_SLOTS 32
41/* memory slots that does not exposed to userspace */ 41#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS
42#define KVM_PRIVATE_MEM_SLOTS 4
43#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
44 42
45#ifdef CONFIG_KVM_MMIO 43#ifdef CONFIG_KVM_MMIO
46#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 44#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -523,6 +521,8 @@ struct kvm_vcpu_arch {
523 u8 sane; 521 u8 sane;
524 u8 cpu_type; 522 u8 cpu_type;
525 u8 hcall_needed; 523 u8 hcall_needed;
524 u8 epr_enabled;
525 u8 epr_needed;
526 526
527 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 527 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
528 528
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 572aa7530619..44a657adf416 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -44,12 +44,11 @@ enum emulation_result {
44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */ 44 EMULATE_DO_DCR, /* kvm_run filled with DCR request */
45 EMULATE_FAIL, /* can't emulate this instruction */ 45 EMULATE_FAIL, /* can't emulate this instruction */
46 EMULATE_AGAIN, /* something went wrong. go again */ 46 EMULATE_AGAIN, /* something went wrong. go again */
47 EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */
47}; 48};
48 49
49extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 50extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
50extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 51extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
51extern char kvmppc_handlers_start[];
52extern unsigned long kvmppc_handler_len;
53extern void kvmppc_handler_highmem(void); 52extern void kvmppc_handler_highmem(void);
54 53
55extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu); 54extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
@@ -263,6 +262,15 @@ static inline void kvm_linear_init(void)
263{} 262{}
264#endif 263#endif
265 264
265static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
266{
267#ifdef CONFIG_KVM_BOOKE_HV
268 mtspr(SPRN_GEPR, epr);
269#elif defined(CONFIG_BOOKE)
270 vcpu->arch.epr = epr;
271#endif
272}
273
266int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, 274int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
267 struct kvm_config_tlb *cfg); 275 struct kvm_config_tlb *cfg);
268int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, 276int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7035e608f3fa..e66586122030 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -956,8 +956,6 @@
956#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9 956#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9
957#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9 957#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9
958#endif 958#endif
959#define SPRN_SPRG_RVCPU SPRN_SPRG1
960#define SPRN_SPRG_WVCPU SPRN_SPRG1
961#endif 959#endif
962 960
963#ifdef CONFIG_8xx 961#ifdef CONFIG_8xx
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index e07e6af5e1ff..b417de3cc2c4 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -56,6 +56,7 @@
56#define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ 56#define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */
57#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ 57#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */
58#define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ 58#define SPRN_DBCR2 0x136 /* Debug Control Register 2 */
59#define SPRN_DBCR4 0x233 /* Debug Control Register 4 */
59#define SPRN_MSRP 0x137 /* MSR Protect Register */ 60#define SPRN_MSRP 0x137 /* MSR Protect Register */
60#define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ 61#define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */
61#define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ 62#define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 2fba8a66fb10..16064d00adb9 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -114,7 +114,10 @@ struct kvm_regs {
114/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */ 114/* Embedded Floating Point (SPE) -- IVOR32-34 if KVM_SREGS_E_IVOR */
115#define KVM_SREGS_E_SPE (1 << 9) 115#define KVM_SREGS_E_SPE (1 << 9)
116 116
117/* External Proxy (EXP) -- EPR */ 117/*
118 * DEPRECATED! USE ONE_REG FOR THIS ONE!
119 * External Proxy (EXP) -- EPR
120 */
118#define KVM_SREGS_EXP (1 << 10) 121#define KVM_SREGS_EXP (1 << 10)
119 122
120/* External PID (E.PD) -- EPSC/EPLC */ 123/* External PID (E.PD) -- EPSC/EPLC */
@@ -412,5 +415,6 @@ struct kvm_get_htab_header {
412#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) 415#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
413 416
414#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) 417#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
418#define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
415 419
416#endif /* __LINUX_KVM_POWERPC_H */ 420#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 781190367292..b6c17ec9b169 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -118,7 +118,7 @@ int main(void)
118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER 118#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); 119 DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
120#endif 120#endif
121#ifdef CONFIG_KVM_BOOKE_HV 121#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
122 DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); 122 DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
123#endif 123#endif
124 124
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1e473d46322c..b772eded8c26 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -10,7 +10,8 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
10 eventfd.o) 10 eventfd.o)
11 11
12CFLAGS_44x_tlb.o := -I. 12CFLAGS_44x_tlb.o := -I.
13CFLAGS_e500_tlb.o := -I. 13CFLAGS_e500_mmu.o := -I.
14CFLAGS_e500_mmu_host.o := -I.
14CFLAGS_emulate.o := -I. 15CFLAGS_emulate.o := -I.
15 16
16common-objs-y += powerpc.o emulate.o 17common-objs-y += powerpc.o emulate.o
@@ -35,7 +36,8 @@ kvm-e500-objs := \
35 booke_emulate.o \ 36 booke_emulate.o \
36 booke_interrupts.o \ 37 booke_interrupts.o \
37 e500.o \ 38 e500.o \
38 e500_tlb.o \ 39 e500_mmu.o \
40 e500_mmu_host.o \
39 e500_emulate.o 41 e500_emulate.o
40kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) 42kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
41 43
@@ -45,7 +47,8 @@ kvm-e500mc-objs := \
45 booke_emulate.o \ 47 booke_emulate.o \
46 bookehv_interrupts.o \ 48 bookehv_interrupts.o \
47 e500mc.o \ 49 e500mc.o \
48 e500_tlb.o \ 50 e500_mmu.o \
51 e500_mmu_host.o \
49 e500_emulate.o 52 e500_emulate.o
50kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) 53kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
51 54
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index d31a716f7f2b..836c56975e21 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -34,6 +34,8 @@
34#define OP_31_XOP_MTSRIN 242 34#define OP_31_XOP_MTSRIN 242
35#define OP_31_XOP_TLBIEL 274 35#define OP_31_XOP_TLBIEL 274
36#define OP_31_XOP_TLBIE 306 36#define OP_31_XOP_TLBIE 306
37/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
38#define OP_31_XOP_FAKE_SC1 308
37#define OP_31_XOP_SLBMTE 402 39#define OP_31_XOP_SLBMTE 402
38#define OP_31_XOP_SLBIE 434 40#define OP_31_XOP_SLBIE 434
39#define OP_31_XOP_SLBIA 498 41#define OP_31_XOP_SLBIA 498
@@ -170,6 +172,32 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
170 vcpu->arch.mmu.tlbie(vcpu, addr, large); 172 vcpu->arch.mmu.tlbie(vcpu, addr, large);
171 break; 173 break;
172 } 174 }
175#ifdef CONFIG_KVM_BOOK3S_64_PR
176 case OP_31_XOP_FAKE_SC1:
177 {
178 /* SC 1 papr hypercalls */
179 ulong cmd = kvmppc_get_gpr(vcpu, 3);
180 int i;
181
182 if ((vcpu->arch.shared->msr & MSR_PR) ||
183 !vcpu->arch.papr_enabled) {
184 emulated = EMULATE_FAIL;
185 break;
186 }
187
188 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
189 break;
190
191 run->papr_hcall.nr = cmd;
192 for (i = 0; i < 9; ++i) {
193 ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
194 run->papr_hcall.args[i] = gpr;
195 }
196
197 emulated = EMULATE_DO_PAPR;
198 break;
199 }
200#endif
173 case OP_31_XOP_EIOIO: 201 case OP_31_XOP_EIOIO:
174 break; 202 break;
175 case OP_31_XOP_SLBMTE: 203 case OP_31_XOP_SLBMTE:
@@ -427,6 +455,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
427 case SPRN_PMC3_GEKKO: 455 case SPRN_PMC3_GEKKO:
428 case SPRN_PMC4_GEKKO: 456 case SPRN_PMC4_GEKKO:
429 case SPRN_WPAR_GEKKO: 457 case SPRN_WPAR_GEKKO:
458 case SPRN_MSSSR0:
430 break; 459 break;
431unprivileged: 460unprivileged:
432 default: 461 default:
@@ -523,6 +552,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
523 case SPRN_PMC3_GEKKO: 552 case SPRN_PMC3_GEKKO:
524 case SPRN_PMC4_GEKKO: 553 case SPRN_PMC4_GEKKO:
525 case SPRN_WPAR_GEKKO: 554 case SPRN_WPAR_GEKKO:
555 case SPRN_MSSSR0:
526 *spr_val = 0; 556 *spr_val = 0;
527 break; 557 break;
528 default: 558 default:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 71d0c90b62bf..80dcc53a1aba 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1549,7 +1549,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
1549 mutex_lock(&kvm->slots_lock); 1549 mutex_lock(&kvm->slots_lock);
1550 1550
1551 r = -EINVAL; 1551 r = -EINVAL;
1552 if (log->slot >= KVM_MEMORY_SLOTS) 1552 if (log->slot >= KVM_USER_MEM_SLOTS)
1553 goto out; 1553 goto out;
1554 1554
1555 memslot = id_to_memslot(kvm->memslots, log->slot); 1555 memslot = id_to_memslot(kvm->memslots, log->slot);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 6702442ca818..5e93438afb06 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -762,6 +762,11 @@ program_interrupt:
762 run->exit_reason = KVM_EXIT_MMIO; 762 run->exit_reason = KVM_EXIT_MMIO;
763 r = RESUME_HOST_NV; 763 r = RESUME_HOST_NV;
764 break; 764 break;
765 case EMULATE_DO_PAPR:
766 run->exit_reason = KVM_EXIT_PAPR_HCALL;
767 vcpu->arch.hcall_needed = 1;
768 r = RESUME_HOST_NV;
769 break;
765 default: 770 default:
766 BUG(); 771 BUG();
767 } 772 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 69f114015780..020923e43134 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -182,6 +182,14 @@ static void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu,
182 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); 182 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
183} 183}
184 184
185static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags,
186 ulong esr_flags)
187{
188 vcpu->arch.queued_dear = dear_flags;
189 vcpu->arch.queued_esr = esr_flags;
190 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT);
191}
192
185void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) 193void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
186{ 194{
187 vcpu->arch.queued_esr = esr_flags; 195 vcpu->arch.queued_esr = esr_flags;
@@ -300,13 +308,22 @@ static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
300#endif 308#endif
301} 309}
302 310
311static unsigned long get_guest_epr(struct kvm_vcpu *vcpu)
312{
313#ifdef CONFIG_KVM_BOOKE_HV
314 return mfspr(SPRN_GEPR);
315#else
316 return vcpu->arch.epr;
317#endif
318}
319
303/* Deliver the interrupt of the corresponding priority, if possible. */ 320/* Deliver the interrupt of the corresponding priority, if possible. */
304static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, 321static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
305 unsigned int priority) 322 unsigned int priority)
306{ 323{
307 int allowed = 0; 324 int allowed = 0;
308 ulong msr_mask = 0; 325 ulong msr_mask = 0;
309 bool update_esr = false, update_dear = false; 326 bool update_esr = false, update_dear = false, update_epr = false;
310 ulong crit_raw = vcpu->arch.shared->critical; 327 ulong crit_raw = vcpu->arch.shared->critical;
311 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); 328 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
312 bool crit; 329 bool crit;
@@ -330,9 +347,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
330 keep_irq = true; 347 keep_irq = true;
331 } 348 }
332 349
350 if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
351 update_epr = true;
352
333 switch (priority) { 353 switch (priority) {
334 case BOOKE_IRQPRIO_DTLB_MISS: 354 case BOOKE_IRQPRIO_DTLB_MISS:
335 case BOOKE_IRQPRIO_DATA_STORAGE: 355 case BOOKE_IRQPRIO_DATA_STORAGE:
356 case BOOKE_IRQPRIO_ALIGNMENT:
336 update_dear = true; 357 update_dear = true;
337 /* fall through */ 358 /* fall through */
338 case BOOKE_IRQPRIO_INST_STORAGE: 359 case BOOKE_IRQPRIO_INST_STORAGE:
@@ -346,7 +367,6 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
346 case BOOKE_IRQPRIO_SPE_FP_DATA: 367 case BOOKE_IRQPRIO_SPE_FP_DATA:
347 case BOOKE_IRQPRIO_SPE_FP_ROUND: 368 case BOOKE_IRQPRIO_SPE_FP_ROUND:
348 case BOOKE_IRQPRIO_AP_UNAVAIL: 369 case BOOKE_IRQPRIO_AP_UNAVAIL:
349 case BOOKE_IRQPRIO_ALIGNMENT:
350 allowed = 1; 370 allowed = 1;
351 msr_mask = MSR_CE | MSR_ME | MSR_DE; 371 msr_mask = MSR_CE | MSR_ME | MSR_DE;
352 int_class = INT_CLASS_NONCRIT; 372 int_class = INT_CLASS_NONCRIT;
@@ -408,6 +428,8 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
408 set_guest_esr(vcpu, vcpu->arch.queued_esr); 428 set_guest_esr(vcpu, vcpu->arch.queued_esr);
409 if (update_dear == true) 429 if (update_dear == true)
410 set_guest_dear(vcpu, vcpu->arch.queued_dear); 430 set_guest_dear(vcpu, vcpu->arch.queued_dear);
431 if (update_epr == true)
432 kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
411 433
412 new_msr &= msr_mask; 434 new_msr &= msr_mask;
413#if defined(CONFIG_64BIT) 435#if defined(CONFIG_64BIT)
@@ -581,6 +603,11 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
581 603
582 kvmppc_core_check_exceptions(vcpu); 604 kvmppc_core_check_exceptions(vcpu);
583 605
606 if (vcpu->requests) {
607 /* Exception delivery raised request; start over */
608 return 1;
609 }
610
584 if (vcpu->arch.shared->msr & MSR_WE) { 611 if (vcpu->arch.shared->msr & MSR_WE) {
585 local_irq_enable(); 612 local_irq_enable();
586 kvm_vcpu_block(vcpu); 613 kvm_vcpu_block(vcpu);
@@ -610,6 +637,13 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
610 r = 0; 637 r = 0;
611 } 638 }
612 639
640 if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) {
641 vcpu->run->epr.epr = 0;
642 vcpu->arch.epr_needed = true;
643 vcpu->run->exit_reason = KVM_EXIT_EPR;
644 r = 0;
645 }
646
613 return r; 647 return r;
614} 648}
615 649
@@ -945,6 +979,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
945 r = RESUME_GUEST; 979 r = RESUME_GUEST;
946 break; 980 break;
947 981
982 case BOOKE_INTERRUPT_ALIGNMENT:
983 kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear,
984 vcpu->arch.fault_esr);
985 r = RESUME_GUEST;
986 break;
987
948#ifdef CONFIG_KVM_BOOKE_HV 988#ifdef CONFIG_KVM_BOOKE_HV
949 case BOOKE_INTERRUPT_HV_SYSCALL: 989 case BOOKE_INTERRUPT_HV_SYSCALL:
950 if (!(vcpu->arch.shared->msr & MSR_PR)) { 990 if (!(vcpu->arch.shared->msr & MSR_PR)) {
@@ -1388,6 +1428,11 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1388 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); 1428 &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
1389 break; 1429 break;
1390 } 1430 }
1431 case KVM_REG_PPC_EPR: {
1432 u32 epr = get_guest_epr(vcpu);
1433 r = put_user(epr, (u32 __user *)(long)reg->addr);
1434 break;
1435 }
1391#if defined(CONFIG_64BIT) 1436#if defined(CONFIG_64BIT)
1392 case KVM_REG_PPC_EPCR: 1437 case KVM_REG_PPC_EPCR:
1393 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); 1438 r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
@@ -1420,6 +1465,13 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
1420 (u64 __user *)(long)reg->addr, sizeof(u64)); 1465 (u64 __user *)(long)reg->addr, sizeof(u64));
1421 break; 1466 break;
1422 } 1467 }
1468 case KVM_REG_PPC_EPR: {
1469 u32 new_epr;
1470 r = get_user(new_epr, (u32 __user *)(long)reg->addr);
1471 if (!r)
1472 kvmppc_set_epr(vcpu, new_epr);
1473 break;
1474 }
1423#if defined(CONFIG_64BIT) 1475#if defined(CONFIG_64BIT)
1424 case KVM_REG_PPC_EPCR: { 1476 case KVM_REG_PPC_EPCR: {
1425 u32 new_epcr; 1477 u32 new_epcr;
@@ -1556,7 +1608,9 @@ int __init kvmppc_booke_init(void)
1556{ 1608{
1557#ifndef CONFIG_KVM_BOOKE_HV 1609#ifndef CONFIG_KVM_BOOKE_HV
1558 unsigned long ivor[16]; 1610 unsigned long ivor[16];
1611 unsigned long *handler = kvmppc_booke_handler_addr;
1559 unsigned long max_ivor = 0; 1612 unsigned long max_ivor = 0;
1613 unsigned long handler_len;
1560 int i; 1614 int i;
1561 1615
1562 /* We install our own exception handlers by hijacking IVPR. IVPR must 1616 /* We install our own exception handlers by hijacking IVPR. IVPR must
@@ -1589,14 +1643,16 @@ int __init kvmppc_booke_init(void)
1589 1643
1590 for (i = 0; i < 16; i++) { 1644 for (i = 0; i < 16; i++) {
1591 if (ivor[i] > max_ivor) 1645 if (ivor[i] > max_ivor)
1592 max_ivor = ivor[i]; 1646 max_ivor = i;
1593 1647
1648 handler_len = handler[i + 1] - handler[i];
1594 memcpy((void *)kvmppc_booke_handlers + ivor[i], 1649 memcpy((void *)kvmppc_booke_handlers + ivor[i],
1595 kvmppc_handlers_start + i * kvmppc_handler_len, 1650 (void *)handler[i], handler_len);
1596 kvmppc_handler_len);
1597 } 1651 }
1598 flush_icache_range(kvmppc_booke_handlers, 1652
1599 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); 1653 handler_len = handler[max_ivor + 1] - handler[max_ivor];
1654 flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
1655 ivor[max_ivor] + handler_len);
1600#endif /* !BOOKE_HV */ 1656#endif /* !BOOKE_HV */
1601 return 0; 1657 return 0;
1602} 1658}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index e9b88e433f64..5fd1ba693579 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -65,6 +65,7 @@
65 (1 << BOOKE_IRQPRIO_CRITICAL)) 65 (1 << BOOKE_IRQPRIO_CRITICAL))
66 66
67extern unsigned long kvmppc_booke_handlers; 67extern unsigned long kvmppc_booke_handlers;
68extern unsigned long kvmppc_booke_handler_addr[];
68 69
69void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 70void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
70void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 71void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 4685b8cf2249..27a4b2877c10 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -269,6 +269,9 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
269 case SPRN_ESR: 269 case SPRN_ESR:
270 *spr_val = vcpu->arch.shared->esr; 270 *spr_val = vcpu->arch.shared->esr;
271 break; 271 break;
272 case SPRN_EPR:
273 *spr_val = vcpu->arch.epr;
274 break;
272 case SPRN_CSRR0: 275 case SPRN_CSRR0:
273 *spr_val = vcpu->arch.csrr0; 276 *spr_val = vcpu->arch.csrr0;
274 break; 277 break;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index bb46b32f9813..f4bb55c96517 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -45,18 +45,21 @@
45 (1<<BOOKE_INTERRUPT_DEBUG)) 45 (1<<BOOKE_INTERRUPT_DEBUG))
46 46
47#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ 47#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
48 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 48 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
49 (1<<BOOKE_INTERRUPT_ALIGNMENT))
49 50
50#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ 51#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
51 (1<<BOOKE_INTERRUPT_INST_STORAGE) | \ 52 (1<<BOOKE_INTERRUPT_INST_STORAGE) | \
52 (1<<BOOKE_INTERRUPT_PROGRAM) | \ 53 (1<<BOOKE_INTERRUPT_PROGRAM) | \
53 (1<<BOOKE_INTERRUPT_DTLB_MISS)) 54 (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
55 (1<<BOOKE_INTERRUPT_ALIGNMENT))
54 56
55.macro KVM_HANDLER ivor_nr scratch srr0 57.macro KVM_HANDLER ivor_nr scratch srr0
56_GLOBAL(kvmppc_handler_\ivor_nr) 58_GLOBAL(kvmppc_handler_\ivor_nr)
57 /* Get pointer to vcpu and record exit number. */ 59 /* Get pointer to vcpu and record exit number. */
58 mtspr \scratch , r4 60 mtspr \scratch , r4
59 mfspr r4, SPRN_SPRG_RVCPU 61 mfspr r4, SPRN_SPRG_THREAD
62 lwz r4, THREAD_KVM_VCPU(r4)
60 stw r3, VCPU_GPR(R3)(r4) 63 stw r3, VCPU_GPR(R3)(r4)
61 stw r5, VCPU_GPR(R5)(r4) 64 stw r5, VCPU_GPR(R5)(r4)
62 stw r6, VCPU_GPR(R6)(r4) 65 stw r6, VCPU_GPR(R6)(r4)
@@ -73,6 +76,14 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
73 bctr 76 bctr
74.endm 77.endm
75 78
79.macro KVM_HANDLER_ADDR ivor_nr
80 .long kvmppc_handler_\ivor_nr
81.endm
82
83.macro KVM_HANDLER_END
84 .long kvmppc_handlers_end
85.endm
86
76_GLOBAL(kvmppc_handlers_start) 87_GLOBAL(kvmppc_handlers_start)
77KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 88KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
78KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 89KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
@@ -93,9 +104,7 @@ KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
93KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 104KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
94KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 105KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
95KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 106KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
96 107_GLOBAL(kvmppc_handlers_end)
97_GLOBAL(kvmppc_handler_len)
98 .long kvmppc_handler_1 - kvmppc_handler_0
99 108
100/* Registers: 109/* Registers:
101 * SPRG_SCRATCH0: guest r4 110 * SPRG_SCRATCH0: guest r4
@@ -402,9 +411,6 @@ lightweight_exit:
402 lwz r8, kvmppc_booke_handlers@l(r8) 411 lwz r8, kvmppc_booke_handlers@l(r8)
403 mtspr SPRN_IVPR, r8 412 mtspr SPRN_IVPR, r8
404 413
405 /* Save vcpu pointer for the exception handlers. */
406 mtspr SPRN_SPRG_WVCPU, r4
407
408 lwz r5, VCPU_SHARED(r4) 414 lwz r5, VCPU_SHARED(r4)
409 415
410 /* Can't switch the stack pointer until after IVPR is switched, 416 /* Can't switch the stack pointer until after IVPR is switched,
@@ -463,6 +469,31 @@ lightweight_exit:
463 lwz r4, VCPU_GPR(R4)(r4) 469 lwz r4, VCPU_GPR(R4)(r4)
464 rfi 470 rfi
465 471
472 .data
473 .align 4
474 .globl kvmppc_booke_handler_addr
475kvmppc_booke_handler_addr:
476KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL
477KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK
478KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE
479KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE
480KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL
481KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT
482KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM
483KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL
484KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL
485KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL
486KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER
487KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT
488KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG
489KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS
490KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS
491KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG
492KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL
493KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA
494KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND
495KVM_HANDLER_END /*Always keep this in end*/
496
466#ifdef CONFIG_SPE 497#ifdef CONFIG_SPE
467_GLOBAL(kvmppc_save_guest_spe) 498_GLOBAL(kvmppc_save_guest_spe)
468 cmpi 0,r3,0 499 cmpi 0,r3,0
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index b479ed77c515..6dd4de7802bf 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -491,6 +491,9 @@ static int __init kvmppc_e500_init(void)
491{ 491{
492 int r, i; 492 int r, i;
493 unsigned long ivor[3]; 493 unsigned long ivor[3];
494 /* Process remaining handlers above the generic first 16 */
495 unsigned long *handler = &kvmppc_booke_handler_addr[16];
496 unsigned long handler_len;
494 unsigned long max_ivor = 0; 497 unsigned long max_ivor = 0;
495 498
496 r = kvmppc_core_check_processor_compat(); 499 r = kvmppc_core_check_processor_compat();
@@ -506,15 +509,16 @@ static int __init kvmppc_e500_init(void)
506 ivor[1] = mfspr(SPRN_IVOR33); 509 ivor[1] = mfspr(SPRN_IVOR33);
507 ivor[2] = mfspr(SPRN_IVOR34); 510 ivor[2] = mfspr(SPRN_IVOR34);
508 for (i = 0; i < 3; i++) { 511 for (i = 0; i < 3; i++) {
509 if (ivor[i] > max_ivor) 512 if (ivor[i] > ivor[max_ivor])
510 max_ivor = ivor[i]; 513 max_ivor = i;
511 514
515 handler_len = handler[i + 1] - handler[i];
512 memcpy((void *)kvmppc_booke_handlers + ivor[i], 516 memcpy((void *)kvmppc_booke_handlers + ivor[i],
513 kvmppc_handlers_start + (i + 16) * kvmppc_handler_len, 517 (void *)handler[i], handler_len);
514 kvmppc_handler_len);
515 } 518 }
516 flush_icache_range(kvmppc_booke_handlers, 519 handler_len = handler[max_ivor + 1] - handler[max_ivor];
517 kvmppc_booke_handlers + max_ivor + kvmppc_handler_len); 520 flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
521 ivor[max_ivor] + handler_len);
518 522
519 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); 523 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
520} 524}
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index c70d37ed770a..41cefd43655f 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -28,6 +28,7 @@
28 28
29#define E500_TLB_VALID 1 29#define E500_TLB_VALID 1
30#define E500_TLB_BITMAP 2 30#define E500_TLB_BITMAP 2
31#define E500_TLB_TLB0 (1 << 2)
31 32
32struct tlbe_ref { 33struct tlbe_ref {
33 pfn_t pfn; 34 pfn_t pfn;
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_mmu.c
index cf3f18012371..5c4475983f78 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -1,10 +1,11 @@
1/* 1/*
2 * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. 2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 * 3 *
4 * Author: Yu Liu, yu.liu@freescale.com 4 * Author: Yu Liu, yu.liu@freescale.com
5 * Scott Wood, scottwood@freescale.com 5 * Scott Wood, scottwood@freescale.com
6 * Ashish Kalra, ashish.kalra@freescale.com 6 * Ashish Kalra, ashish.kalra@freescale.com
7 * Varun Sethi, varun.sethi@freescale.com 7 * Varun Sethi, varun.sethi@freescale.com
8 * Alexander Graf, agraf@suse.de
8 * 9 *
9 * Description: 10 * Description:
10 * This file is based on arch/powerpc/kvm/44x_tlb.c, 11 * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -33,10 +34,7 @@
33#include "e500.h" 34#include "e500.h"
34#include "trace.h" 35#include "trace.h"
35#include "timing.h" 36#include "timing.h"
36 37#include "e500_mmu_host.h"
37#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
38
39static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
40 38
41static inline unsigned int gtlb0_get_next_victim( 39static inline unsigned int gtlb0_get_next_victim(
42 struct kvmppc_vcpu_e500 *vcpu_e500) 40 struct kvmppc_vcpu_e500 *vcpu_e500)
@@ -50,174 +48,6 @@ static inline unsigned int gtlb0_get_next_victim(
50 return victim; 48 return victim;
51} 49}
52 50
53static inline unsigned int tlb1_max_shadow_size(void)
54{
55 /* reserve one entry for magic page */
56 return host_tlb_params[1].entries - tlbcam_index - 1;
57}
58
59static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
60{
61 return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
62}
63
64static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
65{
66 /* Mask off reserved bits. */
67 mas3 &= MAS3_ATTRIB_MASK;
68
69#ifndef CONFIG_KVM_BOOKE_HV
70 if (!usermode) {
71 /* Guest is in supervisor mode,
72 * so we need to translate guest
73 * supervisor permissions into user permissions. */
74 mas3 &= ~E500_TLB_USER_PERM_MASK;
75 mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
76 }
77 mas3 |= E500_TLB_SUPER_PERM_MASK;
78#endif
79 return mas3;
80}
81
82static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
83{
84#ifdef CONFIG_SMP
85 return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
86#else
87 return mas2 & MAS2_ATTRIB_MASK;
88#endif
89}
90
91/*
92 * writing shadow tlb entry to host TLB
93 */
94static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
95 uint32_t mas0)
96{
97 unsigned long flags;
98
99 local_irq_save(flags);
100 mtspr(SPRN_MAS0, mas0);
101 mtspr(SPRN_MAS1, stlbe->mas1);
102 mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
103 mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
104 mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
105#ifdef CONFIG_KVM_BOOKE_HV
106 mtspr(SPRN_MAS8, stlbe->mas8);
107#endif
108 asm volatile("isync; tlbwe" : : : "memory");
109
110#ifdef CONFIG_KVM_BOOKE_HV
111 /* Must clear mas8 for other host tlbwe's */
112 mtspr(SPRN_MAS8, 0);
113 isync();
114#endif
115 local_irq_restore(flags);
116
117 trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
118 stlbe->mas2, stlbe->mas7_3);
119}
120
121/*
122 * Acquire a mas0 with victim hint, as if we just took a TLB miss.
123 *
124 * We don't care about the address we're searching for, other than that it's
125 * in the right set and is not present in the TLB. Using a zero PID and a
126 * userspace address means we don't have to set and then restore MAS5, or
127 * calculate a proper MAS6 value.
128 */
129static u32 get_host_mas0(unsigned long eaddr)
130{
131 unsigned long flags;
132 u32 mas0;
133
134 local_irq_save(flags);
135 mtspr(SPRN_MAS6, 0);
136 asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
137 mas0 = mfspr(SPRN_MAS0);
138 local_irq_restore(flags);
139
140 return mas0;
141}
142
143/* sesel is for tlb1 only */
144static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
145 int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
146{
147 u32 mas0;
148
149 if (tlbsel == 0) {
150 mas0 = get_host_mas0(stlbe->mas2);
151 __write_host_tlbe(stlbe, mas0);
152 } else {
153 __write_host_tlbe(stlbe,
154 MAS0_TLBSEL(1) |
155 MAS0_ESEL(to_htlb1_esel(sesel)));
156 }
157}
158
159#ifdef CONFIG_KVM_E500V2
160void kvmppc_map_magic(struct kvm_vcpu *vcpu)
161{
162 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
163 struct kvm_book3e_206_tlb_entry magic;
164 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
165 unsigned int stid;
166 pfn_t pfn;
167
168 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
169 get_page(pfn_to_page(pfn));
170
171 preempt_disable();
172 stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
173
174 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
175 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
176 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
177 magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
178 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
179 magic.mas8 = 0;
180
181 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
182 preempt_enable();
183}
184#endif
185
186static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
187 int tlbsel, int esel)
188{
189 struct kvm_book3e_206_tlb_entry *gtlbe =
190 get_entry(vcpu_e500, tlbsel, esel);
191
192 if (tlbsel == 1 &&
193 vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) {
194 u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
195 int hw_tlb_indx;
196 unsigned long flags;
197
198 local_irq_save(flags);
199 while (tmp) {
200 hw_tlb_indx = __ilog2_u64(tmp & -tmp);
201 mtspr(SPRN_MAS0,
202 MAS0_TLBSEL(1) |
203 MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
204 mtspr(SPRN_MAS1, 0);
205 asm volatile("tlbwe");
206 vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
207 tmp &= tmp - 1;
208 }
209 mb();
210 vcpu_e500->g2h_tlb1_map[esel] = 0;
211 vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP;
212 local_irq_restore(flags);
213
214 return;
215 }
216
217 /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
218 kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
219}
220
221static int tlb0_set_base(gva_t addr, int sets, int ways) 51static int tlb0_set_base(gva_t addr, int sets, int ways)
222{ 52{
223 int set_base; 53 int set_base;
@@ -296,70 +126,6 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
296 return -1; 126 return -1;
297} 127}
298 128
299static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
300 struct kvm_book3e_206_tlb_entry *gtlbe,
301 pfn_t pfn)
302{
303 ref->pfn = pfn;
304 ref->flags = E500_TLB_VALID;
305
306 if (tlbe_is_writable(gtlbe))
307 kvm_set_pfn_dirty(pfn);
308}
309
310static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
311{
312 if (ref->flags & E500_TLB_VALID) {
313 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
314 ref->flags = 0;
315 }
316}
317
318static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
319{
320 if (vcpu_e500->g2h_tlb1_map)
321 memset(vcpu_e500->g2h_tlb1_map, 0,
322 sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
323 if (vcpu_e500->h2g_tlb1_rmap)
324 memset(vcpu_e500->h2g_tlb1_rmap, 0,
325 sizeof(unsigned int) * host_tlb_params[1].entries);
326}
327
328static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
329{
330 int tlbsel = 0;
331 int i;
332
333 for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
334 struct tlbe_ref *ref =
335 &vcpu_e500->gtlb_priv[tlbsel][i].ref;
336 kvmppc_e500_ref_release(ref);
337 }
338}
339
340static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
341{
342 int stlbsel = 1;
343 int i;
344
345 kvmppc_e500_tlbil_all(vcpu_e500);
346
347 for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
348 struct tlbe_ref *ref =
349 &vcpu_e500->tlb_refs[stlbsel][i];
350 kvmppc_e500_ref_release(ref);
351 }
352
353 clear_tlb_privs(vcpu_e500);
354}
355
356void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
357{
358 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
359 clear_tlb_refs(vcpu_e500);
360 clear_tlb1_bitmap(vcpu_e500);
361}
362
363static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, 129static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
364 unsigned int eaddr, int as) 130 unsigned int eaddr, int as)
365{ 131{
@@ -385,216 +151,6 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
385 | (as ? MAS6_SAS : 0); 151 | (as ? MAS6_SAS : 0);
386} 152}
387 153
388/* TID must be supplied by the caller */
389static inline void kvmppc_e500_setup_stlbe(
390 struct kvm_vcpu *vcpu,
391 struct kvm_book3e_206_tlb_entry *gtlbe,
392 int tsize, struct tlbe_ref *ref, u64 gvaddr,
393 struct kvm_book3e_206_tlb_entry *stlbe)
394{
395 pfn_t pfn = ref->pfn;
396 u32 pr = vcpu->arch.shared->msr & MSR_PR;
397
398 BUG_ON(!(ref->flags & E500_TLB_VALID));
399
400 /* Force IPROT=0 for all guest mappings. */
401 stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
402 stlbe->mas2 = (gvaddr & MAS2_EPN) |
403 e500_shadow_mas2_attrib(gtlbe->mas2, pr);
404 stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
405 e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
406
407#ifdef CONFIG_KVM_BOOKE_HV
408 stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
409#endif
410}
411
412static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
413 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
414 int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
415 struct tlbe_ref *ref)
416{
417 struct kvm_memory_slot *slot;
418 unsigned long pfn = 0; /* silence GCC warning */
419 unsigned long hva;
420 int pfnmap = 0;
421 int tsize = BOOK3E_PAGESZ_4K;
422
423 /*
424 * Translate guest physical to true physical, acquiring
425 * a page reference if it is normal, non-reserved memory.
426 *
427 * gfn_to_memslot() must succeed because otherwise we wouldn't
428 * have gotten this far. Eventually we should just pass the slot
429 * pointer through from the first lookup.
430 */
431 slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
432 hva = gfn_to_hva_memslot(slot, gfn);
433
434 if (tlbsel == 1) {
435 struct vm_area_struct *vma;
436 down_read(&current->mm->mmap_sem);
437
438 vma = find_vma(current->mm, hva);
439 if (vma && hva >= vma->vm_start &&
440 (vma->vm_flags & VM_PFNMAP)) {
441 /*
442 * This VMA is a physically contiguous region (e.g.
443 * /dev/mem) that bypasses normal Linux page
444 * management. Find the overlap between the
445 * vma and the memslot.
446 */
447
448 unsigned long start, end;
449 unsigned long slot_start, slot_end;
450
451 pfnmap = 1;
452
453 start = vma->vm_pgoff;
454 end = start +
455 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
456
457 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
458
459 slot_start = pfn - (gfn - slot->base_gfn);
460 slot_end = slot_start + slot->npages;
461
462 if (start < slot_start)
463 start = slot_start;
464 if (end > slot_end)
465 end = slot_end;
466
467 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
468 MAS1_TSIZE_SHIFT;
469
470 /*
471 * e500 doesn't implement the lowest tsize bit,
472 * or 1K pages.
473 */
474 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
475
476 /*
477 * Now find the largest tsize (up to what the guest
478 * requested) that will cover gfn, stay within the
479 * range, and for which gfn and pfn are mutually
480 * aligned.
481 */
482
483 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
484 unsigned long gfn_start, gfn_end, tsize_pages;
485 tsize_pages = 1 << (tsize - 2);
486
487 gfn_start = gfn & ~(tsize_pages - 1);
488 gfn_end = gfn_start + tsize_pages;
489
490 if (gfn_start + pfn - gfn < start)
491 continue;
492 if (gfn_end + pfn - gfn > end)
493 continue;
494 if ((gfn & (tsize_pages - 1)) !=
495 (pfn & (tsize_pages - 1)))
496 continue;
497
498 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
499 pfn &= ~(tsize_pages - 1);
500 break;
501 }
502 } else if (vma && hva >= vma->vm_start &&
503 (vma->vm_flags & VM_HUGETLB)) {
504 unsigned long psize = vma_kernel_pagesize(vma);
505
506 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
507 MAS1_TSIZE_SHIFT;
508
509 /*
510 * Take the largest page size that satisfies both host
511 * and guest mapping
512 */
513 tsize = min(__ilog2(psize) - 10, tsize);
514
515 /*
516 * e500 doesn't implement the lowest tsize bit,
517 * or 1K pages.
518 */
519 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
520 }
521
522 up_read(&current->mm->mmap_sem);
523 }
524
525 if (likely(!pfnmap)) {
526 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
527 pfn = gfn_to_pfn_memslot(slot, gfn);
528 if (is_error_noslot_pfn(pfn)) {
529 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
530 (long)gfn);
531 return;
532 }
533
534 /* Align guest and physical address to page map boundaries */
535 pfn &= ~(tsize_pages - 1);
536 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
537 }
538
539 /* Drop old ref and setup new one. */
540 kvmppc_e500_ref_release(ref);
541 kvmppc_e500_ref_setup(ref, gtlbe, pfn);
542
543 kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
544 ref, gvaddr, stlbe);
545
546 /* Clear i-cache for new pages */
547 kvmppc_mmu_flush_icache(pfn);
548
549 /* Drop refcount on page, so that mmu notifiers can clear it */
550 kvm_release_pfn_clean(pfn);
551}
552
553/* XXX only map the one-one case, for now use TLB0 */
554static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
555 int esel,
556 struct kvm_book3e_206_tlb_entry *stlbe)
557{
558 struct kvm_book3e_206_tlb_entry *gtlbe;
559 struct tlbe_ref *ref;
560
561 gtlbe = get_entry(vcpu_e500, 0, esel);
562 ref = &vcpu_e500->gtlb_priv[0][esel].ref;
563
564 kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
565 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
566 gtlbe, 0, stlbe, ref);
567}
568
569/* Caller must ensure that the specified guest TLB entry is safe to insert into
570 * the shadow TLB. */
571/* XXX for both one-one and one-to-many , for now use TLB1 */
572static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
573 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
574 struct kvm_book3e_206_tlb_entry *stlbe, int esel)
575{
576 struct tlbe_ref *ref;
577 unsigned int victim;
578
579 victim = vcpu_e500->host_tlb1_nv++;
580
581 if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
582 vcpu_e500->host_tlb1_nv = 0;
583
584 ref = &vcpu_e500->tlb_refs[1][victim];
585 kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
586
587 vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim;
588 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
589 if (vcpu_e500->h2g_tlb1_rmap[victim]) {
590 unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim];
591 vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim);
592 }
593 vcpu_e500->h2g_tlb1_rmap[victim] = esel;
594
595 return victim;
596}
597
598static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) 154static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
599{ 155{
600 int size = vcpu_e500->gtlb_params[1].entries; 156 int size = vcpu_e500->gtlb_params[1].entries;
@@ -683,8 +239,8 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
683 for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) 239 for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
684 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); 240 kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
685 241
686 /* Invalidate all vcpu id mappings */ 242 /* Invalidate all host shadow mappings */
687 kvmppc_e500_tlbil_all(vcpu_e500); 243 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
688 244
689 return EMULATE_DONE; 245 return EMULATE_DONE;
690} 246}
@@ -713,8 +269,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
713 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 269 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
714 } 270 }
715 271
716 /* Invalidate all vcpu id mappings */ 272 /* Invalidate all host shadow mappings */
717 kvmppc_e500_tlbil_all(vcpu_e500); 273 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
718 274
719 return EMULATE_DONE; 275 return EMULATE_DONE;
720} 276}
@@ -834,27 +390,11 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
834 return EMULATE_DONE; 390 return EMULATE_DONE;
835} 391}
836 392
837/* sesel is for tlb1 only */
838static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
839 struct kvm_book3e_206_tlb_entry *gtlbe,
840 struct kvm_book3e_206_tlb_entry *stlbe,
841 int stlbsel, int sesel)
842{
843 int stid;
844
845 preempt_disable();
846 stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
847
848 stlbe->mas1 |= MAS1_TID(stid);
849 write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
850 preempt_enable();
851}
852
853int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) 393int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
854{ 394{
855 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 395 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
856 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; 396 struct kvm_book3e_206_tlb_entry *gtlbe;
857 int tlbsel, esel, stlbsel, sesel; 397 int tlbsel, esel;
858 int recal = 0; 398 int recal = 0;
859 399
860 tlbsel = get_tlb_tlbsel(vcpu); 400 tlbsel = get_tlb_tlbsel(vcpu);
@@ -892,40 +432,16 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
892 432
893 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ 433 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
894 if (tlbe_is_host_safe(vcpu, gtlbe)) { 434 if (tlbe_is_host_safe(vcpu, gtlbe)) {
895 u64 eaddr; 435 u64 eaddr = get_tlb_eaddr(gtlbe);
896 u64 raddr; 436 u64 raddr = get_tlb_raddr(gtlbe);
897 437
898 switch (tlbsel) { 438 if (tlbsel == 0) {
899 case 0:
900 /* TLB0 */
901 gtlbe->mas1 &= ~MAS1_TSIZE(~0); 439 gtlbe->mas1 &= ~MAS1_TSIZE(~0);
902 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); 440 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
903
904 stlbsel = 0;
905 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
906 sesel = 0; /* unused */
907
908 break;
909
910 case 1:
911 /* TLB1 */
912 eaddr = get_tlb_eaddr(gtlbe);
913 raddr = get_tlb_raddr(gtlbe);
914
915 /* Create a 4KB mapping on the host.
916 * If the guest wanted a large page,
917 * only the first 4KB is mapped here and the rest
918 * are mapped on the fly. */
919 stlbsel = 1;
920 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
921 raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel);
922 break;
923
924 default:
925 BUG();
926 } 441 }
927 442
928 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); 443 /* Premap the faulting page */
444 kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel));
929 } 445 }
930 446
931 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); 447 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
@@ -1019,100 +535,14 @@ void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
1019{ 535{
1020} 536}
1021 537
1022void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
1023 unsigned int index)
1024{
1025 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1026 struct tlbe_priv *priv;
1027 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
1028 int tlbsel = tlbsel_of(index);
1029 int esel = esel_of(index);
1030 int stlbsel, sesel;
1031
1032 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
1033
1034 switch (tlbsel) {
1035 case 0:
1036 stlbsel = 0;
1037 sesel = 0; /* unused */
1038 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
1039
1040 /* Only triggers after clear_tlb_refs */
1041 if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
1042 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
1043 else
1044 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
1045 &priv->ref, eaddr, &stlbe);
1046 break;
1047
1048 case 1: {
1049 gfn_t gfn = gpaddr >> PAGE_SHIFT;
1050
1051 stlbsel = 1;
1052 sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
1053 gtlbe, &stlbe, esel);
1054 break;
1055 }
1056
1057 default:
1058 BUG();
1059 break;
1060 }
1061
1062 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
1063}
1064
1065/************* MMU Notifiers *************/
1066
1067int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1068{
1069 trace_kvm_unmap_hva(hva);
1070
1071 /*
1072 * Flush all shadow tlb entries everywhere. This is slow, but
1073 * we are 100% sure that we catch the to be unmapped page
1074 */
1075 kvm_flush_remote_tlbs(kvm);
1076
1077 return 0;
1078}
1079
1080int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1081{
1082 /* kvm_unmap_hva flushes everything anyways */
1083 kvm_unmap_hva(kvm, start);
1084
1085 return 0;
1086}
1087
1088int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1089{
1090 /* XXX could be more clever ;) */
1091 return 0;
1092}
1093
1094int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1095{
1096 /* XXX could be more clever ;) */
1097 return 0;
1098}
1099
1100void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1101{
1102 /* The page will get remapped properly on its next fault */
1103 kvm_unmap_hva(kvm, hva);
1104}
1105
1106/*****************************************/ 538/*****************************************/
1107 539
1108static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) 540static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
1109{ 541{
1110 int i; 542 int i;
1111 543
1112 clear_tlb1_bitmap(vcpu_e500); 544 kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
1113 kfree(vcpu_e500->g2h_tlb1_map); 545 kfree(vcpu_e500->g2h_tlb1_map);
1114
1115 clear_tlb_refs(vcpu_e500);
1116 kfree(vcpu_e500->gtlb_priv[0]); 546 kfree(vcpu_e500->gtlb_priv[0]);
1117 kfree(vcpu_e500->gtlb_priv[1]); 547 kfree(vcpu_e500->gtlb_priv[1]);
1118 548
@@ -1303,7 +733,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
1303{ 733{
1304 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 734 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
1305 kvmppc_recalc_tlb1map_range(vcpu_e500); 735 kvmppc_recalc_tlb1map_range(vcpu_e500);
1306 clear_tlb_refs(vcpu_e500); 736 kvmppc_core_flush_tlb(vcpu);
1307 return 0; 737 return 0;
1308} 738}
1309 739
@@ -1313,37 +743,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1313 int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); 743 int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
1314 int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; 744 int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
1315 745
1316 host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; 746 if (e500_mmu_host_init(vcpu_e500))
1317 host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; 747 goto err;
1318
1319 /*
1320 * This should never happen on real e500 hardware, but is
1321 * architecturally possible -- e.g. in some weird nested
1322 * virtualization case.
1323 */
1324 if (host_tlb_params[0].entries == 0 ||
1325 host_tlb_params[1].entries == 0) {
1326 pr_err("%s: need to know host tlb size\n", __func__);
1327 return -ENODEV;
1328 }
1329
1330 host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
1331 TLBnCFG_ASSOC_SHIFT;
1332 host_tlb_params[1].ways = host_tlb_params[1].entries;
1333
1334 if (!is_power_of_2(host_tlb_params[0].entries) ||
1335 !is_power_of_2(host_tlb_params[0].ways) ||
1336 host_tlb_params[0].entries < host_tlb_params[0].ways ||
1337 host_tlb_params[0].ways == 0) {
1338 pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
1339 __func__, host_tlb_params[0].entries,
1340 host_tlb_params[0].ways);
1341 return -ENODEV;
1342 }
1343
1344 host_tlb_params[0].sets =
1345 host_tlb_params[0].entries / host_tlb_params[0].ways;
1346 host_tlb_params[1].sets = 1;
1347 748
1348 vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; 749 vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
1349 vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; 750 vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -1362,18 +763,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1362 vcpu_e500->gtlb_offset[0] = 0; 763 vcpu_e500->gtlb_offset[0] = 0;
1363 vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; 764 vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
1364 765
1365 vcpu_e500->tlb_refs[0] =
1366 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
1367 GFP_KERNEL);
1368 if (!vcpu_e500->tlb_refs[0])
1369 goto err;
1370
1371 vcpu_e500->tlb_refs[1] =
1372 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
1373 GFP_KERNEL);
1374 if (!vcpu_e500->tlb_refs[1])
1375 goto err;
1376
1377 vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * 766 vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
1378 vcpu_e500->gtlb_params[0].entries, 767 vcpu_e500->gtlb_params[0].entries,
1379 GFP_KERNEL); 768 GFP_KERNEL);
@@ -1392,12 +781,6 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1392 if (!vcpu_e500->g2h_tlb1_map) 781 if (!vcpu_e500->g2h_tlb1_map)
1393 goto err; 782 goto err;
1394 783
1395 vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
1396 host_tlb_params[1].entries,
1397 GFP_KERNEL);
1398 if (!vcpu_e500->h2g_tlb1_rmap)
1399 goto err;
1400
1401 /* Init TLB configuration register */ 784 /* Init TLB configuration register */
1402 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & 785 vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
1403 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); 786 ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
@@ -1416,15 +799,11 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
1416 799
1417err: 800err:
1418 free_gtlb(vcpu_e500); 801 free_gtlb(vcpu_e500);
1419 kfree(vcpu_e500->tlb_refs[0]);
1420 kfree(vcpu_e500->tlb_refs[1]);
1421 return -1; 802 return -1;
1422} 803}
1423 804
1424void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) 805void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
1425{ 806{
1426 free_gtlb(vcpu_e500); 807 free_gtlb(vcpu_e500);
1427 kfree(vcpu_e500->h2g_tlb1_rmap); 808 e500_mmu_host_uninit(vcpu_e500);
1428 kfree(vcpu_e500->tlb_refs[0]);
1429 kfree(vcpu_e500->tlb_refs[1]);
1430} 809}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
new file mode 100644
index 000000000000..a222edfb9a9b
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -0,0 +1,699 @@
1/*
2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 *
4 * Author: Yu Liu, yu.liu@freescale.com
5 * Scott Wood, scottwood@freescale.com
6 * Ashish Kalra, ashish.kalra@freescale.com
7 * Varun Sethi, varun.sethi@freescale.com
8 * Alexander Graf, agraf@suse.de
9 *
10 * Description:
11 * This file is based on arch/powerpc/kvm/44x_tlb.c,
12 * by Hollis Blanchard <hollisb@us.ibm.com>.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License, version 2, as
16 * published by the Free Software Foundation.
17 */
18
19#include <linux/kernel.h>
20#include <linux/types.h>
21#include <linux/slab.h>
22#include <linux/string.h>
23#include <linux/kvm.h>
24#include <linux/kvm_host.h>
25#include <linux/highmem.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/sched.h>
29#include <linux/rwsem.h>
30#include <linux/vmalloc.h>
31#include <linux/hugetlb.h>
32#include <asm/kvm_ppc.h>
33
34#include "e500.h"
35#include "trace.h"
36#include "timing.h"
37#include "e500_mmu_host.h"
38
39#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
40
41static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
42
43static inline unsigned int tlb1_max_shadow_size(void)
44{
45 /* reserve one entry for magic page */
46 return host_tlb_params[1].entries - tlbcam_index - 1;
47}
48
49static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
50{
51 /* Mask off reserved bits. */
52 mas3 &= MAS3_ATTRIB_MASK;
53
54#ifndef CONFIG_KVM_BOOKE_HV
55 if (!usermode) {
56 /* Guest is in supervisor mode,
57 * so we need to translate guest
58 * supervisor permissions into user permissions. */
59 mas3 &= ~E500_TLB_USER_PERM_MASK;
60 mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
61 }
62 mas3 |= E500_TLB_SUPER_PERM_MASK;
63#endif
64 return mas3;
65}
66
67static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
68{
69#ifdef CONFIG_SMP
70 return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
71#else
72 return mas2 & MAS2_ATTRIB_MASK;
73#endif
74}
75
76/*
77 * writing shadow tlb entry to host TLB
78 */
79static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
80 uint32_t mas0)
81{
82 unsigned long flags;
83
84 local_irq_save(flags);
85 mtspr(SPRN_MAS0, mas0);
86 mtspr(SPRN_MAS1, stlbe->mas1);
87 mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
88 mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
89 mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
90#ifdef CONFIG_KVM_BOOKE_HV
91 mtspr(SPRN_MAS8, stlbe->mas8);
92#endif
93 asm volatile("isync; tlbwe" : : : "memory");
94
95#ifdef CONFIG_KVM_BOOKE_HV
96 /* Must clear mas8 for other host tlbwe's */
97 mtspr(SPRN_MAS8, 0);
98 isync();
99#endif
100 local_irq_restore(flags);
101
102 trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
103 stlbe->mas2, stlbe->mas7_3);
104}
105
106/*
107 * Acquire a mas0 with victim hint, as if we just took a TLB miss.
108 *
109 * We don't care about the address we're searching for, other than that it's
110 * in the right set and is not present in the TLB. Using a zero PID and a
111 * userspace address means we don't have to set and then restore MAS5, or
112 * calculate a proper MAS6 value.
113 */
114static u32 get_host_mas0(unsigned long eaddr)
115{
116 unsigned long flags;
117 u32 mas0;
118
119 local_irq_save(flags);
120 mtspr(SPRN_MAS6, 0);
121 asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
122 mas0 = mfspr(SPRN_MAS0);
123 local_irq_restore(flags);
124
125 return mas0;
126}
127
128/* sesel is for tlb1 only */
129static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
130 int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
131{
132 u32 mas0;
133
134 if (tlbsel == 0) {
135 mas0 = get_host_mas0(stlbe->mas2);
136 __write_host_tlbe(stlbe, mas0);
137 } else {
138 __write_host_tlbe(stlbe,
139 MAS0_TLBSEL(1) |
140 MAS0_ESEL(to_htlb1_esel(sesel)));
141 }
142}
143
144/* sesel is for tlb1 only */
145static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
146 struct kvm_book3e_206_tlb_entry *gtlbe,
147 struct kvm_book3e_206_tlb_entry *stlbe,
148 int stlbsel, int sesel)
149{
150 int stid;
151
152 preempt_disable();
153 stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
154
155 stlbe->mas1 |= MAS1_TID(stid);
156 write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
157 preempt_enable();
158}
159
160#ifdef CONFIG_KVM_E500V2
161/* XXX should be a hook in the gva2hpa translation */
162void kvmppc_map_magic(struct kvm_vcpu *vcpu)
163{
164 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
165 struct kvm_book3e_206_tlb_entry magic;
166 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
167 unsigned int stid;
168 pfn_t pfn;
169
170 pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
171 get_page(pfn_to_page(pfn));
172
173 preempt_disable();
174 stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
175
176 magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
177 MAS1_TSIZE(BOOK3E_PAGESZ_4K);
178 magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
179 magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
180 MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
181 magic.mas8 = 0;
182
183 __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
184 preempt_enable();
185}
186#endif
187
188void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
189 int esel)
190{
191 struct kvm_book3e_206_tlb_entry *gtlbe =
192 get_entry(vcpu_e500, tlbsel, esel);
193 struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref;
194
195 /* Don't bother with unmapped entries */
196 if (!(ref->flags & E500_TLB_VALID))
197 return;
198
199 if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) {
200 u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
201 int hw_tlb_indx;
202 unsigned long flags;
203
204 local_irq_save(flags);
205 while (tmp) {
206 hw_tlb_indx = __ilog2_u64(tmp & -tmp);
207 mtspr(SPRN_MAS0,
208 MAS0_TLBSEL(1) |
209 MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
210 mtspr(SPRN_MAS1, 0);
211 asm volatile("tlbwe");
212 vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
213 tmp &= tmp - 1;
214 }
215 mb();
216 vcpu_e500->g2h_tlb1_map[esel] = 0;
217 ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID);
218 local_irq_restore(flags);
219 }
220
221 if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) {
222 /*
223 * TLB1 entry is backed by 4k pages. This should happen
224 * rarely and is not worth optimizing. Invalidate everything.
225 */
226 kvmppc_e500_tlbil_all(vcpu_e500);
227 ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID);
228 }
229
230 /* Already invalidated in between */
231 if (!(ref->flags & E500_TLB_VALID))
232 return;
233
234 /* Guest tlbe is backed by at most one host tlbe per shadow pid. */
235 kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
236
237 /* Mark the TLB as not backed by the host anymore */
238 ref->flags &= ~E500_TLB_VALID;
239}
240
241static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
242{
243 return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
244}
245
246static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
247 struct kvm_book3e_206_tlb_entry *gtlbe,
248 pfn_t pfn)
249{
250 ref->pfn = pfn;
251 ref->flags = E500_TLB_VALID;
252
253 if (tlbe_is_writable(gtlbe))
254 kvm_set_pfn_dirty(pfn);
255}
256
257static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
258{
259 if (ref->flags & E500_TLB_VALID) {
260 trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
261 ref->flags = 0;
262 }
263}
264
265static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
266{
267 if (vcpu_e500->g2h_tlb1_map)
268 memset(vcpu_e500->g2h_tlb1_map, 0,
269 sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
270 if (vcpu_e500->h2g_tlb1_rmap)
271 memset(vcpu_e500->h2g_tlb1_rmap, 0,
272 sizeof(unsigned int) * host_tlb_params[1].entries);
273}
274
275static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
276{
277 int tlbsel = 0;
278 int i;
279
280 for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
281 struct tlbe_ref *ref =
282 &vcpu_e500->gtlb_priv[tlbsel][i].ref;
283 kvmppc_e500_ref_release(ref);
284 }
285}
286
287static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
288{
289 int stlbsel = 1;
290 int i;
291
292 kvmppc_e500_tlbil_all(vcpu_e500);
293
294 for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
295 struct tlbe_ref *ref =
296 &vcpu_e500->tlb_refs[stlbsel][i];
297 kvmppc_e500_ref_release(ref);
298 }
299
300 clear_tlb_privs(vcpu_e500);
301}
302
303void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
304{
305 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
306 clear_tlb_refs(vcpu_e500);
307 clear_tlb1_bitmap(vcpu_e500);
308}
309
310/* TID must be supplied by the caller */
311static void kvmppc_e500_setup_stlbe(
312 struct kvm_vcpu *vcpu,
313 struct kvm_book3e_206_tlb_entry *gtlbe,
314 int tsize, struct tlbe_ref *ref, u64 gvaddr,
315 struct kvm_book3e_206_tlb_entry *stlbe)
316{
317 pfn_t pfn = ref->pfn;
318 u32 pr = vcpu->arch.shared->msr & MSR_PR;
319
320 BUG_ON(!(ref->flags & E500_TLB_VALID));
321
322 /* Force IPROT=0 for all guest mappings. */
323 stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
324 stlbe->mas2 = (gvaddr & MAS2_EPN) |
325 e500_shadow_mas2_attrib(gtlbe->mas2, pr);
326 stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
327 e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
328
329#ifdef CONFIG_KVM_BOOKE_HV
330 stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
331#endif
332}
333
334static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
335 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
336 int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
337 struct tlbe_ref *ref)
338{
339 struct kvm_memory_slot *slot;
340 unsigned long pfn = 0; /* silence GCC warning */
341 unsigned long hva;
342 int pfnmap = 0;
343 int tsize = BOOK3E_PAGESZ_4K;
344
345 /*
346 * Translate guest physical to true physical, acquiring
347 * a page reference if it is normal, non-reserved memory.
348 *
349 * gfn_to_memslot() must succeed because otherwise we wouldn't
350 * have gotten this far. Eventually we should just pass the slot
351 * pointer through from the first lookup.
352 */
353 slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
354 hva = gfn_to_hva_memslot(slot, gfn);
355
356 if (tlbsel == 1) {
357 struct vm_area_struct *vma;
358 down_read(&current->mm->mmap_sem);
359
360 vma = find_vma(current->mm, hva);
361 if (vma && hva >= vma->vm_start &&
362 (vma->vm_flags & VM_PFNMAP)) {
363 /*
364 * This VMA is a physically contiguous region (e.g.
365 * /dev/mem) that bypasses normal Linux page
366 * management. Find the overlap between the
367 * vma and the memslot.
368 */
369
370 unsigned long start, end;
371 unsigned long slot_start, slot_end;
372
373 pfnmap = 1;
374
375 start = vma->vm_pgoff;
376 end = start +
377 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
378
379 pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
380
381 slot_start = pfn - (gfn - slot->base_gfn);
382 slot_end = slot_start + slot->npages;
383
384 if (start < slot_start)
385 start = slot_start;
386 if (end > slot_end)
387 end = slot_end;
388
389 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
390 MAS1_TSIZE_SHIFT;
391
392 /*
393 * e500 doesn't implement the lowest tsize bit,
394 * or 1K pages.
395 */
396 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
397
398 /*
399 * Now find the largest tsize (up to what the guest
400 * requested) that will cover gfn, stay within the
401 * range, and for which gfn and pfn are mutually
402 * aligned.
403 */
404
405 for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
406 unsigned long gfn_start, gfn_end, tsize_pages;
407 tsize_pages = 1 << (tsize - 2);
408
409 gfn_start = gfn & ~(tsize_pages - 1);
410 gfn_end = gfn_start + tsize_pages;
411
412 if (gfn_start + pfn - gfn < start)
413 continue;
414 if (gfn_end + pfn - gfn > end)
415 continue;
416 if ((gfn & (tsize_pages - 1)) !=
417 (pfn & (tsize_pages - 1)))
418 continue;
419
420 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
421 pfn &= ~(tsize_pages - 1);
422 break;
423 }
424 } else if (vma && hva >= vma->vm_start &&
425 (vma->vm_flags & VM_HUGETLB)) {
426 unsigned long psize = vma_kernel_pagesize(vma);
427
428 tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
429 MAS1_TSIZE_SHIFT;
430
431 /*
432 * Take the largest page size that satisfies both host
433 * and guest mapping
434 */
435 tsize = min(__ilog2(psize) - 10, tsize);
436
437 /*
438 * e500 doesn't implement the lowest tsize bit,
439 * or 1K pages.
440 */
441 tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
442 }
443
444 up_read(&current->mm->mmap_sem);
445 }
446
447 if (likely(!pfnmap)) {
448 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
449 pfn = gfn_to_pfn_memslot(slot, gfn);
450 if (is_error_noslot_pfn(pfn)) {
451 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
452 (long)gfn);
453 return -EINVAL;
454 }
455
456 /* Align guest and physical address to page map boundaries */
457 pfn &= ~(tsize_pages - 1);
458 gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
459 }
460
461 /* Drop old ref and setup new one. */
462 kvmppc_e500_ref_release(ref);
463 kvmppc_e500_ref_setup(ref, gtlbe, pfn);
464
465 kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
466 ref, gvaddr, stlbe);
467
468 /* Clear i-cache for new pages */
469 kvmppc_mmu_flush_icache(pfn);
470
471 /* Drop refcount on page, so that mmu notifiers can clear it */
472 kvm_release_pfn_clean(pfn);
473
474 return 0;
475}
476
477/* XXX only map the one-one case, for now use TLB0 */
478static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel,
479 struct kvm_book3e_206_tlb_entry *stlbe)
480{
481 struct kvm_book3e_206_tlb_entry *gtlbe;
482 struct tlbe_ref *ref;
483 int stlbsel = 0;
484 int sesel = 0;
485 int r;
486
487 gtlbe = get_entry(vcpu_e500, 0, esel);
488 ref = &vcpu_e500->gtlb_priv[0][esel].ref;
489
490 r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
491 get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
492 gtlbe, 0, stlbe, ref);
493 if (r)
494 return r;
495
496 write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel);
497
498 return 0;
499}
500
501static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500,
502 struct tlbe_ref *ref,
503 int esel)
504{
505 unsigned int sesel = vcpu_e500->host_tlb1_nv++;
506
507 if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
508 vcpu_e500->host_tlb1_nv = 0;
509
510 vcpu_e500->tlb_refs[1][sesel] = *ref;
511 vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel;
512 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
513 if (vcpu_e500->h2g_tlb1_rmap[sesel]) {
514 unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel];
515 vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel);
516 }
517 vcpu_e500->h2g_tlb1_rmap[sesel] = esel;
518
519 return sesel;
520}
521
522/* Caller must ensure that the specified guest TLB entry is safe to insert into
523 * the shadow TLB. */
524/* For both one-one and one-to-many */
525static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
526 u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
527 struct kvm_book3e_206_tlb_entry *stlbe, int esel)
528{
529 struct tlbe_ref ref;
530 int sesel;
531 int r;
532
533 ref.flags = 0;
534 r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe,
535 &ref);
536 if (r)
537 return r;
538
539 /* Use TLB0 when we can only map a page with 4k */
540 if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) {
541 vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0;
542 write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0);
543 return 0;
544 }
545
546 /* Otherwise map into TLB1 */
547 sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, &ref, esel);
548 write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel);
549
550 return 0;
551}
552
553void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
554 unsigned int index)
555{
556 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
557 struct tlbe_priv *priv;
558 struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
559 int tlbsel = tlbsel_of(index);
560 int esel = esel_of(index);
561
562 gtlbe = get_entry(vcpu_e500, tlbsel, esel);
563
564 switch (tlbsel) {
565 case 0:
566 priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
567
568 /* Triggers after clear_tlb_refs or on initial mapping */
569 if (!(priv->ref.flags & E500_TLB_VALID)) {
570 kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
571 } else {
572 kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
573 &priv->ref, eaddr, &stlbe);
574 write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0);
575 }
576 break;
577
578 case 1: {
579 gfn_t gfn = gpaddr >> PAGE_SHIFT;
580 kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe,
581 esel);
582 break;
583 }
584
585 default:
586 BUG();
587 break;
588 }
589}
590
591/************* MMU Notifiers *************/
592
593int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
594{
595 trace_kvm_unmap_hva(hva);
596
597 /*
598 * Flush all shadow tlb entries everywhere. This is slow, but
599 * we are 100% sure that we catch the to be unmapped page
600 */
601 kvm_flush_remote_tlbs(kvm);
602
603 return 0;
604}
605
606int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
607{
608 /* kvm_unmap_hva flushes everything anyways */
609 kvm_unmap_hva(kvm, start);
610
611 return 0;
612}
613
614int kvm_age_hva(struct kvm *kvm, unsigned long hva)
615{
616 /* XXX could be more clever ;) */
617 return 0;
618}
619
620int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
621{
622 /* XXX could be more clever ;) */
623 return 0;
624}
625
626void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
627{
628 /* The page will get remapped properly on its next fault */
629 kvm_unmap_hva(kvm, hva);
630}
631
632/*****************************************/
633
634int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
635{
636 host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
637 host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
638
639 /*
640 * This should never happen on real e500 hardware, but is
641 * architecturally possible -- e.g. in some weird nested
642 * virtualization case.
643 */
644 if (host_tlb_params[0].entries == 0 ||
645 host_tlb_params[1].entries == 0) {
646 pr_err("%s: need to know host tlb size\n", __func__);
647 return -ENODEV;
648 }
649
650 host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
651 TLBnCFG_ASSOC_SHIFT;
652 host_tlb_params[1].ways = host_tlb_params[1].entries;
653
654 if (!is_power_of_2(host_tlb_params[0].entries) ||
655 !is_power_of_2(host_tlb_params[0].ways) ||
656 host_tlb_params[0].entries < host_tlb_params[0].ways ||
657 host_tlb_params[0].ways == 0) {
658 pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
659 __func__, host_tlb_params[0].entries,
660 host_tlb_params[0].ways);
661 return -ENODEV;
662 }
663
664 host_tlb_params[0].sets =
665 host_tlb_params[0].entries / host_tlb_params[0].ways;
666 host_tlb_params[1].sets = 1;
667
668 vcpu_e500->tlb_refs[0] =
669 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries,
670 GFP_KERNEL);
671 if (!vcpu_e500->tlb_refs[0])
672 goto err;
673
674 vcpu_e500->tlb_refs[1] =
675 kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries,
676 GFP_KERNEL);
677 if (!vcpu_e500->tlb_refs[1])
678 goto err;
679
680 vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
681 host_tlb_params[1].entries,
682 GFP_KERNEL);
683 if (!vcpu_e500->h2g_tlb1_rmap)
684 goto err;
685
686 return 0;
687
688err:
689 kfree(vcpu_e500->tlb_refs[0]);
690 kfree(vcpu_e500->tlb_refs[1]);
691 return -EINVAL;
692}
693
694void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
695{
696 kfree(vcpu_e500->h2g_tlb1_rmap);
697 kfree(vcpu_e500->tlb_refs[0]);
698 kfree(vcpu_e500->tlb_refs[1]);
699}
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
new file mode 100644
index 000000000000..7624835b76c7
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License, version 2, as
6 * published by the Free Software Foundation.
7 */
8
9#ifndef KVM_E500_MMU_HOST_H
10#define KVM_E500_MMU_HOST_H
11
12void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
13 int esel);
14
15int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500);
16void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
17
18#endif /* KVM_E500_MMU_HOST_H */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 9d9cddc5b346..7a73b6f72a8b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -150,8 +150,6 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
150 case SPRN_TBWL: break; 150 case SPRN_TBWL: break;
151 case SPRN_TBWU: break; 151 case SPRN_TBWU: break;
152 152
153 case SPRN_MSSSR0: break;
154
155 case SPRN_DEC: 153 case SPRN_DEC:
156 vcpu->arch.dec = spr_val; 154 vcpu->arch.dec = spr_val;
157 kvmppc_emulate_dec(vcpu); 155 kvmppc_emulate_dec(vcpu);
@@ -202,9 +200,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
202 case SPRN_PIR: 200 case SPRN_PIR:
203 spr_val = vcpu->vcpu_id; 201 spr_val = vcpu->vcpu_id;
204 break; 202 break;
205 case SPRN_MSSSR0:
206 spr_val = 0;
207 break;
208 203
209 /* Note: mftb and TBRL/TBWL are user-accessible, so 204 /* Note: mftb and TBRL/TBWL are user-accessible, so
210 * the guest can always access the real TB anyways. 205 * the guest can always access the real TB anyways.
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 70739a089560..934413cd3a1b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -237,7 +237,8 @@ int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
237 r = RESUME_HOST; 237 r = RESUME_HOST;
238 break; 238 break;
239 default: 239 default:
240 BUG(); 240 WARN_ON(1);
241 r = RESUME_GUEST;
241 } 242 }
242 243
243 return r; 244 return r;
@@ -305,6 +306,7 @@ int kvm_dev_ioctl_check_extension(long ext)
305#ifdef CONFIG_BOOKE 306#ifdef CONFIG_BOOKE
306 case KVM_CAP_PPC_BOOKE_SREGS: 307 case KVM_CAP_PPC_BOOKE_SREGS:
307 case KVM_CAP_PPC_BOOKE_WATCHDOG: 308 case KVM_CAP_PPC_BOOKE_WATCHDOG:
309 case KVM_CAP_PPC_EPR:
308#else 310#else
309 case KVM_CAP_PPC_SEGSTATE: 311 case KVM_CAP_PPC_SEGSTATE:
310 case KVM_CAP_PPC_HIOR: 312 case KVM_CAP_PPC_HIOR:
@@ -412,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
412 struct kvm_memory_slot *memslot, 414 struct kvm_memory_slot *memslot,
413 struct kvm_memory_slot old, 415 struct kvm_memory_slot old,
414 struct kvm_userspace_memory_region *mem, 416 struct kvm_userspace_memory_region *mem,
415 int user_alloc) 417 bool user_alloc)
416{ 418{
417 return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 419 return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
418} 420}
@@ -420,7 +422,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
420void kvm_arch_commit_memory_region(struct kvm *kvm, 422void kvm_arch_commit_memory_region(struct kvm *kvm,
421 struct kvm_userspace_memory_region *mem, 423 struct kvm_userspace_memory_region *mem,
422 struct kvm_memory_slot old, 424 struct kvm_memory_slot old,
423 int user_alloc) 425 bool user_alloc)
424{ 426{
425 kvmppc_core_commit_memory_region(kvm, mem, old); 427 kvmppc_core_commit_memory_region(kvm, mem, old);
426} 428}
@@ -720,6 +722,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
720 for (i = 0; i < 9; ++i) 722 for (i = 0; i < 9; ++i)
721 kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); 723 kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
722 vcpu->arch.hcall_needed = 0; 724 vcpu->arch.hcall_needed = 0;
725#ifdef CONFIG_BOOKE
726 } else if (vcpu->arch.epr_needed) {
727 kvmppc_set_epr(vcpu, run->epr.epr);
728 vcpu->arch.epr_needed = 0;
729#endif
723 } 730 }
724 731
725 r = kvmppc_vcpu_run(run, vcpu); 732 r = kvmppc_vcpu_run(run, vcpu);
@@ -761,6 +768,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
761 r = 0; 768 r = 0;
762 vcpu->arch.papr_enabled = true; 769 vcpu->arch.papr_enabled = true;
763 break; 770 break;
771 case KVM_CAP_PPC_EPR:
772 r = 0;
773 vcpu->arch.epr_enabled = cap->args[0];
774 break;
764#ifdef CONFIG_BOOKE 775#ifdef CONFIG_BOOKE
765 case KVM_CAP_PPC_BOOKE_WATCHDOG: 776 case KVM_CAP_PPC_BOOKE_WATCHDOG:
766 r = 0; 777 r = 0;
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 7def77302d63..87c17bfb2968 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -41,6 +41,7 @@ enum interruption_class {
41 IRQIO_CSC, 41 IRQIO_CSC,
42 IRQIO_PCI, 42 IRQIO_PCI,
43 IRQIO_MSI, 43 IRQIO_MSI,
44 IRQIO_VIR,
44 NMI_NMI, 45 NMI_NMI,
45 CPU_RST, 46 CPU_RST,
46 NR_ARCH_IRQS 47 NR_ARCH_IRQS
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b7841546991f..16bd5d169cdb 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -20,9 +20,7 @@
20#include <asm/cpu.h> 20#include <asm/cpu.h>
21 21
22#define KVM_MAX_VCPUS 64 22#define KVM_MAX_VCPUS 64
23#define KVM_MEMORY_SLOTS 32 23#define KVM_USER_MEM_SLOTS 32
24/* memory slots that does not exposed to userspace */
25#define KVM_PRIVATE_MEM_SLOTS 4
26 24
27struct sca_entry { 25struct sca_entry {
28 atomic_t scn; 26 atomic_t scn;
@@ -76,8 +74,11 @@ struct kvm_s390_sie_block {
76 __u64 epoch; /* 0x0038 */ 74 __u64 epoch; /* 0x0038 */
77 __u8 reserved40[4]; /* 0x0040 */ 75 __u8 reserved40[4]; /* 0x0040 */
78#define LCTL_CR0 0x8000 76#define LCTL_CR0 0x8000
77#define LCTL_CR6 0x0200
78#define LCTL_CR14 0x0002
79 __u16 lctl; /* 0x0044 */ 79 __u16 lctl; /* 0x0044 */
80 __s16 icpua; /* 0x0046 */ 80 __s16 icpua; /* 0x0046 */
81#define ICTL_LPSW 0x00400000
81 __u32 ictl; /* 0x0048 */ 82 __u32 ictl; /* 0x0048 */
82 __u32 eca; /* 0x004c */ 83 __u32 eca; /* 0x004c */
83 __u8 icptcode; /* 0x0050 */ 84 __u8 icptcode; /* 0x0050 */
@@ -127,6 +128,7 @@ struct kvm_vcpu_stat {
127 u32 deliver_prefix_signal; 128 u32 deliver_prefix_signal;
128 u32 deliver_restart_signal; 129 u32 deliver_restart_signal;
129 u32 deliver_program_int; 130 u32 deliver_program_int;
131 u32 deliver_io_int;
130 u32 exit_wait_state; 132 u32 exit_wait_state;
131 u32 instruction_stidp; 133 u32 instruction_stidp;
132 u32 instruction_spx; 134 u32 instruction_spx;
@@ -187,6 +189,11 @@ struct kvm_s390_emerg_info {
187 __u16 code; 189 __u16 code;
188}; 190};
189 191
192struct kvm_s390_mchk_info {
193 __u64 cr14;
194 __u64 mcic;
195};
196
190struct kvm_s390_interrupt_info { 197struct kvm_s390_interrupt_info {
191 struct list_head list; 198 struct list_head list;
192 u64 type; 199 u64 type;
@@ -197,6 +204,7 @@ struct kvm_s390_interrupt_info {
197 struct kvm_s390_emerg_info emerg; 204 struct kvm_s390_emerg_info emerg;
198 struct kvm_s390_extcall_info extcall; 205 struct kvm_s390_extcall_info extcall;
199 struct kvm_s390_prefix_info prefix; 206 struct kvm_s390_prefix_info prefix;
207 struct kvm_s390_mchk_info mchk;
200 }; 208 };
201}; 209};
202 210
@@ -254,6 +262,7 @@ struct kvm_arch{
254 debug_info_t *dbf; 262 debug_info_t *dbf;
255 struct kvm_s390_float_interrupt float_int; 263 struct kvm_s390_float_interrupt float_int;
256 struct gmap *gmap; 264 struct gmap *gmap;
265 int css_support;
257}; 266};
258 267
259extern int sie64a(struct kvm_s390_sie_block *, u64 *); 268extern int sie64a(struct kvm_s390_sie_block *, u64 *);
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 9df824ea1667..1630f439cd2a 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -81,6 +81,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
81 [IRQIO_CSC] = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"}, 81 [IRQIO_CSC] = {.name = "CSC", .desc = "[I/O] CHSC Subchannel"},
82 [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" }, 82 [IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
83 [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" }, 83 [IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
84 [IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
84 [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"}, 85 [NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"},
85 [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"}, 86 [CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"},
86}; 87};
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 22798ec33fd1..f26ff1e31bdb 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -26,27 +26,20 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
26{ 26{
27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 27 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 28 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
29 int base2 = vcpu->arch.sie_block->ipb >> 28;
30 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
31 ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
32 u64 useraddr; 29 u64 useraddr;
33 int reg, rc; 30 int reg, rc;
34 31
35 vcpu->stat.instruction_lctlg++; 32 vcpu->stat.instruction_lctlg++;
36 if ((vcpu->arch.sie_block->ipb & 0xff) != 0x2f)
37 return -EOPNOTSUPP;
38 33
39 useraddr = disp2; 34 useraddr = kvm_s390_get_base_disp_rsy(vcpu);
40 if (base2)
41 useraddr += vcpu->run->s.regs.gprs[base2];
42 35
43 if (useraddr & 7) 36 if (useraddr & 7)
44 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 37 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
45 38
46 reg = reg1; 39 reg = reg1;
47 40
48 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 41 VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3,
49 disp2); 42 useraddr);
50 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr); 43 trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
51 44
52 do { 45 do {
@@ -68,23 +61,19 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
68{ 61{
69 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 62 int reg1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
70 int reg3 = vcpu->arch.sie_block->ipa & 0x000f; 63 int reg3 = vcpu->arch.sie_block->ipa & 0x000f;
71 int base2 = vcpu->arch.sie_block->ipb >> 28;
72 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
73 u64 useraddr; 64 u64 useraddr;
74 u32 val = 0; 65 u32 val = 0;
75 int reg, rc; 66 int reg, rc;
76 67
77 vcpu->stat.instruction_lctl++; 68 vcpu->stat.instruction_lctl++;
78 69
79 useraddr = disp2; 70 useraddr = kvm_s390_get_base_disp_rs(vcpu);
80 if (base2)
81 useraddr += vcpu->run->s.regs.gprs[base2];
82 71
83 if (useraddr & 3) 72 if (useraddr & 3)
84 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 73 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
85 74
86 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2, 75 VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3,
87 disp2); 76 useraddr);
88 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr); 77 trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
89 78
90 reg = reg1; 79 reg = reg1;
@@ -104,14 +93,31 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
104 return 0; 93 return 0;
105} 94}
106 95
107static intercept_handler_t instruction_handlers[256] = { 96static const intercept_handler_t eb_handlers[256] = {
97 [0x2f] = handle_lctlg,
98 [0x8a] = kvm_s390_handle_priv_eb,
99};
100
101static int handle_eb(struct kvm_vcpu *vcpu)
102{
103 intercept_handler_t handler;
104
105 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
106 if (handler)
107 return handler(vcpu);
108 return -EOPNOTSUPP;
109}
110
111static const intercept_handler_t instruction_handlers[256] = {
108 [0x01] = kvm_s390_handle_01, 112 [0x01] = kvm_s390_handle_01,
113 [0x82] = kvm_s390_handle_lpsw,
109 [0x83] = kvm_s390_handle_diag, 114 [0x83] = kvm_s390_handle_diag,
110 [0xae] = kvm_s390_handle_sigp, 115 [0xae] = kvm_s390_handle_sigp,
111 [0xb2] = kvm_s390_handle_b2, 116 [0xb2] = kvm_s390_handle_b2,
112 [0xb7] = handle_lctl, 117 [0xb7] = handle_lctl,
118 [0xb9] = kvm_s390_handle_b9,
113 [0xe5] = kvm_s390_handle_e5, 119 [0xe5] = kvm_s390_handle_e5,
114 [0xeb] = handle_lctlg, 120 [0xeb] = handle_eb,
115}; 121};
116 122
117static int handle_noop(struct kvm_vcpu *vcpu) 123static int handle_noop(struct kvm_vcpu *vcpu)
@@ -258,6 +264,7 @@ static const intercept_handler_t intercept_funcs[] = {
258 [0x0C >> 2] = handle_instruction_and_prog, 264 [0x0C >> 2] = handle_instruction_and_prog,
259 [0x10 >> 2] = handle_noop, 265 [0x10 >> 2] = handle_noop,
260 [0x14 >> 2] = handle_noop, 266 [0x14 >> 2] = handle_noop,
267 [0x18 >> 2] = handle_noop,
261 [0x1C >> 2] = kvm_s390_handle_wait, 268 [0x1C >> 2] = kvm_s390_handle_wait,
262 [0x20 >> 2] = handle_validity, 269 [0x20 >> 2] = handle_validity,
263 [0x28 >> 2] = handle_stop, 270 [0x28 >> 2] = handle_stop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 87418b50f21c..37116a77cb4b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -21,11 +21,31 @@
21#include "gaccess.h" 21#include "gaccess.h"
22#include "trace-s390.h" 22#include "trace-s390.h"
23 23
24#define IOINT_SCHID_MASK 0x0000ffff
25#define IOINT_SSID_MASK 0x00030000
26#define IOINT_CSSID_MASK 0x03fc0000
27#define IOINT_AI_MASK 0x04000000
28
29static int is_ioint(u64 type)
30{
31 return ((type & 0xfffe0000u) != 0xfffe0000u);
32}
33
24static int psw_extint_disabled(struct kvm_vcpu *vcpu) 34static int psw_extint_disabled(struct kvm_vcpu *vcpu)
25{ 35{
26 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT); 36 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
27} 37}
28 38
39static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
40{
41 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
42}
43
44static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
45{
46 return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
47}
48
29static int psw_interrupts_disabled(struct kvm_vcpu *vcpu) 49static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
30{ 50{
31 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) || 51 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
@@ -35,6 +55,13 @@ static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
35 return 1; 55 return 1;
36} 56}
37 57
58static u64 int_word_to_isc_bits(u32 int_word)
59{
60 u8 isc = (int_word & 0x38000000) >> 27;
61
62 return (0x80 >> isc) << 24;
63}
64
38static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu, 65static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
39 struct kvm_s390_interrupt_info *inti) 66 struct kvm_s390_interrupt_info *inti)
40{ 67{
@@ -67,7 +94,22 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
67 case KVM_S390_SIGP_SET_PREFIX: 94 case KVM_S390_SIGP_SET_PREFIX:
68 case KVM_S390_RESTART: 95 case KVM_S390_RESTART:
69 return 1; 96 return 1;
97 case KVM_S390_MCHK:
98 if (psw_mchk_disabled(vcpu))
99 return 0;
100 if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
101 return 1;
102 return 0;
103 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
104 if (psw_ioint_disabled(vcpu))
105 return 0;
106 if (vcpu->arch.sie_block->gcr[6] &
107 int_word_to_isc_bits(inti->io.io_int_word))
108 return 1;
109 return 0;
70 default: 110 default:
111 printk(KERN_WARNING "illegal interrupt type %llx\n",
112 inti->type);
71 BUG(); 113 BUG();
72 } 114 }
73 return 0; 115 return 0;
@@ -93,6 +135,7 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
93 CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT, 135 CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
94 &vcpu->arch.sie_block->cpuflags); 136 &vcpu->arch.sie_block->cpuflags);
95 vcpu->arch.sie_block->lctl = 0x0000; 137 vcpu->arch.sie_block->lctl = 0x0000;
138 vcpu->arch.sie_block->ictl &= ~ICTL_LPSW;
96} 139}
97 140
98static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag) 141static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
@@ -116,6 +159,18 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
116 case KVM_S390_SIGP_STOP: 159 case KVM_S390_SIGP_STOP:
117 __set_cpuflag(vcpu, CPUSTAT_STOP_INT); 160 __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
118 break; 161 break;
162 case KVM_S390_MCHK:
163 if (psw_mchk_disabled(vcpu))
164 vcpu->arch.sie_block->ictl |= ICTL_LPSW;
165 else
166 vcpu->arch.sie_block->lctl |= LCTL_CR14;
167 break;
168 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
169 if (psw_ioint_disabled(vcpu))
170 __set_cpuflag(vcpu, CPUSTAT_IO_INT);
171 else
172 vcpu->arch.sie_block->lctl |= LCTL_CR6;
173 break;
119 default: 174 default:
120 BUG(); 175 BUG();
121 } 176 }
@@ -297,6 +352,73 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
297 exception = 1; 352 exception = 1;
298 break; 353 break;
299 354
355 case KVM_S390_MCHK:
356 VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
357 inti->mchk.mcic);
358 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
359 inti->mchk.cr14,
360 inti->mchk.mcic);
361 rc = kvm_s390_vcpu_store_status(vcpu,
362 KVM_S390_STORE_STATUS_PREFIXED);
363 if (rc == -EFAULT)
364 exception = 1;
365
366 rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
367 if (rc == -EFAULT)
368 exception = 1;
369
370 rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
371 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
372 if (rc == -EFAULT)
373 exception = 1;
374
375 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
376 __LC_MCK_NEW_PSW, sizeof(psw_t));
377 if (rc == -EFAULT)
378 exception = 1;
379 break;
380
381 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
382 {
383 __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
384 inti->io.subchannel_nr;
385 __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
386 inti->io.io_int_word;
387 VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
388 vcpu->stat.deliver_io_int++;
389 trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
390 param0, param1);
391 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
392 inti->io.subchannel_id);
393 if (rc == -EFAULT)
394 exception = 1;
395
396 rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
397 inti->io.subchannel_nr);
398 if (rc == -EFAULT)
399 exception = 1;
400
401 rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
402 inti->io.io_int_parm);
403 if (rc == -EFAULT)
404 exception = 1;
405
406 rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
407 inti->io.io_int_word);
408 if (rc == -EFAULT)
409 exception = 1;
410
411 rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
412 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
413 if (rc == -EFAULT)
414 exception = 1;
415
416 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
417 __LC_IO_NEW_PSW, sizeof(psw_t));
418 if (rc == -EFAULT)
419 exception = 1;
420 break;
421 }
300 default: 422 default:
301 BUG(); 423 BUG();
302 } 424 }
@@ -518,6 +640,61 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
518 } 640 }
519} 641}
520 642
643void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
644{
645 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
646 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
647 struct kvm_s390_interrupt_info *n, *inti = NULL;
648 int deliver;
649
650 __reset_intercept_indicators(vcpu);
651 if (atomic_read(&li->active)) {
652 do {
653 deliver = 0;
654 spin_lock_bh(&li->lock);
655 list_for_each_entry_safe(inti, n, &li->list, list) {
656 if ((inti->type == KVM_S390_MCHK) &&
657 __interrupt_is_deliverable(vcpu, inti)) {
658 list_del(&inti->list);
659 deliver = 1;
660 break;
661 }
662 __set_intercept_indicator(vcpu, inti);
663 }
664 if (list_empty(&li->list))
665 atomic_set(&li->active, 0);
666 spin_unlock_bh(&li->lock);
667 if (deliver) {
668 __do_deliver_interrupt(vcpu, inti);
669 kfree(inti);
670 }
671 } while (deliver);
672 }
673
674 if (atomic_read(&fi->active)) {
675 do {
676 deliver = 0;
677 spin_lock(&fi->lock);
678 list_for_each_entry_safe(inti, n, &fi->list, list) {
679 if ((inti->type == KVM_S390_MCHK) &&
680 __interrupt_is_deliverable(vcpu, inti)) {
681 list_del(&inti->list);
682 deliver = 1;
683 break;
684 }
685 __set_intercept_indicator(vcpu, inti);
686 }
687 if (list_empty(&fi->list))
688 atomic_set(&fi->active, 0);
689 spin_unlock(&fi->lock);
690 if (deliver) {
691 __do_deliver_interrupt(vcpu, inti);
692 kfree(inti);
693 }
694 } while (deliver);
695 }
696}
697
521int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) 698int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
522{ 699{
523 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 700 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -540,12 +717,50 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
540 return 0; 717 return 0;
541} 718}
542 719
720struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
721 u64 cr6, u64 schid)
722{
723 struct kvm_s390_float_interrupt *fi;
724 struct kvm_s390_interrupt_info *inti, *iter;
725
726 if ((!schid && !cr6) || (schid && cr6))
727 return NULL;
728 mutex_lock(&kvm->lock);
729 fi = &kvm->arch.float_int;
730 spin_lock(&fi->lock);
731 inti = NULL;
732 list_for_each_entry(iter, &fi->list, list) {
733 if (!is_ioint(iter->type))
734 continue;
735 if (cr6 &&
736 ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
737 continue;
738 if (schid) {
739 if (((schid & 0x00000000ffff0000) >> 16) !=
740 iter->io.subchannel_id)
741 continue;
742 if ((schid & 0x000000000000ffff) !=
743 iter->io.subchannel_nr)
744 continue;
745 }
746 inti = iter;
747 break;
748 }
749 if (inti)
750 list_del_init(&inti->list);
751 if (list_empty(&fi->list))
752 atomic_set(&fi->active, 0);
753 spin_unlock(&fi->lock);
754 mutex_unlock(&kvm->lock);
755 return inti;
756}
757
543int kvm_s390_inject_vm(struct kvm *kvm, 758int kvm_s390_inject_vm(struct kvm *kvm,
544 struct kvm_s390_interrupt *s390int) 759 struct kvm_s390_interrupt *s390int)
545{ 760{
546 struct kvm_s390_local_interrupt *li; 761 struct kvm_s390_local_interrupt *li;
547 struct kvm_s390_float_interrupt *fi; 762 struct kvm_s390_float_interrupt *fi;
548 struct kvm_s390_interrupt_info *inti; 763 struct kvm_s390_interrupt_info *inti, *iter;
549 int sigcpu; 764 int sigcpu;
550 765
551 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 766 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -569,6 +784,29 @@ int kvm_s390_inject_vm(struct kvm *kvm,
569 case KVM_S390_SIGP_STOP: 784 case KVM_S390_SIGP_STOP:
570 case KVM_S390_INT_EXTERNAL_CALL: 785 case KVM_S390_INT_EXTERNAL_CALL:
571 case KVM_S390_INT_EMERGENCY: 786 case KVM_S390_INT_EMERGENCY:
787 kfree(inti);
788 return -EINVAL;
789 case KVM_S390_MCHK:
790 VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
791 s390int->parm64);
792 inti->type = s390int->type;
793 inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
794 inti->mchk.mcic = s390int->parm64;
795 break;
796 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
797 if (s390int->type & IOINT_AI_MASK)
798 VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
799 else
800 VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
801 s390int->type & IOINT_CSSID_MASK,
802 s390int->type & IOINT_SSID_MASK,
803 s390int->type & IOINT_SCHID_MASK);
804 inti->type = s390int->type;
805 inti->io.subchannel_id = s390int->parm >> 16;
806 inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
807 inti->io.io_int_parm = s390int->parm64 >> 32;
808 inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
809 break;
572 default: 810 default:
573 kfree(inti); 811 kfree(inti);
574 return -EINVAL; 812 return -EINVAL;
@@ -579,7 +817,22 @@ int kvm_s390_inject_vm(struct kvm *kvm,
579 mutex_lock(&kvm->lock); 817 mutex_lock(&kvm->lock);
580 fi = &kvm->arch.float_int; 818 fi = &kvm->arch.float_int;
581 spin_lock(&fi->lock); 819 spin_lock(&fi->lock);
582 list_add_tail(&inti->list, &fi->list); 820 if (!is_ioint(inti->type))
821 list_add_tail(&inti->list, &fi->list);
822 else {
823 u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
824
825 /* Keep I/O interrupts sorted in isc order. */
826 list_for_each_entry(iter, &fi->list, list) {
827 if (!is_ioint(iter->type))
828 continue;
829 if (int_word_to_isc_bits(iter->io.io_int_word)
830 <= isc_bits)
831 continue;
832 break;
833 }
834 list_add_tail(&inti->list, &iter->list);
835 }
583 atomic_set(&fi->active, 1); 836 atomic_set(&fi->active, 1);
584 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); 837 sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
585 if (sigcpu == KVM_MAX_VCPUS) { 838 if (sigcpu == KVM_MAX_VCPUS) {
@@ -651,8 +904,15 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
651 inti->type = s390int->type; 904 inti->type = s390int->type;
652 inti->emerg.code = s390int->parm; 905 inti->emerg.code = s390int->parm;
653 break; 906 break;
907 case KVM_S390_MCHK:
908 VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
909 s390int->parm64);
910 inti->type = s390int->type;
911 inti->mchk.mcic = s390int->parm64;
912 break;
654 case KVM_S390_INT_VIRTIO: 913 case KVM_S390_INT_VIRTIO:
655 case KVM_S390_INT_SERVICE: 914 case KVM_S390_INT_SERVICE:
915 case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
656 default: 916 default:
657 kfree(inti); 917 kfree(inti);
658 return -EINVAL; 918 return -EINVAL;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2923781590a6..4cf35a0a79e7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -140,6 +140,8 @@ int kvm_dev_ioctl_check_extension(long ext)
140#endif 140#endif
141 case KVM_CAP_SYNC_REGS: 141 case KVM_CAP_SYNC_REGS:
142 case KVM_CAP_ONE_REG: 142 case KVM_CAP_ONE_REG:
143 case KVM_CAP_ENABLE_CAP:
144 case KVM_CAP_S390_CSS_SUPPORT:
143 r = 1; 145 r = 1;
144 break; 146 break;
145 case KVM_CAP_NR_VCPUS: 147 case KVM_CAP_NR_VCPUS:
@@ -234,6 +236,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
234 if (!kvm->arch.gmap) 236 if (!kvm->arch.gmap)
235 goto out_nogmap; 237 goto out_nogmap;
236 } 238 }
239
240 kvm->arch.css_support = 0;
241
237 return 0; 242 return 0;
238out_nogmap: 243out_nogmap:
239 debug_unregister(kvm->arch.dbf); 244 debug_unregister(kvm->arch.dbf);
@@ -659,6 +664,7 @@ rerun_vcpu:
659 case KVM_EXIT_INTR: 664 case KVM_EXIT_INTR:
660 case KVM_EXIT_S390_RESET: 665 case KVM_EXIT_S390_RESET:
661 case KVM_EXIT_S390_UCONTROL: 666 case KVM_EXIT_S390_UCONTROL:
667 case KVM_EXIT_S390_TSCH:
662 break; 668 break;
663 default: 669 default:
664 BUG(); 670 BUG();
@@ -766,6 +772,14 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
766 } else 772 } else
767 prefix = 0; 773 prefix = 0;
768 774
775 /*
776 * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy
777 * copying in vcpu load/put. Lets update our copies before we save
778 * it into the save area
779 */
780 save_fp_regs(&vcpu->arch.guest_fpregs);
781 save_access_regs(vcpu->run->s.regs.acrs);
782
769 if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), 783 if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs),
770 vcpu->arch.guest_fpregs.fprs, 128, prefix)) 784 vcpu->arch.guest_fpregs.fprs, 128, prefix))
771 return -EFAULT; 785 return -EFAULT;
@@ -810,6 +824,29 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
810 return 0; 824 return 0;
811} 825}
812 826
827static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
828 struct kvm_enable_cap *cap)
829{
830 int r;
831
832 if (cap->flags)
833 return -EINVAL;
834
835 switch (cap->cap) {
836 case KVM_CAP_S390_CSS_SUPPORT:
837 if (!vcpu->kvm->arch.css_support) {
838 vcpu->kvm->arch.css_support = 1;
839 trace_kvm_s390_enable_css(vcpu->kvm);
840 }
841 r = 0;
842 break;
843 default:
844 r = -EINVAL;
845 break;
846 }
847 return r;
848}
849
813long kvm_arch_vcpu_ioctl(struct file *filp, 850long kvm_arch_vcpu_ioctl(struct file *filp,
814 unsigned int ioctl, unsigned long arg) 851 unsigned int ioctl, unsigned long arg)
815{ 852{
@@ -896,6 +933,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
896 r = 0; 933 r = 0;
897 break; 934 break;
898 } 935 }
936 case KVM_ENABLE_CAP:
937 {
938 struct kvm_enable_cap cap;
939 r = -EFAULT;
940 if (copy_from_user(&cap, argp, sizeof(cap)))
941 break;
942 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
943 break;
944 }
899 default: 945 default:
900 r = -ENOTTY; 946 r = -ENOTTY;
901 } 947 }
@@ -930,7 +976,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
930 struct kvm_memory_slot *memslot, 976 struct kvm_memory_slot *memslot,
931 struct kvm_memory_slot old, 977 struct kvm_memory_slot old,
932 struct kvm_userspace_memory_region *mem, 978 struct kvm_userspace_memory_region *mem,
933 int user_alloc) 979 bool user_alloc)
934{ 980{
935 /* A few sanity checks. We can have exactly one memory slot which has 981 /* A few sanity checks. We can have exactly one memory slot which has
936 to start at guest virtual zero and which has to be located at a 982 to start at guest virtual zero and which has to be located at a
@@ -960,7 +1006,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
960void kvm_arch_commit_memory_region(struct kvm *kvm, 1006void kvm_arch_commit_memory_region(struct kvm *kvm,
961 struct kvm_userspace_memory_region *mem, 1007 struct kvm_userspace_memory_region *mem,
962 struct kvm_memory_slot old, 1008 struct kvm_memory_slot old,
963 int user_alloc) 1009 bool user_alloc)
964{ 1010{
965 int rc; 1011 int rc;
966 1012
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index d75bc5e92c5b..4d89d64a8161 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -65,21 +65,67 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
65 vcpu->arch.sie_block->ihcpu = 0xffff; 65 vcpu->arch.sie_block->ihcpu = 0xffff;
66} 66}
67 67
68static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu)
69{
70 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
71 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
72
73 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
74}
75
76static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
77 u64 *address1, u64 *address2)
78{
79 u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
80 u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
81 u32 base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
82 u32 disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
83
84 *address1 = (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
85 *address2 = (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
86}
87
88static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu)
89{
90 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
91 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
92 ((vcpu->arch.sie_block->ipb & 0xff00) << 4);
93 /* The displacement is a 20bit _SIGNED_ value */
94 if (disp2 & 0x80000)
95 disp2+=0xfff00000;
96
97 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
98}
99
100static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu)
101{
102 u32 base2 = vcpu->arch.sie_block->ipb >> 28;
103 u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
104
105 return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
106}
107
68int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); 108int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
69enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer); 109enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
70void kvm_s390_tasklet(unsigned long parm); 110void kvm_s390_tasklet(unsigned long parm);
71void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu); 111void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
112void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
72int kvm_s390_inject_vm(struct kvm *kvm, 113int kvm_s390_inject_vm(struct kvm *kvm,
73 struct kvm_s390_interrupt *s390int); 114 struct kvm_s390_interrupt *s390int);
74int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 115int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
75 struct kvm_s390_interrupt *s390int); 116 struct kvm_s390_interrupt *s390int);
76int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 117int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
77int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action); 118int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
119struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
120 u64 cr6, u64 schid);
78 121
79/* implemented in priv.c */ 122/* implemented in priv.c */
80int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); 123int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
81int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); 124int kvm_s390_handle_e5(struct kvm_vcpu *vcpu);
82int kvm_s390_handle_01(struct kvm_vcpu *vcpu); 125int kvm_s390_handle_01(struct kvm_vcpu *vcpu);
126int kvm_s390_handle_b9(struct kvm_vcpu *vcpu);
127int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu);
128int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu);
83 129
84/* implemented in sigp.c */ 130/* implemented in sigp.c */
85int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 131int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d768906f15c8..0ef9894606e5 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -18,23 +18,21 @@
18#include <asm/debug.h> 18#include <asm/debug.h>
19#include <asm/ebcdic.h> 19#include <asm/ebcdic.h>
20#include <asm/sysinfo.h> 20#include <asm/sysinfo.h>
21#include <asm/ptrace.h>
22#include <asm/compat.h>
21#include "gaccess.h" 23#include "gaccess.h"
22#include "kvm-s390.h" 24#include "kvm-s390.h"
23#include "trace.h" 25#include "trace.h"
24 26
25static int handle_set_prefix(struct kvm_vcpu *vcpu) 27static int handle_set_prefix(struct kvm_vcpu *vcpu)
26{ 28{
27 int base2 = vcpu->arch.sie_block->ipb >> 28;
28 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
29 u64 operand2; 29 u64 operand2;
30 u32 address = 0; 30 u32 address = 0;
31 u8 tmp; 31 u8 tmp;
32 32
33 vcpu->stat.instruction_spx++; 33 vcpu->stat.instruction_spx++;
34 34
35 operand2 = disp2; 35 operand2 = kvm_s390_get_base_disp_s(vcpu);
36 if (base2)
37 operand2 += vcpu->run->s.regs.gprs[base2];
38 36
39 /* must be word boundary */ 37 /* must be word boundary */
40 if (operand2 & 3) { 38 if (operand2 & 3) {
@@ -67,15 +65,12 @@ out:
67 65
68static int handle_store_prefix(struct kvm_vcpu *vcpu) 66static int handle_store_prefix(struct kvm_vcpu *vcpu)
69{ 67{
70 int base2 = vcpu->arch.sie_block->ipb >> 28;
71 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
72 u64 operand2; 68 u64 operand2;
73 u32 address; 69 u32 address;
74 70
75 vcpu->stat.instruction_stpx++; 71 vcpu->stat.instruction_stpx++;
76 operand2 = disp2; 72
77 if (base2) 73 operand2 = kvm_s390_get_base_disp_s(vcpu);
78 operand2 += vcpu->run->s.regs.gprs[base2];
79 74
80 /* must be word boundary */ 75 /* must be word boundary */
81 if (operand2 & 3) { 76 if (operand2 & 3) {
@@ -100,15 +95,12 @@ out:
100 95
101static int handle_store_cpu_address(struct kvm_vcpu *vcpu) 96static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
102{ 97{
103 int base2 = vcpu->arch.sie_block->ipb >> 28;
104 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
105 u64 useraddr; 98 u64 useraddr;
106 int rc; 99 int rc;
107 100
108 vcpu->stat.instruction_stap++; 101 vcpu->stat.instruction_stap++;
109 useraddr = disp2; 102
110 if (base2) 103 useraddr = kvm_s390_get_base_disp_s(vcpu);
111 useraddr += vcpu->run->s.regs.gprs[base2];
112 104
113 if (useraddr & 1) { 105 if (useraddr & 1) {
114 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 106 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -135,24 +127,96 @@ static int handle_skey(struct kvm_vcpu *vcpu)
135 return 0; 127 return 0;
136} 128}
137 129
138static int handle_stsch(struct kvm_vcpu *vcpu) 130static int handle_tpi(struct kvm_vcpu *vcpu)
139{ 131{
140 vcpu->stat.instruction_stsch++; 132 u64 addr;
141 VCPU_EVENT(vcpu, 4, "%s", "store subchannel - CC3"); 133 struct kvm_s390_interrupt_info *inti;
142 /* condition code 3 */ 134 int cc;
135
136 addr = kvm_s390_get_base_disp_s(vcpu);
137
138 inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
139 if (inti) {
140 if (addr) {
141 /*
142 * Store the two-word I/O interruption code into the
143 * provided area.
144 */
145 put_guest_u16(vcpu, addr, inti->io.subchannel_id);
146 put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr);
147 put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm);
148 } else {
149 /*
150 * Store the three-word I/O interruption code into
151 * the appropriate lowcore area.
152 */
153 put_guest_u16(vcpu, 184, inti->io.subchannel_id);
154 put_guest_u16(vcpu, 186, inti->io.subchannel_nr);
155 put_guest_u32(vcpu, 188, inti->io.io_int_parm);
156 put_guest_u32(vcpu, 192, inti->io.io_int_word);
157 }
158 cc = 1;
159 } else
160 cc = 0;
161 kfree(inti);
162 /* Set condition code and we're done. */
143 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 163 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
144 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44; 164 vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
145 return 0; 165 return 0;
146} 166}
147 167
148static int handle_chsc(struct kvm_vcpu *vcpu) 168static int handle_tsch(struct kvm_vcpu *vcpu)
149{ 169{
150 vcpu->stat.instruction_chsc++; 170 struct kvm_s390_interrupt_info *inti;
151 VCPU_EVENT(vcpu, 4, "%s", "channel subsystem call - CC3"); 171
152 /* condition code 3 */ 172 inti = kvm_s390_get_io_int(vcpu->kvm, 0,
153 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44); 173 vcpu->run->s.regs.gprs[1]);
154 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44; 174
155 return 0; 175 /*
176 * Prepare exit to userspace.
177 * We indicate whether we dequeued a pending I/O interrupt
178 * so that userspace can re-inject it if the instruction gets
179 * a program check. While this may re-order the pending I/O
180 * interrupts, this is no problem since the priority is kept
181 * intact.
182 */
183 vcpu->run->exit_reason = KVM_EXIT_S390_TSCH;
184 vcpu->run->s390_tsch.dequeued = !!inti;
185 if (inti) {
186 vcpu->run->s390_tsch.subchannel_id = inti->io.subchannel_id;
187 vcpu->run->s390_tsch.subchannel_nr = inti->io.subchannel_nr;
188 vcpu->run->s390_tsch.io_int_parm = inti->io.io_int_parm;
189 vcpu->run->s390_tsch.io_int_word = inti->io.io_int_word;
190 }
191 vcpu->run->s390_tsch.ipb = vcpu->arch.sie_block->ipb;
192 kfree(inti);
193 return -EREMOTE;
194}
195
196static int handle_io_inst(struct kvm_vcpu *vcpu)
197{
198 VCPU_EVENT(vcpu, 4, "%s", "I/O instruction");
199
200 if (vcpu->kvm->arch.css_support) {
201 /*
202 * Most I/O instructions will be handled by userspace.
203 * Exceptions are tpi and the interrupt portion of tsch.
204 */
205 if (vcpu->arch.sie_block->ipa == 0xb236)
206 return handle_tpi(vcpu);
207 if (vcpu->arch.sie_block->ipa == 0xb235)
208 return handle_tsch(vcpu);
209 /* Handle in userspace. */
210 return -EOPNOTSUPP;
211 } else {
212 /*
213 * Set condition code 3 to stop the guest from issueing channel
214 * I/O instructions.
215 */
216 vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
217 vcpu->arch.sie_block->gpsw.mask |= (3 & 3ul) << 44;
218 return 0;
219 }
156} 220}
157 221
158static int handle_stfl(struct kvm_vcpu *vcpu) 222static int handle_stfl(struct kvm_vcpu *vcpu)
@@ -176,17 +240,107 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
176 return 0; 240 return 0;
177} 241}
178 242
243static void handle_new_psw(struct kvm_vcpu *vcpu)
244{
245 /* Check whether the new psw is enabled for machine checks. */
246 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK)
247 kvm_s390_deliver_pending_machine_checks(vcpu);
248}
249
250#define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
251#define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
252#define PSW_ADDR_24 0x00000000000fffffUL
253#define PSW_ADDR_31 0x000000007fffffffUL
254
255int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
256{
257 u64 addr;
258 psw_compat_t new_psw;
259
260 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
261 return kvm_s390_inject_program_int(vcpu,
262 PGM_PRIVILEGED_OPERATION);
263
264 addr = kvm_s390_get_base_disp_s(vcpu);
265
266 if (addr & 7) {
267 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
268 goto out;
269 }
270
271 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
272 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
273 goto out;
274 }
275
276 if (!(new_psw.mask & PSW32_MASK_BASE)) {
277 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
278 goto out;
279 }
280
281 vcpu->arch.sie_block->gpsw.mask =
282 (new_psw.mask & ~PSW32_MASK_BASE) << 32;
283 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
284
285 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
286 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
287 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
288 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
289 PSW_MASK_EA)) {
290 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
291 goto out;
292 }
293
294 handle_new_psw(vcpu);
295out:
296 return 0;
297}
298
299static int handle_lpswe(struct kvm_vcpu *vcpu)
300{
301 u64 addr;
302 psw_t new_psw;
303
304 addr = kvm_s390_get_base_disp_s(vcpu);
305
306 if (addr & 7) {
307 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
308 goto out;
309 }
310
311 if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
312 kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
313 goto out;
314 }
315
316 vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
317 vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
318
319 if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
320 (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
321 PSW_MASK_BA) &&
322 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
323 (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
324 (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
325 ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
326 PSW_MASK_EA)) {
327 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
328 goto out;
329 }
330
331 handle_new_psw(vcpu);
332out:
333 return 0;
334}
335
179static int handle_stidp(struct kvm_vcpu *vcpu) 336static int handle_stidp(struct kvm_vcpu *vcpu)
180{ 337{
181 int base2 = vcpu->arch.sie_block->ipb >> 28;
182 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
183 u64 operand2; 338 u64 operand2;
184 int rc; 339 int rc;
185 340
186 vcpu->stat.instruction_stidp++; 341 vcpu->stat.instruction_stidp++;
187 operand2 = disp2; 342
188 if (base2) 343 operand2 = kvm_s390_get_base_disp_s(vcpu);
189 operand2 += vcpu->run->s.regs.gprs[base2];
190 344
191 if (operand2 & 7) { 345 if (operand2 & 7) {
192 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 346 kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -240,17 +394,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
240 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28; 394 int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
241 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff; 395 int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
242 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff; 396 int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
243 int base2 = vcpu->arch.sie_block->ipb >> 28;
244 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
245 u64 operand2; 397 u64 operand2;
246 unsigned long mem; 398 unsigned long mem;
247 399
248 vcpu->stat.instruction_stsi++; 400 vcpu->stat.instruction_stsi++;
249 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); 401 VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
250 402
251 operand2 = disp2; 403 operand2 = kvm_s390_get_base_disp_s(vcpu);
252 if (base2)
253 operand2 += vcpu->run->s.regs.gprs[base2];
254 404
255 if (operand2 & 0xfff && fc > 0) 405 if (operand2 & 0xfff && fc > 0)
256 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); 406 return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -297,7 +447,7 @@ out_fail:
297 return 0; 447 return 0;
298} 448}
299 449
300static intercept_handler_t priv_handlers[256] = { 450static const intercept_handler_t b2_handlers[256] = {
301 [0x02] = handle_stidp, 451 [0x02] = handle_stidp,
302 [0x10] = handle_set_prefix, 452 [0x10] = handle_set_prefix,
303 [0x11] = handle_store_prefix, 453 [0x11] = handle_store_prefix,
@@ -305,10 +455,25 @@ static intercept_handler_t priv_handlers[256] = {
305 [0x29] = handle_skey, 455 [0x29] = handle_skey,
306 [0x2a] = handle_skey, 456 [0x2a] = handle_skey,
307 [0x2b] = handle_skey, 457 [0x2b] = handle_skey,
308 [0x34] = handle_stsch, 458 [0x30] = handle_io_inst,
309 [0x5f] = handle_chsc, 459 [0x31] = handle_io_inst,
460 [0x32] = handle_io_inst,
461 [0x33] = handle_io_inst,
462 [0x34] = handle_io_inst,
463 [0x35] = handle_io_inst,
464 [0x36] = handle_io_inst,
465 [0x37] = handle_io_inst,
466 [0x38] = handle_io_inst,
467 [0x39] = handle_io_inst,
468 [0x3a] = handle_io_inst,
469 [0x3b] = handle_io_inst,
470 [0x3c] = handle_io_inst,
471 [0x5f] = handle_io_inst,
472 [0x74] = handle_io_inst,
473 [0x76] = handle_io_inst,
310 [0x7d] = handle_stsi, 474 [0x7d] = handle_stsi,
311 [0xb1] = handle_stfl, 475 [0xb1] = handle_stfl,
476 [0xb2] = handle_lpswe,
312}; 477};
313 478
314int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) 479int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
@@ -322,7 +487,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
322 * state bit and (a) handle the instruction or (b) send a code 2 487 * state bit and (a) handle the instruction or (b) send a code 2
323 * program check. 488 * program check.
324 * Anything else goes to userspace.*/ 489 * Anything else goes to userspace.*/
325 handler = priv_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; 490 handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
326 if (handler) { 491 if (handler) {
327 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) 492 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
328 return kvm_s390_inject_program_int(vcpu, 493 return kvm_s390_inject_program_int(vcpu,
@@ -333,19 +498,74 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
333 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
334} 499}
335 500
501static int handle_epsw(struct kvm_vcpu *vcpu)
502{
503 int reg1, reg2;
504
505 reg1 = (vcpu->arch.sie_block->ipb & 0x00f00000) >> 24;
506 reg2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
507
508 /* This basically extracts the mask half of the psw. */
509 vcpu->run->s.regs.gprs[reg1] &= 0xffffffff00000000;
510 vcpu->run->s.regs.gprs[reg1] |= vcpu->arch.sie_block->gpsw.mask >> 32;
511 if (reg2) {
512 vcpu->run->s.regs.gprs[reg2] &= 0xffffffff00000000;
513 vcpu->run->s.regs.gprs[reg2] |=
514 vcpu->arch.sie_block->gpsw.mask & 0x00000000ffffffff;
515 }
516 return 0;
517}
518
519static const intercept_handler_t b9_handlers[256] = {
520 [0x8d] = handle_epsw,
521 [0x9c] = handle_io_inst,
522};
523
524int kvm_s390_handle_b9(struct kvm_vcpu *vcpu)
525{
526 intercept_handler_t handler;
527
528 /* This is handled just as for the B2 instructions. */
529 handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
530 if (handler) {
531 if ((handler != handle_epsw) &&
532 (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE))
533 return kvm_s390_inject_program_int(vcpu,
534 PGM_PRIVILEGED_OPERATION);
535 else
536 return handler(vcpu);
537 }
538 return -EOPNOTSUPP;
539}
540
541static const intercept_handler_t eb_handlers[256] = {
542 [0x8a] = handle_io_inst,
543};
544
545int kvm_s390_handle_priv_eb(struct kvm_vcpu *vcpu)
546{
547 intercept_handler_t handler;
548
549 /* All eb instructions that end up here are privileged. */
550 if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
551 return kvm_s390_inject_program_int(vcpu,
552 PGM_PRIVILEGED_OPERATION);
553 handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff];
554 if (handler)
555 return handler(vcpu);
556 return -EOPNOTSUPP;
557}
558
336static int handle_tprot(struct kvm_vcpu *vcpu) 559static int handle_tprot(struct kvm_vcpu *vcpu)
337{ 560{
338 int base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; 561 u64 address1, address2;
339 int disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
340 int base2 = (vcpu->arch.sie_block->ipb & 0xf000) >> 12;
341 int disp2 = vcpu->arch.sie_block->ipb & 0x0fff;
342 u64 address1 = disp1 + base1 ? vcpu->run->s.regs.gprs[base1] : 0;
343 u64 address2 = disp2 + base2 ? vcpu->run->s.regs.gprs[base2] : 0;
344 struct vm_area_struct *vma; 562 struct vm_area_struct *vma;
345 unsigned long user_address; 563 unsigned long user_address;
346 564
347 vcpu->stat.instruction_tprot++; 565 vcpu->stat.instruction_tprot++;
348 566
567 kvm_s390_get_base_disp_sse(vcpu, &address1, &address2);
568
349 /* we only handle the Linux memory detection case: 569 /* we only handle the Linux memory detection case:
350 * access key == 0 570 * access key == 0
351 * guest DAT == off 571 * guest DAT == off
@@ -405,7 +625,7 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
405 return 0; 625 return 0;
406} 626}
407 627
408static intercept_handler_t x01_handlers[256] = { 628static const intercept_handler_t x01_handlers[256] = {
409 [0x07] = handle_sckpf, 629 [0x07] = handle_sckpf,
410}; 630};
411 631
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 566ddf6e8dfb..1c48ab2845e0 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -137,8 +137,10 @@ static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
137 inti->type = KVM_S390_SIGP_STOP; 137 inti->type = KVM_S390_SIGP_STOP;
138 138
139 spin_lock_bh(&li->lock); 139 spin_lock_bh(&li->lock);
140 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) 140 if ((atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
141 kfree(inti);
141 goto out; 142 goto out;
143 }
142 list_add_tail(&inti->list, &li->list); 144 list_add_tail(&inti->list, &li->list);
143 atomic_set(&li->active, 1); 145 atomic_set(&li->active, 1);
144 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 146 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
@@ -324,8 +326,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
324{ 326{
325 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4; 327 int r1 = (vcpu->arch.sie_block->ipa & 0x00f0) >> 4;
326 int r3 = vcpu->arch.sie_block->ipa & 0x000f; 328 int r3 = vcpu->arch.sie_block->ipa & 0x000f;
327 int base2 = vcpu->arch.sie_block->ipb >> 28;
328 int disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
329 u32 parameter; 329 u32 parameter;
330 u16 cpu_addr = vcpu->run->s.regs.gprs[r3]; 330 u16 cpu_addr = vcpu->run->s.regs.gprs[r3];
331 u8 order_code; 331 u8 order_code;
@@ -336,9 +336,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
336 return kvm_s390_inject_program_int(vcpu, 336 return kvm_s390_inject_program_int(vcpu,
337 PGM_PRIVILEGED_OPERATION); 337 PGM_PRIVILEGED_OPERATION);
338 338
339 order_code = disp2; 339 order_code = kvm_s390_get_base_disp_rs(vcpu);
340 if (base2)
341 order_code += vcpu->run->s.regs.gprs[base2];
342 340
343 if (r1 % 2) 341 if (r1 % 2)
344 parameter = vcpu->run->s.regs.gprs[r1]; 342 parameter = vcpu->run->s.regs.gprs[r1];
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 90fdf85b5ff7..13f30f58a2df 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -141,13 +141,13 @@ TRACE_EVENT(kvm_s390_inject_vcpu,
141 * Trace point for the actual delivery of interrupts. 141 * Trace point for the actual delivery of interrupts.
142 */ 142 */
143TRACE_EVENT(kvm_s390_deliver_interrupt, 143TRACE_EVENT(kvm_s390_deliver_interrupt,
144 TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1), 144 TP_PROTO(unsigned int id, __u64 type, __u64 data0, __u64 data1),
145 TP_ARGS(id, type, data0, data1), 145 TP_ARGS(id, type, data0, data1),
146 146
147 TP_STRUCT__entry( 147 TP_STRUCT__entry(
148 __field(int, id) 148 __field(int, id)
149 __field(__u32, inttype) 149 __field(__u32, inttype)
150 __field(__u32, data0) 150 __field(__u64, data0)
151 __field(__u64, data1) 151 __field(__u64, data1)
152 ), 152 ),
153 153
@@ -159,7 +159,7 @@ TRACE_EVENT(kvm_s390_deliver_interrupt,
159 ), 159 ),
160 160
161 TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ 161 TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
162 "data:%08x %016llx", 162 "data:%08llx %016llx",
163 __entry->id, __entry->inttype, 163 __entry->id, __entry->inttype,
164 __print_symbolic(__entry->inttype, kvm_s390_int_type), 164 __print_symbolic(__entry->inttype, kvm_s390_int_type),
165 __entry->data0, __entry->data1) 165 __entry->data0, __entry->data1)
@@ -204,6 +204,26 @@ TRACE_EVENT(kvm_s390_stop_request,
204 ); 204 );
205 205
206 206
207/*
208 * Trace point for enabling channel I/O instruction support.
209 */
210TRACE_EVENT(kvm_s390_enable_css,
211 TP_PROTO(void *kvm),
212 TP_ARGS(kvm),
213
214 TP_STRUCT__entry(
215 __field(void *, kvm)
216 ),
217
218 TP_fast_assign(
219 __entry->kvm = kvm;
220 ),
221
222 TP_printk("enabling channel I/O support (kvm @ %p)\n",
223 __entry->kvm)
224 );
225
226
207#endif /* _TRACE_KVMS390_H */ 227#endif /* _TRACE_KVMS390_H */
208 228
209/* This part must be outside protection */ 229/* This part must be outside protection */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3a..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
33 33
34#define KVM_MAX_VCPUS 254 34#define KVM_MAX_VCPUS 254
35#define KVM_SOFT_MAX_VCPUS 160 35#define KVM_SOFT_MAX_VCPUS 160
36#define KVM_MEMORY_SLOTS 32 36#define KVM_USER_MEM_SLOTS 125
37/* memory slots that does not exposed to userspace */ 37/* memory slots that are not exposed to userspace */
38#define KVM_PRIVATE_MEM_SLOTS 4 38#define KVM_PRIVATE_MEM_SLOTS 3
39#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 39#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
40 40
41#define KVM_MMIO_SIZE 16 41#define KVM_MMIO_SIZE 16
42 42
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
219 u64 *spt; 219 u64 *spt;
220 /* hold the gfn of each spte inside spt */ 220 /* hold the gfn of each spte inside spt */
221 gfn_t *gfns; 221 gfn_t *gfns;
222 /*
223 * One bit set per slot which has memory
224 * in this shadow page.
225 */
226 DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
227 bool unsync; 222 bool unsync;
228 int root_count; /* Currently serving as active root */ 223 int root_count; /* Currently serving as active root */
229 unsigned int unsync_children; 224 unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
502 u64 msr_val; 497 u64 msr_val;
503 struct gfn_to_hva_cache data; 498 struct gfn_to_hva_cache data;
504 } pv_eoi; 499 } pv_eoi;
500
501 /*
502 * Indicate whether the access faults on its page table in guest
503 * which is set when fix page fault and used to detect unhandeable
504 * instruction.
505 */
506 bool write_fault_to_shadow_pgtable;
505}; 507};
506 508
507struct kvm_lpage_info { 509struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
697 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
698 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 700 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
699 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
702 int (*vm_has_apicv)(struct kvm *kvm);
703 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
704 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
705 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
706 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
700 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
701 int (*get_tdp_level)(void); 708 int (*get_tdp_level)(void);
702 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
991int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 998int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
992void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 999void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
993int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 1000int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
1001int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
994int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1002int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
995int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1003int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
996int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1004int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173baf..695399f2d5eb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
27 * 27 *
28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. 28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
29 * The hypercall number should be placed in rax and the return value will be 29 * The hypercall number should be placed in rax and the return value will be
30 * placed in rax. No other registers will be clobbered unless explicited 30 * placed in rax. No other registers will be clobbered unless explicitly
31 * noted by the particular hypercall. 31 * noted by the particular hypercall.
32 */ 32 */
33 33
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554b..b6fbf860e398 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
59#define SECONDARY_EXEC_RDTSCP 0x00000008 59#define SECONDARY_EXEC_RDTSCP 0x00000008
60#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
60#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 61#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
61#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 62#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
62#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 63#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
64#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
63#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
64#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
65 68
@@ -97,6 +100,7 @@ enum vmcs_field {
97 GUEST_GS_SELECTOR = 0x0000080a, 100 GUEST_GS_SELECTOR = 0x0000080a,
98 GUEST_LDTR_SELECTOR = 0x0000080c, 101 GUEST_LDTR_SELECTOR = 0x0000080c,
99 GUEST_TR_SELECTOR = 0x0000080e, 102 GUEST_TR_SELECTOR = 0x0000080e,
103 GUEST_INTR_STATUS = 0x00000810,
100 HOST_ES_SELECTOR = 0x00000c00, 104 HOST_ES_SELECTOR = 0x00000c00,
101 HOST_CS_SELECTOR = 0x00000c02, 105 HOST_CS_SELECTOR = 0x00000c02,
102 HOST_SS_SELECTOR = 0x00000c04, 106 HOST_SS_SELECTOR = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
124 APIC_ACCESS_ADDR_HIGH = 0x00002015, 128 APIC_ACCESS_ADDR_HIGH = 0x00002015,
125 EPT_POINTER = 0x0000201a, 129 EPT_POINTER = 0x0000201a,
126 EPT_POINTER_HIGH = 0x0000201b, 130 EPT_POINTER_HIGH = 0x0000201b,
131 EOI_EXIT_BITMAP0 = 0x0000201c,
132 EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
133 EOI_EXIT_BITMAP1 = 0x0000201e,
134 EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
135 EOI_EXIT_BITMAP2 = 0x00002020,
136 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
137 EOI_EXIT_BITMAP3 = 0x00002022,
138 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
127 GUEST_PHYSICAL_ADDRESS = 0x00002400, 139 GUEST_PHYSICAL_ADDRESS = 0x00002400,
128 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 140 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
129 VMCS_LINK_POINTER = 0x00002800, 141 VMCS_LINK_POINTER = 0x00002800,
@@ -346,9 +358,9 @@ enum vmcs_field {
346 358
347#define AR_RESERVD_MASK 0xfffe0f00 359#define AR_RESERVD_MASK 0xfffe0f00
348 360
349#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) 361#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
350#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) 362#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
351#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) 363#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
352 364
353#define VMX_NR_VPIDS (1 << 16) 365#define VMX_NR_VPIDS (1 << 16)
354#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 366#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce135..2871fccfee68 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
62#define EXIT_REASON_MCE_DURING_VMENTRY 41 62#define EXIT_REASON_MCE_DURING_VMENTRY 41
63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
64#define EXIT_REASON_APIC_ACCESS 44 64#define EXIT_REASON_APIC_ACCESS 44
65#define EXIT_REASON_EOI_INDUCED 45
65#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
66#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
67#define EXIT_REASON_WBINVD 54 68#define EXIT_REASON_WBINVD 54
68#define EXIT_REASON_XSETBV 55 69#define EXIT_REASON_XSETBV 55
70#define EXIT_REASON_APIC_WRITE 56
69#define EXIT_REASON_INVPCID 58 71#define EXIT_REASON_INVPCID 58
70 72
71#define VMX_EXIT_REASONS \ 73#define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
103 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 105 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
104 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 106 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
105 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 107 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
106 { EXIT_REASON_WBINVD, "WBINVD" } 108 { EXIT_REASON_WBINVD, "WBINVD" }, \
109 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
110 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
111 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
112 { EXIT_REASON_INVD, "INVD" }, \
113 { EXIT_REASON_INVPCID, "INVPCID" }
107 114
108 115
109#endif /* _UAPIVMX_H */ 116#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9f966dc0b9e4..0732f0089a3d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
218void __init kvmclock_init(void) 218void __init kvmclock_init(void)
219{ 219{
220 unsigned long mem; 220 unsigned long mem;
221 int size;
222
223 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
221 224
222 if (!kvm_para_available()) 225 if (!kvm_para_available())
223 return; 226 return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 234 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
232 msr_kvm_system_time, msr_kvm_wall_clock); 235 msr_kvm_system_time, msr_kvm_wall_clock);
233 236
234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, 237 mem = memblock_alloc(size, PAGE_SIZE);
235 PAGE_SIZE);
236 if (!mem) 238 if (!mem)
237 return; 239 return;
238 hv_clock = __va(mem); 240 hv_clock = __va(mem);
239 241
240 if (kvm_register_clock("boot clock")) { 242 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL; 243 hv_clock = NULL;
242 memblock_free(mem, 244 memblock_free(mem, size);
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
244 return; 245 return;
245 } 246 }
246 pv_time_ops.sched_clock = kvm_clock_read; 247 pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
275 struct pvclock_vcpu_time_info *vcpu_time; 276 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size; 277 unsigned int size;
277 278
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; 279 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
279 280
280 preempt_disable(); 281 preempt_disable();
281 cpu = smp_processor_id(); 282 cpu = smp_processor_id();
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e76371108..a335cc6cde72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
27#include <linux/stringify.h>
27 28
28#include "x86.h" 29#include "x86.h"
29#include "tss.h" 30#include "tss.h"
@@ -43,7 +44,7 @@
43#define OpCL 9ull /* CL register (for shifts) */ 44#define OpCL 9ull /* CL register (for shifts) */
44#define OpImmByte 10ull /* 8-bit sign extended immediate */ 45#define OpImmByte 10ull /* 8-bit sign extended immediate */
45#define OpOne 11ull /* Implied 1 */ 46#define OpOne 11ull /* Implied 1 */
46#define OpImm 12ull /* Sign extended immediate */ 47#define OpImm 12ull /* Sign extended up to 32-bit immediate */
47#define OpMem16 13ull /* Memory operand (16-bit). */ 48#define OpMem16 13ull /* Memory operand (16-bit). */
48#define OpMem32 14ull /* Memory operand (32-bit). */ 49#define OpMem32 14ull /* Memory operand (32-bit). */
49#define OpImmU 15ull /* Immediate operand, zero extended */ 50#define OpImmU 15ull /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
58#define OpFS 24ull /* FS */ 59#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */ 60#define OpGS 25ull /* GS */
60#define OpMem8 26ull /* 8-bit zero extended memory operand */ 61#define OpMem8 26ull /* 8-bit zero extended memory operand */
62#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
61 63
62#define OpBits 5 /* Width of operand field */ 64#define OpBits 5 /* Width of operand field */
63#define OpMask ((1ull << OpBits) - 1) 65#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
101#define SrcMemFAddr (OpMemFAddr << SrcShift) 103#define SrcMemFAddr (OpMemFAddr << SrcShift)
102#define SrcAcc (OpAcc << SrcShift) 104#define SrcAcc (OpAcc << SrcShift)
103#define SrcImmU16 (OpImmU16 << SrcShift) 105#define SrcImmU16 (OpImmU16 << SrcShift)
106#define SrcImm64 (OpImm64 << SrcShift)
104#define SrcDX (OpDX << SrcShift) 107#define SrcDX (OpDX << SrcShift)
105#define SrcMem8 (OpMem8 << SrcShift) 108#define SrcMem8 (OpMem8 << SrcShift)
106#define SrcMask (OpMask << SrcShift) 109#define SrcMask (OpMask << SrcShift)
@@ -113,6 +116,7 @@
113#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ 116#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
114#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ 117#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
115#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 118#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
119#define Escape (5<<15) /* Escape to coprocessor instruction */
116#define Sse (1<<18) /* SSE Vector instruction */ 120#define Sse (1<<18) /* SSE Vector instruction */
117/* Generic ModRM decode. */ 121/* Generic ModRM decode. */
118#define ModRM (1<<19) 122#define ModRM (1<<19)
@@ -146,6 +150,8 @@
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ 150#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ 151#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ 152#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
153#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
154#define NoWrite ((u64)1 << 45) /* No writeback */
149 155
150#define X2(x...) x, x 156#define X2(x...) x, x
151#define X3(x...) X2(x), x 157#define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
156#define X8(x...) X4(x), X4(x) 162#define X8(x...) X4(x), X4(x)
157#define X16(x...) X8(x), X8(x) 163#define X16(x...) X8(x), X8(x)
158 164
165#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
166#define FASTOP_SIZE 8
167
168/*
169 * fastop functions have a special calling convention:
170 *
171 * dst: [rdx]:rax (in/out)
172 * src: rbx (in/out)
173 * src2: rcx (in)
174 * flags: rflags (in/out)
175 *
176 * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
177 * different operand sizes can be reached by calculation, rather than a jump
178 * table (which would be bigger than the code).
179 *
180 * fastop functions are declared as taking a never-defined fastop parameter,
181 * so they can't be called from C directly.
182 */
183
184struct fastop;
185
159struct opcode { 186struct opcode {
160 u64 flags : 56; 187 u64 flags : 56;
161 u64 intercept : 8; 188 u64 intercept : 8;
@@ -164,6 +191,8 @@ struct opcode {
164 const struct opcode *group; 191 const struct opcode *group;
165 const struct group_dual *gdual; 192 const struct group_dual *gdual;
166 const struct gprefix *gprefix; 193 const struct gprefix *gprefix;
194 const struct escape *esc;
195 void (*fastop)(struct fastop *fake);
167 } u; 196 } u;
168 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 197 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
169}; 198};
@@ -180,6 +209,11 @@ struct gprefix {
180 struct opcode pfx_f3; 209 struct opcode pfx_f3;
181}; 210};
182 211
212struct escape {
213 struct opcode op[8];
214 struct opcode high[64];
215};
216
183/* EFLAGS bit definitions. */ 217/* EFLAGS bit definitions. */
184#define EFLG_ID (1<<21) 218#define EFLG_ID (1<<21)
185#define EFLG_VIP (1<<20) 219#define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
407 } \ 441 } \
408 } while (0) 442 } while (0)
409 443
444static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
445
446#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
447#define FOP_RET "ret \n\t"
448
449#define FOP_START(op) \
450 extern void em_##op(struct fastop *fake); \
451 asm(".pushsection .text, \"ax\" \n\t" \
452 ".global em_" #op " \n\t" \
453 FOP_ALIGN \
454 "em_" #op ": \n\t"
455
456#define FOP_END \
457 ".popsection")
458
459#define FOPNOP() FOP_ALIGN FOP_RET
460
461#define FOP1E(op, dst) \
462 FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
463
464#define FASTOP1(op) \
465 FOP_START(op) \
466 FOP1E(op##b, al) \
467 FOP1E(op##w, ax) \
468 FOP1E(op##l, eax) \
469 ON64(FOP1E(op##q, rax)) \
470 FOP_END
471
472#define FOP2E(op, dst, src) \
473 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
474
475#define FASTOP2(op) \
476 FOP_START(op) \
477 FOP2E(op##b, al, bl) \
478 FOP2E(op##w, ax, bx) \
479 FOP2E(op##l, eax, ebx) \
480 ON64(FOP2E(op##q, rax, rbx)) \
481 FOP_END
482
483/* 2 operand, word only */
484#define FASTOP2W(op) \
485 FOP_START(op) \
486 FOPNOP() \
487 FOP2E(op##w, ax, bx) \
488 FOP2E(op##l, eax, ebx) \
489 ON64(FOP2E(op##q, rax, rbx)) \
490 FOP_END
491
492/* 2 operand, src is CL */
493#define FASTOP2CL(op) \
494 FOP_START(op) \
495 FOP2E(op##b, al, cl) \
496 FOP2E(op##w, ax, cl) \
497 FOP2E(op##l, eax, cl) \
498 ON64(FOP2E(op##q, rax, cl)) \
499 FOP_END
500
501#define FOP3E(op, dst, src, src2) \
502 FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
503
504/* 3-operand, word-only, src2=cl */
505#define FASTOP3WCL(op) \
506 FOP_START(op) \
507 FOPNOP() \
508 FOP3E(op##w, ax, bx, cl) \
509 FOP3E(op##l, eax, ebx, cl) \
510 ON64(FOP3E(op##q, rax, rbx, cl)) \
511 FOP_END
512
513/* Special case for SETcc - 1 instruction per cc */
514#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
515
516FOP_START(setcc)
517FOP_SETCC(seto)
518FOP_SETCC(setno)
519FOP_SETCC(setc)
520FOP_SETCC(setnc)
521FOP_SETCC(setz)
522FOP_SETCC(setnz)
523FOP_SETCC(setbe)
524FOP_SETCC(setnbe)
525FOP_SETCC(sets)
526FOP_SETCC(setns)
527FOP_SETCC(setp)
528FOP_SETCC(setnp)
529FOP_SETCC(setl)
530FOP_SETCC(setnl)
531FOP_SETCC(setle)
532FOP_SETCC(setnle)
533FOP_END;
534
410#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ 535#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
411 do { \ 536 do { \
412 unsigned long _tmp; \ 537 unsigned long _tmp; \
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
663 ulong la; 788 ulong la;
664 u32 lim; 789 u32 lim;
665 u16 sel; 790 u16 sel;
666 unsigned cpl, rpl; 791 unsigned cpl;
667 792
668 la = seg_base(ctxt, addr.seg) + addr.ea; 793 la = seg_base(ctxt, addr.seg) + addr.ea;
669 switch (ctxt->mode) { 794 switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
697 goto bad; 822 goto bad;
698 } 823 }
699 cpl = ctxt->ops->cpl(ctxt); 824 cpl = ctxt->ops->cpl(ctxt);
700 if (ctxt->mode == X86EMUL_MODE_REAL)
701 rpl = 0;
702 else
703 rpl = sel & 3;
704 cpl = max(cpl, rpl);
705 if (!(desc.type & 8)) { 825 if (!(desc.type & 8)) {
706 /* data segment */ 826 /* data segment */
707 if (cpl > desc.dpl) 827 if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
852 return rc; 972 return rc;
853} 973}
854 974
855static int test_cc(unsigned int condition, unsigned int flags) 975FASTOP2(add);
856{ 976FASTOP2(or);
857 int rc = 0; 977FASTOP2(adc);
858 978FASTOP2(sbb);
859 switch ((condition & 15) >> 1) { 979FASTOP2(and);
860 case 0: /* o */ 980FASTOP2(sub);
861 rc |= (flags & EFLG_OF); 981FASTOP2(xor);
862 break; 982FASTOP2(cmp);
863 case 1: /* b/c/nae */ 983FASTOP2(test);
864 rc |= (flags & EFLG_CF); 984
865 break; 985FASTOP3WCL(shld);
866 case 2: /* z/e */ 986FASTOP3WCL(shrd);
867 rc |= (flags & EFLG_ZF); 987
868 break; 988FASTOP2W(imul);
869 case 3: /* be/na */ 989
870 rc |= (flags & (EFLG_CF|EFLG_ZF)); 990FASTOP1(not);
871 break; 991FASTOP1(neg);
872 case 4: /* s */ 992FASTOP1(inc);
873 rc |= (flags & EFLG_SF); 993FASTOP1(dec);
874 break; 994
875 case 5: /* p/pe */ 995FASTOP2CL(rol);
876 rc |= (flags & EFLG_PF); 996FASTOP2CL(ror);
877 break; 997FASTOP2CL(rcl);
878 case 7: /* le/ng */ 998FASTOP2CL(rcr);
879 rc |= (flags & EFLG_ZF); 999FASTOP2CL(shl);
880 /* fall through */ 1000FASTOP2CL(shr);
881 case 6: /* l/nge */ 1001FASTOP2CL(sar);
882 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); 1002
883 break; 1003FASTOP2W(bsf);
884 } 1004FASTOP2W(bsr);
885 1005FASTOP2W(bt);
886 /* Odd condition identifiers (lsb == 1) have inverted sense. */ 1006FASTOP2W(bts);
887 return (!!rc ^ (condition & 1)); 1007FASTOP2W(btr);
1008FASTOP2W(btc);
1009
1010static u8 test_cc(unsigned int condition, unsigned long flags)
1011{
1012 u8 rc;
1013 void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1014
1015 flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1016 asm("push %[flags]; popf; call *%[fastop]"
1017 : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1018 return rc;
888} 1019}
889 1020
890static void fetch_register_operand(struct operand *op) 1021static void fetch_register_operand(struct operand *op)
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
994 ctxt->ops->put_fpu(ctxt); 1125 ctxt->ops->put_fpu(ctxt);
995} 1126}
996 1127
1128static int em_fninit(struct x86_emulate_ctxt *ctxt)
1129{
1130 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1131 return emulate_nm(ctxt);
1132
1133 ctxt->ops->get_fpu(ctxt);
1134 asm volatile("fninit");
1135 ctxt->ops->put_fpu(ctxt);
1136 return X86EMUL_CONTINUE;
1137}
1138
1139static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
1140{
1141 u16 fcw;
1142
1143 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1144 return emulate_nm(ctxt);
1145
1146 ctxt->ops->get_fpu(ctxt);
1147 asm volatile("fnstcw %0": "+m"(fcw));
1148 ctxt->ops->put_fpu(ctxt);
1149
1150 /* force 2 byte destination */
1151 ctxt->dst.bytes = 2;
1152 ctxt->dst.val = fcw;
1153
1154 return X86EMUL_CONTINUE;
1155}
1156
1157static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
1158{
1159 u16 fsw;
1160
1161 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1162 return emulate_nm(ctxt);
1163
1164 ctxt->ops->get_fpu(ctxt);
1165 asm volatile("fnstsw %0": "+m"(fsw));
1166 ctxt->ops->put_fpu(ctxt);
1167
1168 /* force 2 byte destination */
1169 ctxt->dst.bytes = 2;
1170 ctxt->dst.val = fsw;
1171
1172 return X86EMUL_CONTINUE;
1173}
1174
997static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1175static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
998 struct operand *op) 1176 struct operand *op)
999{ 1177{
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1534{ 1712{
1535 int rc; 1713 int rc;
1536 1714
1715 if (ctxt->d & NoWrite)
1716 return X86EMUL_CONTINUE;
1717
1537 switch (ctxt->dst.type) { 1718 switch (ctxt->dst.type) {
1538 case OP_REG: 1719 case OP_REG:
1539 write_register_operand(&ctxt->dst); 1720 write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1918 return X86EMUL_CONTINUE; 2099 return X86EMUL_CONTINUE;
1919} 2100}
1920 2101
1921static int em_grp2(struct x86_emulate_ctxt *ctxt)
1922{
1923 switch (ctxt->modrm_reg) {
1924 case 0: /* rol */
1925 emulate_2op_SrcB(ctxt, "rol");
1926 break;
1927 case 1: /* ror */
1928 emulate_2op_SrcB(ctxt, "ror");
1929 break;
1930 case 2: /* rcl */
1931 emulate_2op_SrcB(ctxt, "rcl");
1932 break;
1933 case 3: /* rcr */
1934 emulate_2op_SrcB(ctxt, "rcr");
1935 break;
1936 case 4: /* sal/shl */
1937 case 6: /* sal/shl */
1938 emulate_2op_SrcB(ctxt, "sal");
1939 break;
1940 case 5: /* shr */
1941 emulate_2op_SrcB(ctxt, "shr");
1942 break;
1943 case 7: /* sar */
1944 emulate_2op_SrcB(ctxt, "sar");
1945 break;
1946 }
1947 return X86EMUL_CONTINUE;
1948}
1949
1950static int em_not(struct x86_emulate_ctxt *ctxt)
1951{
1952 ctxt->dst.val = ~ctxt->dst.val;
1953 return X86EMUL_CONTINUE;
1954}
1955
1956static int em_neg(struct x86_emulate_ctxt *ctxt)
1957{
1958 emulate_1op(ctxt, "neg");
1959 return X86EMUL_CONTINUE;
1960}
1961
1962static int em_mul_ex(struct x86_emulate_ctxt *ctxt) 2102static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
1963{ 2103{
1964 u8 ex = 0; 2104 u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
2000 int rc = X86EMUL_CONTINUE; 2140 int rc = X86EMUL_CONTINUE;
2001 2141
2002 switch (ctxt->modrm_reg) { 2142 switch (ctxt->modrm_reg) {
2003 case 0: /* inc */
2004 emulate_1op(ctxt, "inc");
2005 break;
2006 case 1: /* dec */
2007 emulate_1op(ctxt, "dec");
2008 break;
2009 case 2: /* call near abs */ { 2143 case 2: /* call near abs */ {
2010 long int old_eip; 2144 long int old_eip;
2011 old_eip = ctxt->_eip; 2145 old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2075 /* Save real source value, then compare EAX against destination. */ 2209 /* Save real source value, then compare EAX against destination. */
2076 ctxt->src.orig_val = ctxt->src.val; 2210 ctxt->src.orig_val = ctxt->src.val;
2077 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); 2211 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
2078 emulate_2op_SrcV(ctxt, "cmp"); 2212 fastop(ctxt, em_cmp);
2079 2213
2080 if (ctxt->eflags & EFLG_ZF) { 2214 if (ctxt->eflags & EFLG_ZF) {
2081 /* Success: write back to memory. */ 2215 /* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2843 ctxt->src.type = OP_IMM; 2977 ctxt->src.type = OP_IMM;
2844 ctxt->src.val = 0; 2978 ctxt->src.val = 0;
2845 ctxt->src.bytes = 1; 2979 ctxt->src.bytes = 1;
2846 emulate_2op_SrcV(ctxt, "or"); 2980 fastop(ctxt, em_or);
2847 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2981 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2848 if (cf) 2982 if (cf)
2849 ctxt->eflags |= X86_EFLAGS_CF; 2983 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2852 return X86EMUL_CONTINUE; 2986 return X86EMUL_CONTINUE;
2853} 2987}
2854 2988
2989static int em_aad(struct x86_emulate_ctxt *ctxt)
2990{
2991 u8 al = ctxt->dst.val & 0xff;
2992 u8 ah = (ctxt->dst.val >> 8) & 0xff;
2993
2994 al = (al + (ah * ctxt->src.val)) & 0xff;
2995
2996 ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
2997
2998 /* Set PF, ZF, SF */
2999 ctxt->src.type = OP_IMM;
3000 ctxt->src.val = 0;
3001 ctxt->src.bytes = 1;
3002 fastop(ctxt, em_or);
3003
3004 return X86EMUL_CONTINUE;
3005}
3006
2855static int em_call(struct x86_emulate_ctxt *ctxt) 3007static int em_call(struct x86_emulate_ctxt *ctxt)
2856{ 3008{
2857 long rel = ctxt->src.val; 3009 long rel = ctxt->src.val;
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2900 return X86EMUL_CONTINUE; 3052 return X86EMUL_CONTINUE;
2901} 3053}
2902 3054
2903static int em_add(struct x86_emulate_ctxt *ctxt)
2904{
2905 emulate_2op_SrcV(ctxt, "add");
2906 return X86EMUL_CONTINUE;
2907}
2908
2909static int em_or(struct x86_emulate_ctxt *ctxt)
2910{
2911 emulate_2op_SrcV(ctxt, "or");
2912 return X86EMUL_CONTINUE;
2913}
2914
2915static int em_adc(struct x86_emulate_ctxt *ctxt)
2916{
2917 emulate_2op_SrcV(ctxt, "adc");
2918 return X86EMUL_CONTINUE;
2919}
2920
2921static int em_sbb(struct x86_emulate_ctxt *ctxt)
2922{
2923 emulate_2op_SrcV(ctxt, "sbb");
2924 return X86EMUL_CONTINUE;
2925}
2926
2927static int em_and(struct x86_emulate_ctxt *ctxt)
2928{
2929 emulate_2op_SrcV(ctxt, "and");
2930 return X86EMUL_CONTINUE;
2931}
2932
2933static int em_sub(struct x86_emulate_ctxt *ctxt)
2934{
2935 emulate_2op_SrcV(ctxt, "sub");
2936 return X86EMUL_CONTINUE;
2937}
2938
2939static int em_xor(struct x86_emulate_ctxt *ctxt)
2940{
2941 emulate_2op_SrcV(ctxt, "xor");
2942 return X86EMUL_CONTINUE;
2943}
2944
2945static int em_cmp(struct x86_emulate_ctxt *ctxt)
2946{
2947 emulate_2op_SrcV(ctxt, "cmp");
2948 /* Disable writeback. */
2949 ctxt->dst.type = OP_NONE;
2950 return X86EMUL_CONTINUE;
2951}
2952
2953static int em_test(struct x86_emulate_ctxt *ctxt)
2954{
2955 emulate_2op_SrcV(ctxt, "test");
2956 /* Disable writeback. */
2957 ctxt->dst.type = OP_NONE;
2958 return X86EMUL_CONTINUE;
2959}
2960
2961static int em_xchg(struct x86_emulate_ctxt *ctxt) 3055static int em_xchg(struct x86_emulate_ctxt *ctxt)
2962{ 3056{
2963 /* Write back the register source. */ 3057 /* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
2970 return X86EMUL_CONTINUE; 3064 return X86EMUL_CONTINUE;
2971} 3065}
2972 3066
2973static int em_imul(struct x86_emulate_ctxt *ctxt)
2974{
2975 emulate_2op_SrcV_nobyte(ctxt, "imul");
2976 return X86EMUL_CONTINUE;
2977}
2978
2979static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 3067static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2980{ 3068{
2981 ctxt->dst.val = ctxt->src2.val; 3069 ctxt->dst.val = ctxt->src2.val;
2982 return em_imul(ctxt); 3070 return fastop(ctxt, em_imul);
2983} 3071}
2984 3072
2985static int em_cwd(struct x86_emulate_ctxt *ctxt) 3073static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
3300 return X86EMUL_CONTINUE; 3388 return X86EMUL_CONTINUE;
3301} 3389}
3302 3390
3303static int em_bt(struct x86_emulate_ctxt *ctxt)
3304{
3305 /* Disable writeback. */
3306 ctxt->dst.type = OP_NONE;
3307 /* only subword offset */
3308 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
3309
3310 emulate_2op_SrcV_nobyte(ctxt, "bt");
3311 return X86EMUL_CONTINUE;
3312}
3313
3314static int em_bts(struct x86_emulate_ctxt *ctxt)
3315{
3316 emulate_2op_SrcV_nobyte(ctxt, "bts");
3317 return X86EMUL_CONTINUE;
3318}
3319
3320static int em_btr(struct x86_emulate_ctxt *ctxt)
3321{
3322 emulate_2op_SrcV_nobyte(ctxt, "btr");
3323 return X86EMUL_CONTINUE;
3324}
3325
3326static int em_btc(struct x86_emulate_ctxt *ctxt)
3327{
3328 emulate_2op_SrcV_nobyte(ctxt, "btc");
3329 return X86EMUL_CONTINUE;
3330}
3331
3332static int em_bsf(struct x86_emulate_ctxt *ctxt)
3333{
3334 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3335 return X86EMUL_CONTINUE;
3336}
3337
3338static int em_bsr(struct x86_emulate_ctxt *ctxt)
3339{
3340 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3341 return X86EMUL_CONTINUE;
3342}
3343
3344static int em_cpuid(struct x86_emulate_ctxt *ctxt) 3391static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3345{ 3392{
3346 u32 eax, ebx, ecx, edx; 3393 u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3572#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3619#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3573#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3620#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3574#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3621#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3622#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
3575#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3623#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3624#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
3576#define II(_f, _e, _i) \ 3625#define II(_f, _e, _i) \
3577 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3626 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3578#define IIP(_f, _e, _i, _p) \ 3627#define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3583#define D2bv(_f) D((_f) | ByteOp), D(_f) 3632#define D2bv(_f) D((_f) | ByteOp), D(_f)
3584#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) 3633#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3585#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3634#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3635#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
3586#define I2bvIP(_f, _e, _i, _p) \ 3636#define I2bvIP(_f, _e, _i, _p) \
3587 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) 3637 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
3588 3638
3589#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ 3639#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3590 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3640 F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3591 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3641 F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3592 3642
3593static const struct opcode group7_rm1[] = { 3643static const struct opcode group7_rm1[] = {
3594 DI(SrcNone | Priv, monitor), 3644 DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
3614}; 3664};
3615 3665
3616static const struct opcode group1[] = { 3666static const struct opcode group1[] = {
3617 I(Lock, em_add), 3667 F(Lock, em_add),
3618 I(Lock | PageTable, em_or), 3668 F(Lock | PageTable, em_or),
3619 I(Lock, em_adc), 3669 F(Lock, em_adc),
3620 I(Lock, em_sbb), 3670 F(Lock, em_sbb),
3621 I(Lock | PageTable, em_and), 3671 F(Lock | PageTable, em_and),
3622 I(Lock, em_sub), 3672 F(Lock, em_sub),
3623 I(Lock, em_xor), 3673 F(Lock, em_xor),
3624 I(0, em_cmp), 3674 F(NoWrite, em_cmp),
3625}; 3675};
3626 3676
3627static const struct opcode group1A[] = { 3677static const struct opcode group1A[] = {
3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3678 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3629}; 3679};
3630 3680
3681static const struct opcode group2[] = {
3682 F(DstMem | ModRM, em_rol),
3683 F(DstMem | ModRM, em_ror),
3684 F(DstMem | ModRM, em_rcl),
3685 F(DstMem | ModRM, em_rcr),
3686 F(DstMem | ModRM, em_shl),
3687 F(DstMem | ModRM, em_shr),
3688 F(DstMem | ModRM, em_shl),
3689 F(DstMem | ModRM, em_sar),
3690};
3691
3631static const struct opcode group3[] = { 3692static const struct opcode group3[] = {
3632 I(DstMem | SrcImm, em_test), 3693 F(DstMem | SrcImm | NoWrite, em_test),
3633 I(DstMem | SrcImm, em_test), 3694 F(DstMem | SrcImm | NoWrite, em_test),
3634 I(DstMem | SrcNone | Lock, em_not), 3695 F(DstMem | SrcNone | Lock, em_not),
3635 I(DstMem | SrcNone | Lock, em_neg), 3696 F(DstMem | SrcNone | Lock, em_neg),
3636 I(SrcMem, em_mul_ex), 3697 I(SrcMem, em_mul_ex),
3637 I(SrcMem, em_imul_ex), 3698 I(SrcMem, em_imul_ex),
3638 I(SrcMem, em_div_ex), 3699 I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
3640}; 3701};
3641 3702
3642static const struct opcode group4[] = { 3703static const struct opcode group4[] = {
3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3704 F(ByteOp | DstMem | SrcNone | Lock, em_inc),
3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3705 F(ByteOp | DstMem | SrcNone | Lock, em_dec),
3645 N, N, N, N, N, N, 3706 N, N, N, N, N, N,
3646}; 3707};
3647 3708
3648static const struct opcode group5[] = { 3709static const struct opcode group5[] = {
3649 I(DstMem | SrcNone | Lock, em_grp45), 3710 F(DstMem | SrcNone | Lock, em_inc),
3650 I(DstMem | SrcNone | Lock, em_grp45), 3711 F(DstMem | SrcNone | Lock, em_dec),
3651 I(SrcMem | Stack, em_grp45), 3712 I(SrcMem | Stack, em_grp45),
3652 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3653 I(SrcMem | Stack, em_grp45), 3714 I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
3682 3743
3683static const struct opcode group8[] = { 3744static const struct opcode group8[] = {
3684 N, N, N, N, 3745 N, N, N, N,
3685 I(DstMem | SrcImmByte, em_bt), 3746 F(DstMem | SrcImmByte | NoWrite, em_bt),
3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3747 F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3687 I(DstMem | SrcImmByte | Lock, em_btr), 3748 F(DstMem | SrcImmByte | Lock, em_btr),
3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3749 F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3689}; 3750};
3690 3751
3691static const struct group_dual group9 = { { 3752static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
3707 I(0, em_mov), N, N, N, 3768 I(0, em_mov), N, N, N,
3708}; 3769};
3709 3770
3771static const struct escape escape_d9 = { {
3772 N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
3773}, {
3774 /* 0xC0 - 0xC7 */
3775 N, N, N, N, N, N, N, N,
3776 /* 0xC8 - 0xCF */
3777 N, N, N, N, N, N, N, N,
3778 /* 0xD0 - 0xC7 */
3779 N, N, N, N, N, N, N, N,
3780 /* 0xD8 - 0xDF */
3781 N, N, N, N, N, N, N, N,
3782 /* 0xE0 - 0xE7 */
3783 N, N, N, N, N, N, N, N,
3784 /* 0xE8 - 0xEF */
3785 N, N, N, N, N, N, N, N,
3786 /* 0xF0 - 0xF7 */
3787 N, N, N, N, N, N, N, N,
3788 /* 0xF8 - 0xFF */
3789 N, N, N, N, N, N, N, N,
3790} };
3791
3792static const struct escape escape_db = { {
3793 N, N, N, N, N, N, N, N,
3794}, {
3795 /* 0xC0 - 0xC7 */
3796 N, N, N, N, N, N, N, N,
3797 /* 0xC8 - 0xCF */
3798 N, N, N, N, N, N, N, N,
3799 /* 0xD0 - 0xC7 */
3800 N, N, N, N, N, N, N, N,
3801 /* 0xD8 - 0xDF */
3802 N, N, N, N, N, N, N, N,
3803 /* 0xE0 - 0xE7 */
3804 N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
3805 /* 0xE8 - 0xEF */
3806 N, N, N, N, N, N, N, N,
3807 /* 0xF0 - 0xF7 */
3808 N, N, N, N, N, N, N, N,
3809 /* 0xF8 - 0xFF */
3810 N, N, N, N, N, N, N, N,
3811} };
3812
3813static const struct escape escape_dd = { {
3814 N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
3815}, {
3816 /* 0xC0 - 0xC7 */
3817 N, N, N, N, N, N, N, N,
3818 /* 0xC8 - 0xCF */
3819 N, N, N, N, N, N, N, N,
3820 /* 0xD0 - 0xC7 */
3821 N, N, N, N, N, N, N, N,
3822 /* 0xD8 - 0xDF */
3823 N, N, N, N, N, N, N, N,
3824 /* 0xE0 - 0xE7 */
3825 N, N, N, N, N, N, N, N,
3826 /* 0xE8 - 0xEF */
3827 N, N, N, N, N, N, N, N,
3828 /* 0xF0 - 0xF7 */
3829 N, N, N, N, N, N, N, N,
3830 /* 0xF8 - 0xFF */
3831 N, N, N, N, N, N, N, N,
3832} };
3833
3710static const struct opcode opcode_table[256] = { 3834static const struct opcode opcode_table[256] = {
3711 /* 0x00 - 0x07 */ 3835 /* 0x00 - 0x07 */
3712 I6ALU(Lock, em_add), 3836 F6ALU(Lock, em_add),
3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3837 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3714 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), 3838 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3715 /* 0x08 - 0x0F */ 3839 /* 0x08 - 0x0F */
3716 I6ALU(Lock | PageTable, em_or), 3840 F6ALU(Lock | PageTable, em_or),
3717 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), 3841 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3718 N, 3842 N,
3719 /* 0x10 - 0x17 */ 3843 /* 0x10 - 0x17 */
3720 I6ALU(Lock, em_adc), 3844 F6ALU(Lock, em_adc),
3721 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), 3845 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
3722 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), 3846 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
3723 /* 0x18 - 0x1F */ 3847 /* 0x18 - 0x1F */
3724 I6ALU(Lock, em_sbb), 3848 F6ALU(Lock, em_sbb),
3725 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), 3849 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3726 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), 3850 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3727 /* 0x20 - 0x27 */ 3851 /* 0x20 - 0x27 */
3728 I6ALU(Lock | PageTable, em_and), N, N, 3852 F6ALU(Lock | PageTable, em_and), N, N,
3729 /* 0x28 - 0x2F */ 3853 /* 0x28 - 0x2F */
3730 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), 3854 F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3731 /* 0x30 - 0x37 */ 3855 /* 0x30 - 0x37 */
3732 I6ALU(Lock, em_xor), N, N, 3856 F6ALU(Lock, em_xor), N, N,
3733 /* 0x38 - 0x3F */ 3857 /* 0x38 - 0x3F */
3734 I6ALU(0, em_cmp), N, N, 3858 F6ALU(NoWrite, em_cmp), N, N,
3735 /* 0x40 - 0x4F */ 3859 /* 0x40 - 0x4F */
3736 X16(D(DstReg)), 3860 X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
3737 /* 0x50 - 0x57 */ 3861 /* 0x50 - 0x57 */
3738 X8(I(SrcReg | Stack, em_push)), 3862 X8(I(SrcReg | Stack, em_push)),
3739 /* 0x58 - 0x5F */ 3863 /* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
3757 G(DstMem | SrcImm, group1), 3881 G(DstMem | SrcImm, group1),
3758 G(ByteOp | DstMem | SrcImm | No64, group1), 3882 G(ByteOp | DstMem | SrcImm | No64, group1),
3759 G(DstMem | SrcImmByte, group1), 3883 G(DstMem | SrcImmByte, group1),
3760 I2bv(DstMem | SrcReg | ModRM, em_test), 3884 F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
3761 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3885 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3762 /* 0x88 - 0x8F */ 3886 /* 0x88 - 0x8F */
3763 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), 3887 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
3777 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3901 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3778 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3902 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
3779 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3903 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3780 I2bv(SrcSI | DstDI | String, em_cmp), 3904 F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
3781 /* 0xA8 - 0xAF */ 3905 /* 0xA8 - 0xAF */
3782 I2bv(DstAcc | SrcImm, em_test), 3906 F2bv(DstAcc | SrcImm | NoWrite, em_test),
3783 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3907 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3784 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3908 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3785 I2bv(SrcAcc | DstDI | String, em_cmp), 3909 F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
3786 /* 0xB0 - 0xB7 */ 3910 /* 0xB0 - 0xB7 */
3787 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), 3911 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
3788 /* 0xB8 - 0xBF */ 3912 /* 0xB8 - 0xBF */
3789 X8(I(DstReg | SrcImm | Mov, em_mov)), 3913 X8(I(DstReg | SrcImm64 | Mov, em_mov)),
3790 /* 0xC0 - 0xC7 */ 3914 /* 0xC0 - 0xC7 */
3791 D2bv(DstMem | SrcImmByte | ModRM), 3915 G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
3792 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3916 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3793 I(ImplicitOps | Stack, em_ret), 3917 I(ImplicitOps | Stack, em_ret),
3794 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), 3918 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
3800 D(ImplicitOps), DI(SrcImmByte, intn), 3924 D(ImplicitOps), DI(SrcImmByte, intn),
3801 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3925 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3802 /* 0xD0 - 0xD7 */ 3926 /* 0xD0 - 0xD7 */
3803 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3927 G(Src2One | ByteOp, group2), G(Src2One, group2),
3804 N, N, N, N, 3928 G(Src2CL | ByteOp, group2), G(Src2CL, group2),
3929 N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
3805 /* 0xD8 - 0xDF */ 3930 /* 0xD8 - 0xDF */
3806 N, N, N, N, N, N, N, N, 3931 N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
3807 /* 0xE0 - 0xE7 */ 3932 /* 0xE0 - 0xE7 */
3808 X3(I(SrcImmByte, em_loop)), 3933 X3(I(SrcImmByte, em_loop)),
3809 I(SrcImmByte, em_jcxz), 3934 I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = {
3870 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3995 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3871 /* 0xA0 - 0xA7 */ 3996 /* 0xA0 - 0xA7 */
3872 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3997 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3873 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3998 II(ImplicitOps, em_cpuid, cpuid),
3874 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3999 F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
3875 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 4000 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
4001 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
3876 /* 0xA8 - 0xAF */ 4002 /* 0xA8 - 0xAF */
3877 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4003 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3878 DI(ImplicitOps, rsm), 4004 DI(ImplicitOps, rsm),
3879 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4005 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
3880 D(DstMem | SrcReg | Src2ImmByte | ModRM), 4006 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
3881 D(DstMem | SrcReg | Src2CL | ModRM), 4007 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
3882 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 4008 D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
3883 /* 0xB0 - 0xB7 */ 4009 /* 0xB0 - 0xB7 */
3884 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), 4010 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
3885 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), 4011 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3886 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), 4012 F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
3887 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 4013 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3888 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), 4014 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3889 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4015 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3890 /* 0xB8 - 0xBF */ 4016 /* 0xB8 - 0xBF */
3891 N, N, 4017 N, N,
3892 G(BitOp, group8), 4018 G(BitOp, group8),
3893 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 4019 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3894 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 4020 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
3895 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4021 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3896 /* 0xC0 - 0xC7 */ 4022 /* 0xC0 - 0xC7 */
3897 D2bv(DstMem | SrcReg | ModRM | Lock), 4023 D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3950 case 4: 4076 case 4:
3951 op->val = insn_fetch(s32, ctxt); 4077 op->val = insn_fetch(s32, ctxt);
3952 break; 4078 break;
4079 case 8:
4080 op->val = insn_fetch(s64, ctxt);
4081 break;
3953 } 4082 }
3954 if (!sign_extension) { 4083 if (!sign_extension) {
3955 switch (op->bytes) { 4084 switch (op->bytes) {
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4028 case OpImm: 4157 case OpImm:
4029 rc = decode_imm(ctxt, op, imm_size(ctxt), true); 4158 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
4030 break; 4159 break;
4160 case OpImm64:
4161 rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
4162 break;
4031 case OpMem8: 4163 case OpMem8:
4032 ctxt->memop.bytes = 1; 4164 ctxt->memop.bytes = 1;
4033 goto mem_common; 4165 goto mem_common;
@@ -4222,6 +4354,12 @@ done_prefixes:
4222 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; 4354 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
4223 } 4355 }
4224 break; 4356 break;
4357 case Escape:
4358 if (ctxt->modrm > 0xbf)
4359 opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
4360 else
4361 opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
4362 break;
4225 default: 4363 default:
4226 return EMULATION_FAILED; 4364 return EMULATION_FAILED;
4227 } 4365 }
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4492 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4355} 4493}
4356 4494
4495static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
4496{
4497 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
4498 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
4499 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
4500 : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
4501 : "c"(ctxt->src2.val), [fastop]"S"(fop));
4502 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
4503 return X86EMUL_CONTINUE;
4504}
4357 4505
4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4506int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4359{ 4507{
@@ -4483,6 +4631,13 @@ special_insn:
4483 } 4631 }
4484 4632
4485 if (ctxt->execute) { 4633 if (ctxt->execute) {
4634 if (ctxt->d & Fastop) {
4635 void (*fop)(struct fastop *) = (void *)ctxt->execute;
4636 rc = fastop(ctxt, fop);
4637 if (rc != X86EMUL_CONTINUE)
4638 goto done;
4639 goto writeback;
4640 }
4486 rc = ctxt->execute(ctxt); 4641 rc = ctxt->execute(ctxt);
4487 if (rc != X86EMUL_CONTINUE) 4642 if (rc != X86EMUL_CONTINUE)
4488 goto done; 4643 goto done;
@@ -4493,12 +4648,6 @@ special_insn:
4493 goto twobyte_insn; 4648 goto twobyte_insn;
4494 4649
4495 switch (ctxt->b) { 4650 switch (ctxt->b) {
4496 case 0x40 ... 0x47: /* inc r16/r32 */
4497 emulate_1op(ctxt, "inc");
4498 break;
4499 case 0x48 ... 0x4f: /* dec r16/r32 */
4500 emulate_1op(ctxt, "dec");
4501 break;
4502 case 0x63: /* movsxd */ 4651 case 0x63: /* movsxd */
4503 if (ctxt->mode != X86EMUL_MODE_PROT64) 4652 if (ctxt->mode != X86EMUL_MODE_PROT64)
4504 goto cannot_emulate; 4653 goto cannot_emulate;
@@ -4523,9 +4672,6 @@ special_insn:
4523 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; 4672 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
4524 } 4673 }
4525 break; 4674 break;
4526 case 0xc0 ... 0xc1:
4527 rc = em_grp2(ctxt);
4528 break;
4529 case 0xcc: /* int3 */ 4675 case 0xcc: /* int3 */
4530 rc = emulate_int(ctxt, 3); 4676 rc = emulate_int(ctxt, 3);
4531 break; 4677 break;
@@ -4536,13 +4682,6 @@ special_insn:
4536 if (ctxt->eflags & EFLG_OF) 4682 if (ctxt->eflags & EFLG_OF)
4537 rc = emulate_int(ctxt, 4); 4683 rc = emulate_int(ctxt, 4);
4538 break; 4684 break;
4539 case 0xd0 ... 0xd1: /* Grp2 */
4540 rc = em_grp2(ctxt);
4541 break;
4542 case 0xd2 ... 0xd3: /* Grp2 */
4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4544 rc = em_grp2(ctxt);
4545 break;
4546 case 0xe9: /* jmp rel */ 4685 case 0xe9: /* jmp rel */
4547 case 0xeb: /* jmp rel short */ 4686 case 0xeb: /* jmp rel short */
4548 jmp_rel(ctxt, ctxt->src.val); 4687 jmp_rel(ctxt, ctxt->src.val);
@@ -4661,14 +4800,6 @@ twobyte_insn:
4661 case 0x90 ... 0x9f: /* setcc r/m8 */ 4800 case 0x90 ... 0x9f: /* setcc r/m8 */
4662 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4801 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4663 break; 4802 break;
4664 case 0xa4: /* shld imm8, r, r/m */
4665 case 0xa5: /* shld cl, r, r/m */
4666 emulate_2op_cl(ctxt, "shld");
4667 break;
4668 case 0xac: /* shrd imm8, r, r/m */
4669 case 0xad: /* shrd cl, r, r/m */
4670 emulate_2op_cl(ctxt, "shrd");
4671 break;
4672 case 0xae: /* clflush */ 4803 case 0xae: /* clflush */
4673 break; 4804 break;
4674 case 0xb6 ... 0xb7: /* movzx */ 4805 case 0xb6 ... 0xb7: /* movzx */
@@ -4682,7 +4813,7 @@ twobyte_insn:
4682 (s16) ctxt->src.val; 4813 (s16) ctxt->src.val;
4683 break; 4814 break;
4684 case 0xc0 ... 0xc1: /* xadd */ 4815 case 0xc0 ... 0xc1: /* xadd */
4685 emulate_2op_SrcV(ctxt, "add"); 4816 fastop(ctxt, em_add);
4686 /* Write back the register source. */ 4817 /* Write back the register source. */
4687 ctxt->src.val = ctxt->dst.orig_val; 4818 ctxt->src.val = ctxt->dst.orig_val;
4688 write_register_operand(&ctxt->src); 4819 write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11300d2fa714..c1d30b2fc9bb 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm)
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->period);
126 125
127 return elapsed; 126 return elapsed;
128} 127}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 848206df0967..cc31f7c06d3d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
241 int irq, irq2, intno; 241 int irq, irq2, intno;
242 struct kvm_pic *s = pic_irqchip(kvm); 242 struct kvm_pic *s = pic_irqchip(kvm);
243 243
244 s->output = 0;
245
244 pic_lock(s); 246 pic_lock(s);
245 irq = pic_get_irq(&s->pics[0]); 247 irq = pic_get_irq(&s->pics[0]);
246 if (irq >= 0) { 248 if (irq >= 0) {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1618bd..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
38EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
39 39
40/* 40/*
41 * check if there is pending interrupt from
42 * non-APIC source without intack.
43 */
44static int kvm_cpu_has_extint(struct kvm_vcpu *v)
45{
46 if (kvm_apic_accept_pic_intr(v))
47 return pic_irqchip(v->kvm)->output; /* PIC */
48 else
49 return 0;
50}
51
52/*
53 * check if there is injectable interrupt:
54 * when virtual interrupt delivery enabled,
55 * interrupt from apic will handled by hardware,
56 * we don't need to check it here.
57 */
58int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
59{
60 if (!irqchip_in_kernel(v->kvm))
61 return v->arch.interrupt.pending;
62
63 if (kvm_cpu_has_extint(v))
64 return 1;
65
66 if (kvm_apic_vid_enabled(v->kvm))
67 return 0;
68
69 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
70}
71
72/*
41 * check if there is pending interrupt without 73 * check if there is pending interrupt without
42 * intack. 74 * intack.
43 */ 75 */
44int kvm_cpu_has_interrupt(struct kvm_vcpu *v) 76int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
45{ 77{
46 struct kvm_pic *s;
47
48 if (!irqchip_in_kernel(v->kvm)) 78 if (!irqchip_in_kernel(v->kvm))
49 return v->arch.interrupt.pending; 79 return v->arch.interrupt.pending;
50 80
51 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ 81 if (kvm_cpu_has_extint(v))
52 if (kvm_apic_accept_pic_intr(v)) { 82 return 1;
53 s = pic_irqchip(v->kvm); /* PIC */ 83
54 return s->output; 84 return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
55 } else
56 return 0;
57 }
58 return 1;
59} 85}
60EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); 86EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
61 87
62/* 88/*
89 * Read pending interrupt(from non-APIC source)
90 * vector and intack.
91 */
92static int kvm_cpu_get_extint(struct kvm_vcpu *v)
93{
94 if (kvm_cpu_has_extint(v))
95 return kvm_pic_read_irq(v->kvm); /* PIC */
96 return -1;
97}
98
99/*
63 * Read pending interrupt vector and intack. 100 * Read pending interrupt vector and intack.
64 */ 101 */
65int kvm_cpu_get_interrupt(struct kvm_vcpu *v) 102int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
66{ 103{
67 struct kvm_pic *s;
68 int vector; 104 int vector;
69 105
70 if (!irqchip_in_kernel(v->kvm)) 106 if (!irqchip_in_kernel(v->kvm))
71 return v->arch.interrupt.nr; 107 return v->arch.interrupt.nr;
72 108
73 vector = kvm_get_apic_interrupt(v); /* APIC */ 109 vector = kvm_cpu_get_extint(v);
74 if (vector == -1) { 110
75 if (kvm_apic_accept_pic_intr(v)) { 111 if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
76 s = pic_irqchip(v->kvm); 112 return vector; /* PIC */
77 s->output = 0; /* PIC */ 113
78 vector = kvm_pic_read_irq(v->kvm); 114 return kvm_get_apic_interrupt(v); /* APIC */
79 }
80 }
81 return vector;
82} 115}
83EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
84 116
85void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 117void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
86{ 118{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f107..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
142 142
143static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144{
145 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
146}
147
148static inline int kvm_apic_id(struct kvm_lapic *apic) 143static inline int kvm_apic_id(struct kvm_lapic *apic)
149{ 144{
150 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 145 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
151} 146}
152 147
153static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) 148void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
149 struct kvm_lapic_irq *irq,
150 u64 *eoi_exit_bitmap)
154{ 151{
155 u16 cid; 152 struct kvm_lapic **dst;
156 ldr >>= 32 - map->ldr_bits; 153 struct kvm_apic_map *map;
157 cid = (ldr >> map->cid_shift) & map->cid_mask; 154 unsigned long bitmap = 1;
155 int i;
158 156
159 BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); 157 rcu_read_lock();
158 map = rcu_dereference(vcpu->kvm->arch.apic_map);
160 159
161 return cid; 160 if (unlikely(!map)) {
162} 161 __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
162 goto out;
163 }
163 164
164static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) 165 if (irq->dest_mode == 0) { /* physical mode */
165{ 166 if (irq->delivery_mode == APIC_DM_LOWEST ||
166 ldr >>= (32 - map->ldr_bits); 167 irq->dest_id == 0xff) {
167 return ldr & map->lid_mask; 168 __set_bit(irq->vector,
169 (unsigned long *)eoi_exit_bitmap);
170 goto out;
171 }
172 dst = &map->phys_map[irq->dest_id & 0xff];
173 } else {
174 u32 mda = irq->dest_id << (32 - map->ldr_bits);
175
176 dst = map->logical_map[apic_cluster_id(map, mda)];
177
178 bitmap = apic_logical_id(map, mda);
179 }
180
181 for_each_set_bit(i, &bitmap, 16) {
182 if (!dst[i])
183 continue;
184 if (dst[i]->vcpu == vcpu) {
185 __set_bit(irq->vector,
186 (unsigned long *)eoi_exit_bitmap);
187 break;
188 }
189 }
190
191out:
192 rcu_read_unlock();
168} 193}
169 194
170static void recalculate_apic_map(struct kvm *kvm) 195static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:
230 255
231 if (old) 256 if (old)
232 kfree_rcu(old, rcu); 257 kfree_rcu(old, rcu);
258
259 kvm_ioapic_make_eoibitmap_request(kvm);
233} 260}
234 261
235static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) 262static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
345{ 372{
346 int result; 373 int result;
347 374
375 /*
376 * Note that irr_pending is just a hint. It will be always
377 * true with virtual interrupt delivery enabled.
378 */
348 if (!apic->irr_pending) 379 if (!apic->irr_pending)
349 return -1; 380 return -1;
350 381
@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
461static inline int apic_find_highest_isr(struct kvm_lapic *apic) 492static inline int apic_find_highest_isr(struct kvm_lapic *apic)
462{ 493{
463 int result; 494 int result;
495
496 /* Note that isr_count is always 1 with vid enabled */
464 if (!apic->isr_count) 497 if (!apic->isr_count)
465 return -1; 498 return -1;
466 if (likely(apic->highest_isr_cache != -1)) 499 if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
740 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 773 return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
741} 774}
742 775
776static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
777{
778 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
779 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
780 int trigger_mode;
781 if (apic_test_vector(vector, apic->regs + APIC_TMR))
782 trigger_mode = IOAPIC_LEVEL_TRIG;
783 else
784 trigger_mode = IOAPIC_EDGE_TRIG;
785 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
786 }
787}
788
743static int apic_set_eoi(struct kvm_lapic *apic) 789static int apic_set_eoi(struct kvm_lapic *apic)
744{ 790{
745 int vector = apic_find_highest_isr(apic); 791 int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
756 apic_clear_isr(vector, apic); 802 apic_clear_isr(vector, apic);
757 apic_update_ppr(apic); 803 apic_update_ppr(apic);
758 804
759 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 805 kvm_ioapic_send_eoi(apic, vector);
760 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
761 int trigger_mode;
762 if (apic_test_vector(vector, apic->regs + APIC_TMR))
763 trigger_mode = IOAPIC_LEVEL_TRIG;
764 else
765 trigger_mode = IOAPIC_EDGE_TRIG;
766 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
767 }
768 kvm_make_request(KVM_REQ_EVENT, apic->vcpu); 806 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
769 return vector; 807 return vector;
770} 808}
771 809
810/*
811 * this interface assumes a trap-like exit, which has already finished
812 * desired side effect including vISR and vPPR update.
813 */
814void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
815{
816 struct kvm_lapic *apic = vcpu->arch.apic;
817
818 trace_kvm_eoi(apic, vector);
819
820 kvm_ioapic_send_eoi(apic, vector);
821 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
822}
823EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
824
772static void apic_send_ipi(struct kvm_lapic *apic) 825static void apic_send_ipi(struct kvm_lapic *apic)
773{ 826{
774 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); 827 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
1212} 1265}
1213EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1266EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
1214 1267
1268/* emulate APIC access in a trap manner */
1269void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
1270{
1271 u32 val = 0;
1272
1273 /* hw has done the conditional check and inst decode */
1274 offset &= 0xff0;
1275
1276 apic_reg_read(vcpu->arch.apic, offset, 4, &val);
1277
1278 /* TODO: optimize to just emulate side effect w/o one more write */
1279 apic_reg_write(vcpu->arch.apic, offset, val);
1280}
1281EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
1282
1215void kvm_free_lapic(struct kvm_vcpu *vcpu) 1283void kvm_free_lapic(struct kvm_vcpu *vcpu)
1216{ 1284{
1217 struct kvm_lapic *apic = vcpu->arch.apic; 1285 struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
1288 1356
1289void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 1357void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1290{ 1358{
1359 u64 old_value = vcpu->arch.apic_base;
1291 struct kvm_lapic *apic = vcpu->arch.apic; 1360 struct kvm_lapic *apic = vcpu->arch.apic;
1292 1361
1293 if (!apic) { 1362 if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1309 value &= ~MSR_IA32_APICBASE_BSP; 1378 value &= ~MSR_IA32_APICBASE_BSP;
1310 1379
1311 vcpu->arch.apic_base = value; 1380 vcpu->arch.apic_base = value;
1312 if (apic_x2apic_mode(apic)) { 1381 if ((old_value ^ value) & X2APIC_ENABLE) {
1313 u32 id = kvm_apic_id(apic); 1382 if (value & X2APIC_ENABLE) {
1314 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); 1383 u32 id = kvm_apic_id(apic);
1315 kvm_apic_set_ldr(apic, ldr); 1384 u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
1385 kvm_apic_set_ldr(apic, ldr);
1386 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
1387 } else
1388 kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
1316 } 1389 }
1390
1317 apic->base_address = apic->vcpu->arch.apic_base & 1391 apic->base_address = apic->vcpu->arch.apic_base &
1318 MSR_IA32_APICBASE_BASE; 1392 MSR_IA32_APICBASE_BASE;
1319 1393
@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1359 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 1433 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
1360 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 1434 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
1361 } 1435 }
1362 apic->irr_pending = false; 1436 apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
1363 apic->isr_count = 0; 1437 apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
1364 apic->highest_isr_cache = -1; 1438 apic->highest_isr_cache = -1;
1365 update_divide_count(apic); 1439 update_divide_count(apic);
1366 atomic_set(&apic->lapic_timer.pending, 0); 1440 atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1575 update_divide_count(apic); 1649 update_divide_count(apic);
1576 start_apic_timer(apic); 1650 start_apic_timer(apic);
1577 apic->irr_pending = true; 1651 apic->irr_pending = true;
1578 apic->isr_count = count_vectors(apic->regs + APIC_ISR); 1652 apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
1653 1 : count_vectors(apic->regs + APIC_ISR);
1579 apic->highest_isr_cache = -1; 1654 apic->highest_isr_cache = -1;
1655 kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
1580 kvm_make_request(KVM_REQ_EVENT, vcpu); 1656 kvm_make_request(KVM_REQ_EVENT, vcpu);
1581} 1657}
1582 1658
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571f..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
65void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); 65void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
66 66
67void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
68void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
69
67void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 70void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
68void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 71void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
69void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 72void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); 127 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
125} 128}
126 129
130static inline int apic_x2apic_mode(struct kvm_lapic *apic)
131{
132 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
133}
134
135static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
136{
137 return kvm_x86_ops->vm_has_apicv(kvm);
138}
139
140static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
141{
142 u16 cid;
143 ldr >>= 32 - map->ldr_bits;
144 cid = (ldr >> map->cid_shift) & map->cid_mask;
145
146 BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
147
148 return cid;
149}
150
151static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
152{
153 ldr >>= (32 - map->ldr_bits);
154 return ldr & map->lid_mask;
155}
156
157void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
158 struct kvm_lapic_irq *irq,
159 u64 *eoi_bitmap);
160
127#endif 161#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f5..4ed3edbe06bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
448 448
449static bool spte_is_locklessly_modifiable(u64 spte) 449static bool spte_is_locklessly_modifiable(u64 spte)
450{ 450{
451 return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); 451 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
452 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
452} 453}
453 454
454static bool spte_has_volatile_bits(u64 spte) 455static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
831 if (host_level == PT_PAGE_TABLE_LEVEL) 832 if (host_level == PT_PAGE_TABLE_LEVEL)
832 return host_level; 833 return host_level;
833 834
834 max_level = kvm_x86_ops->get_lpage_level() < host_level ? 835 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
835 kvm_x86_ops->get_lpage_level() : host_level;
836 836
837 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 837 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
838 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 838 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
1142} 1142}
1143 1143
1144static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, 1144static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
1145 int level, bool pt_protect) 1145 bool pt_protect)
1146{ 1146{
1147 u64 *sptep; 1147 u64 *sptep;
1148 struct rmap_iterator iter; 1148 struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1180 while (mask) { 1180 while (mask) {
1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182 PT_PAGE_TABLE_LEVEL, slot); 1182 PT_PAGE_TABLE_LEVEL, slot);
1183 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1183 __rmap_write_protect(kvm, rmapp, false);
1184 1184
1185 /* clear the first set bit */ 1185 /* clear the first set bit */
1186 mask &= mask - 1; 1186 mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1199 for (i = PT_PAGE_TABLE_LEVEL; 1199 for (i = PT_PAGE_TABLE_LEVEL;
1200 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 1200 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
1201 rmapp = __gfn_to_rmap(gfn, i, slot); 1201 rmapp = __gfn_to_rmap(gfn, i, slot);
1202 write_protected |= __rmap_write_protect(kvm, rmapp, i, true); 1202 write_protected |= __rmap_write_protect(kvm, rmapp, true);
1203 } 1203 }
1204 1204
1205 return write_protected; 1205 return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1460 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1460 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1461} 1461}
1462 1462
1463/* 1463static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1464 * Remove the sp from shadow page cache, after call it,
1465 * we can not find this sp from the cache, and the shadow
1466 * page table is still valid.
1467 * It should be under the protection of mmu lock.
1468 */
1469static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1470{ 1464{
1471 ASSERT(is_empty_shadow_page(sp->spt)); 1465 ASSERT(is_empty_shadow_page(sp->spt));
1472 hlist_del(&sp->hash_link); 1466 hlist_del(&sp->hash_link);
1473 if (!sp->role.direct)
1474 free_page((unsigned long)sp->gfns);
1475}
1476
1477/*
1478 * Free the shadow page table and the sp, we can do it
1479 * out of the protection of mmu lock.
1480 */
1481static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1482{
1483 list_del(&sp->link); 1467 list_del(&sp->link);
1484 free_page((unsigned long)sp->spt); 1468 free_page((unsigned long)sp->spt);
1469 if (!sp->role.direct)
1470 free_page((unsigned long)sp->gfns);
1485 kmem_cache_free(mmu_page_header_cache, sp); 1471 kmem_cache_free(mmu_page_header_cache, sp);
1486} 1472}
1487 1473
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1522 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1508 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
1523 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1509 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1524 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1510 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1525 bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
1526 sp->parent_ptes = 0; 1511 sp->parent_ptes = 0;
1527 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1512 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1528 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 1513 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1973{ 1958{
1974 u64 spte; 1959 u64 spte;
1975 1960
1976 spte = __pa(sp->spt) 1961 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
1977 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1962 shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
1978 | PT_WRITABLE_MASK | PT_USER_MASK; 1963
1979 mmu_spte_set(sptep, spte); 1964 mmu_spte_set(sptep, spte);
1980} 1965}
1981 1966
@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2126 do { 2111 do {
2127 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 2112 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
2128 WARN_ON(!sp->role.invalid || sp->root_count); 2113 WARN_ON(!sp->role.invalid || sp->root_count);
2129 kvm_mmu_isolate_page(sp);
2130 kvm_mmu_free_page(sp); 2114 kvm_mmu_free_page(sp);
2131 } while (!list_empty(invalid_list)); 2115 } while (!list_empty(invalid_list));
2132} 2116}
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2144 * change the value 2128 * change the value
2145 */ 2129 */
2146 2130
2131 spin_lock(&kvm->mmu_lock);
2132
2147 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2133 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2148 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && 2134 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
2149 !list_empty(&kvm->arch.active_mmu_pages)) { 2135 !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
2158 } 2144 }
2159 2145
2160 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2146 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2147
2148 spin_unlock(&kvm->mmu_lock);
2161} 2149}
2162 2150
2163int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2151int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2183} 2171}
2184EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2172EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2185 2173
2186static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
2187{
2188 int slot = memslot_id(kvm, gfn);
2189 struct kvm_mmu_page *sp = page_header(__pa(pte));
2190
2191 __set_bit(slot, sp->slot_bitmap);
2192}
2193
2194/* 2174/*
2195 * The function is based on mtrr_type_lookup() in 2175 * The function is based on mtrr_type_lookup() in
2196 * arch/x86/kernel/cpu/mtrr/generic.c 2176 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2332 if (s->role.level != PT_PAGE_TABLE_LEVEL) 2312 if (s->role.level != PT_PAGE_TABLE_LEVEL)
2333 return 1; 2313 return 1;
2334 2314
2335 if (!need_unsync && !s->unsync) { 2315 if (!s->unsync)
2336 need_unsync = true; 2316 need_unsync = true;
2337 }
2338 } 2317 }
2339 if (need_unsync) 2318 if (need_unsync)
2340 kvm_unsync_pages(vcpu, gfn); 2319 kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2342} 2321}
2343 2322
2344static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2323static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2345 unsigned pte_access, int user_fault, 2324 unsigned pte_access, int level,
2346 int write_fault, int level,
2347 gfn_t gfn, pfn_t pfn, bool speculative, 2325 gfn_t gfn, pfn_t pfn, bool speculative,
2348 bool can_unsync, bool host_writable) 2326 bool can_unsync, bool host_writable)
2349{ 2327{
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2378 2356
2379 spte |= (u64)pfn << PAGE_SHIFT; 2357 spte |= (u64)pfn << PAGE_SHIFT;
2380 2358
2381 if ((pte_access & ACC_WRITE_MASK) 2359 if (pte_access & ACC_WRITE_MASK) {
2382 || (!vcpu->arch.mmu.direct_map && write_fault
2383 && !is_write_protection(vcpu) && !user_fault)) {
2384 2360
2385 /* 2361 /*
2386 * There are two cases: 2362 * Other vcpu creates new sp in the window between
2387 * - the one is other vcpu creates new sp in the window 2363 * mapping_level() and acquiring mmu-lock. We can
2388 * between mapping_level() and acquiring mmu-lock. 2364 * allow guest to retry the access, the mapping can
2389 * - the another case is the new sp is created by itself 2365 * be fixed if guest refault.
2390 * (page-fault path) when guest uses the target gfn as
2391 * its page table.
2392 * Both of these cases can be fixed by allowing guest to
2393 * retry the access, it will refault, then we can establish
2394 * the mapping by using small page.
2395 */ 2366 */
2396 if (level > PT_PAGE_TABLE_LEVEL && 2367 if (level > PT_PAGE_TABLE_LEVEL &&
2397 has_wrprotected_page(vcpu->kvm, gfn, level)) 2368 has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2399 2370
2400 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2371 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
2401 2372
2402 if (!vcpu->arch.mmu.direct_map
2403 && !(pte_access & ACC_WRITE_MASK)) {
2404 spte &= ~PT_USER_MASK;
2405 /*
2406 * If we converted a user page to a kernel page,
2407 * so that the kernel can write to it when cr0.wp=0,
2408 * then we should prevent the kernel from executing it
2409 * if SMEP is enabled.
2410 */
2411 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2412 spte |= PT64_NX_MASK;
2413 }
2414
2415 /* 2373 /*
2416 * Optimization: for pte sync, if spte was writable the hash 2374 * Optimization: for pte sync, if spte was writable the hash
2417 * lookup is unnecessary (and expensive). Write protection 2375 * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
2441} 2399}
2442 2400
2443static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2401static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2444 unsigned pt_access, unsigned pte_access, 2402 unsigned pte_access, int write_fault, int *emulate,
2445 int user_fault, int write_fault, 2403 int level, gfn_t gfn, pfn_t pfn, bool speculative,
2446 int *emulate, int level, gfn_t gfn,
2447 pfn_t pfn, bool speculative,
2448 bool host_writable) 2404 bool host_writable)
2449{ 2405{
2450 int was_rmapped = 0; 2406 int was_rmapped = 0;
2451 int rmap_count; 2407 int rmap_count;
2452 2408
2453 pgprintk("%s: spte %llx access %x write_fault %d" 2409 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
2454 " user_fault %d gfn %llx\n", 2410 *sptep, write_fault, gfn);
2455 __func__, *sptep, pt_access,
2456 write_fault, user_fault, gfn);
2457 2411
2458 if (is_rmap_spte(*sptep)) { 2412 if (is_rmap_spte(*sptep)) {
2459 /* 2413 /*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2477 was_rmapped = 1; 2431 was_rmapped = 1;
2478 } 2432 }
2479 2433
2480 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2434 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
2481 level, gfn, pfn, speculative, true, 2435 true, host_writable)) {
2482 host_writable)) {
2483 if (write_fault) 2436 if (write_fault)
2484 *emulate = 1; 2437 *emulate = 1;
2485 kvm_mmu_flush_tlb(vcpu); 2438 kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2497 ++vcpu->kvm->stat.lpages; 2450 ++vcpu->kvm->stat.lpages;
2498 2451
2499 if (is_shadow_present_pte(*sptep)) { 2452 if (is_shadow_present_pte(*sptep)) {
2500 page_header_update_slot(vcpu->kvm, sptep, gfn);
2501 if (!was_rmapped) { 2453 if (!was_rmapped) {
2502 rmap_count = rmap_add(vcpu, sptep, gfn); 2454 rmap_count = rmap_add(vcpu, sptep, gfn);
2503 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2455 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2571 return -1; 2523 return -1;
2572 2524
2573 for (i = 0; i < ret; i++, gfn++, start++) 2525 for (i = 0; i < ret; i++, gfn++, start++)
2574 mmu_set_spte(vcpu, start, ACC_ALL, 2526 mmu_set_spte(vcpu, start, access, 0, NULL,
2575 access, 0, 0, NULL, 2527 sp->role.level, gfn, page_to_pfn(pages[i]),
2576 sp->role.level, gfn, 2528 true, true);
2577 page_to_pfn(pages[i]), true, true);
2578 2529
2579 return 0; 2530 return 0;
2580} 2531}
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2633 2584
2634 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2585 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2635 if (iterator.level == level) { 2586 if (iterator.level == level) {
2636 unsigned pte_access = ACC_ALL; 2587 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
2637 2588 write, &emulate, level, gfn, pfn,
2638 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2589 prefault, map_writable);
2639 0, write, &emulate,
2640 level, gfn, pfn, prefault, map_writable);
2641 direct_pte_prefetch(vcpu, iterator.sptep); 2590 direct_pte_prefetch(vcpu, iterator.sptep);
2642 ++vcpu->stat.pf_fixed; 2591 ++vcpu->stat.pf_fixed;
2643 break; 2592 break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2652 iterator.level - 1, 2601 iterator.level - 1,
2653 1, ACC_ALL, iterator.sptep); 2602 1, ACC_ALL, iterator.sptep);
2654 2603
2655 mmu_spte_set(iterator.sptep, 2604 link_shadow_page(iterator.sptep, sp);
2656 __pa(sp->spt)
2657 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2658 | shadow_user_mask | shadow_x_mask
2659 | shadow_accessed_mask);
2660 } 2605 }
2661 } 2606 }
2662 return emulate; 2607 return emulate;
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3719 else 3664 else
3720 r = paging32_init_context(vcpu, context); 3665 r = paging32_init_context(vcpu, context);
3721 3666
3667 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
3722 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3668 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3723 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3669 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3724 vcpu->arch.mmu.base_role.smep_andnot_wp 3670 vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
3885 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3831 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
3886 *gpa &= ~(gpa_t)7; 3832 *gpa &= ~(gpa_t)7;
3887 *bytes = 8; 3833 *bytes = 8;
3888 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); 3834 r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
3889 if (r) 3835 if (r)
3890 gentry = 0; 3836 gentry = 0;
3891 new = (const u8 *)&gentry; 3837 new = (const u8 *)&gentry;
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
4039 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3985 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
4040 & mask.word) && rmap_can_add(vcpu)) 3986 & mask.word) && rmap_can_add(vcpu))
4041 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 3987 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
4042 if (!remote_flush && need_remote_flush(entry, *spte)) 3988 if (need_remote_flush(entry, *spte))
4043 remote_flush = true; 3989 remote_flush = true;
4044 ++spte; 3990 ++spte;
4045 } 3991 }
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
4198 4144
4199void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4145void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
4200{ 4146{
4201 struct kvm_mmu_page *sp; 4147 struct kvm_memory_slot *memslot;
4202 bool flush = false; 4148 gfn_t last_gfn;
4149 int i;
4203 4150
4204 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 4151 memslot = id_to_memslot(kvm->memslots, slot);
4205 int i; 4152 last_gfn = memslot->base_gfn + memslot->npages - 1;
4206 u64 *pt;
4207 4153
4208 if (!test_bit(slot, sp->slot_bitmap)) 4154 spin_lock(&kvm->mmu_lock);
4209 continue;
4210 4155
4211 pt = sp->spt; 4156 for (i = PT_PAGE_TABLE_LEVEL;
4212 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 4157 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
4213 if (!is_shadow_present_pte(pt[i]) || 4158 unsigned long *rmapp;
4214 !is_last_spte(pt[i], sp->role.level)) 4159 unsigned long last_index, index;
4215 continue;
4216 4160
4217 spte_write_protect(kvm, &pt[i], &flush, false); 4161 rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
4162 last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
4163
4164 for (index = 0; index <= last_index; ++index, ++rmapp) {
4165 if (*rmapp)
4166 __rmap_write_protect(kvm, rmapp, false);
4167
4168 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
4169 kvm_flush_remote_tlbs(kvm);
4170 cond_resched_lock(&kvm->mmu_lock);
4171 }
4218 } 4172 }
4219 } 4173 }
4174
4220 kvm_flush_remote_tlbs(kvm); 4175 kvm_flush_remote_tlbs(kvm);
4176 spin_unlock(&kvm->mmu_lock);
4221} 4177}
4222 4178
4223void kvm_mmu_zap_all(struct kvm *kvm) 4179void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index cd6e98333ba3..b8f6172f4174 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 TP_ARGS(sp) 195 TP_ARGS(sp)
196); 196);
197 197
198DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
199 TP_PROTO(struct kvm_mmu_page *sp),
200
201 TP_ARGS(sp)
202);
203
204TRACE_EVENT( 198TRACE_EVENT(
205 mark_mmio_spte, 199 mark_mmio_spte,
206 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), 200 TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 891eb6d93b8b..105dd5bd550e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
151 pt_element_t pte; 151 pt_element_t pte;
152 pt_element_t __user *uninitialized_var(ptep_user); 152 pt_element_t __user *uninitialized_var(ptep_user);
153 gfn_t table_gfn; 153 gfn_t table_gfn;
154 unsigned index, pt_access, pte_access, accessed_dirty, shift; 154 unsigned index, pt_access, pte_access, accessed_dirty;
155 gpa_t pte_gpa; 155 gpa_t pte_gpa;
156 int offset; 156 int offset;
157 const int write_fault = access & PFERR_WRITE_MASK; 157 const int write_fault = access & PFERR_WRITE_MASK;
@@ -249,16 +249,12 @@ retry_walk:
249 249
250 if (!write_fault) 250 if (!write_fault)
251 protect_clean_gpte(&pte_access, pte); 251 protect_clean_gpte(&pte_access, pte);
252 252 else
253 /* 253 /*
254 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one 254 * On a write fault, fold the dirty bit into accessed_dirty by
255 * place right. 255 * shifting it one place right.
256 * 256 */
257 * On a read fault, do nothing. 257 accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
258 */
259 shift = write_fault >> ilog2(PFERR_WRITE_MASK);
260 shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
261 accessed_dirty &= pte >> shift;
262 258
263 if (unlikely(!accessed_dirty)) { 259 if (unlikely(!accessed_dirty)) {
264 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); 260 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
330 * we call mmu_set_spte() with host_writable = true because 326 * we call mmu_set_spte() with host_writable = true because
331 * pte_prefetch_gfn_to_pfn always gets a writable pfn. 327 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
332 */ 328 */
333 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 329 mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
334 NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); 330 gfn, pfn, true, true);
335 331
336 return true; 332 return true;
337} 333}
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
405 */ 401 */
406static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 402static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
407 struct guest_walker *gw, 403 struct guest_walker *gw,
408 int user_fault, int write_fault, int hlevel, 404 int write_fault, int hlevel,
409 pfn_t pfn, bool map_writable, bool prefault) 405 pfn_t pfn, bool map_writable, bool prefault)
410{ 406{
411 struct kvm_mmu_page *sp = NULL; 407 struct kvm_mmu_page *sp = NULL;
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
413 unsigned direct_access, access = gw->pt_access; 409 unsigned direct_access, access = gw->pt_access;
414 int top_level, emulate = 0; 410 int top_level, emulate = 0;
415 411
416 if (!is_present_gpte(gw->ptes[gw->level - 1]))
417 return 0;
418
419 direct_access = gw->pte_access; 412 direct_access = gw->pte_access;
420 413
421 top_level = vcpu->arch.mmu.root_level; 414 top_level = vcpu->arch.mmu.root_level;
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
477 } 470 }
478 471
479 clear_sp_write_flooding_count(it.sptep); 472 clear_sp_write_flooding_count(it.sptep);
480 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 473 mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
481 user_fault, write_fault, &emulate, it.level, 474 it.level, gw->gfn, pfn, prefault, map_writable);
482 gw->gfn, pfn, prefault, map_writable);
483 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 475 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
484 476
485 return emulate; 477 return emulate;
@@ -491,6 +483,46 @@ out_gpte_changed:
491 return 0; 483 return 0;
492} 484}
493 485
486 /*
487 * To see whether the mapped gfn can write its page table in the current
488 * mapping.
489 *
490 * It is the helper function of FNAME(page_fault). When guest uses large page
491 * size to map the writable gfn which is used as current page table, we should
492 * force kvm to use small page size to map it because new shadow page will be
493 * created when kvm establishes shadow page table that stop kvm using large
494 * page size. Do it early can avoid unnecessary #PF and emulation.
495 *
496 * @write_fault_to_shadow_pgtable will return true if the fault gfn is
497 * currently used as its page table.
498 *
499 * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
500 * since the PDPT is always shadowed, that means, we can not use large page
501 * size to map the gfn which is used as PDPT.
502 */
503static bool
504FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
505 struct guest_walker *walker, int user_fault,
506 bool *write_fault_to_shadow_pgtable)
507{
508 int level;
509 gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
510 bool self_changed = false;
511
512 if (!(walker->pte_access & ACC_WRITE_MASK ||
513 (!is_write_protection(vcpu) && !user_fault)))
514 return false;
515
516 for (level = walker->level; level <= walker->max_level; level++) {
517 gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
518
519 self_changed |= !(gfn & mask);
520 *write_fault_to_shadow_pgtable |= !gfn;
521 }
522
523 return self_changed;
524}
525
494/* 526/*
495 * Page fault handler. There are several causes for a page fault: 527 * Page fault handler. There are several causes for a page fault:
496 * - there is no shadow pte for the guest pte 528 * - there is no shadow pte for the guest pte
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
516 int level = PT_PAGE_TABLE_LEVEL; 548 int level = PT_PAGE_TABLE_LEVEL;
517 int force_pt_level; 549 int force_pt_level;
518 unsigned long mmu_seq; 550 unsigned long mmu_seq;
519 bool map_writable; 551 bool map_writable, is_self_change_mapping;
520 552
521 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 553 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
522 554
@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
544 return 0; 576 return 0;
545 } 577 }
546 578
579 vcpu->arch.write_fault_to_shadow_pgtable = false;
580
581 is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
582 &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
583
547 if (walker.level >= PT_DIRECTORY_LEVEL) 584 if (walker.level >= PT_DIRECTORY_LEVEL)
548 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); 585 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
586 || is_self_change_mapping;
549 else 587 else
550 force_pt_level = 1; 588 force_pt_level = 1;
551 if (!force_pt_level) { 589 if (!force_pt_level) {
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
564 walker.gfn, pfn, walker.pte_access, &r)) 602 walker.gfn, pfn, walker.pte_access, &r))
565 return r; 603 return r;
566 604
605 /*
606 * Do not change pte_access if the pfn is a mmio page, otherwise
607 * we will cache the incorrect access into mmio spte.
608 */
609 if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
610 !is_write_protection(vcpu) && !user_fault &&
611 !is_noslot_pfn(pfn)) {
612 walker.pte_access |= ACC_WRITE_MASK;
613 walker.pte_access &= ~ACC_USER_MASK;
614
615 /*
616 * If we converted a user page to a kernel page,
617 * so that the kernel can write to it when cr0.wp=0,
618 * then we should prevent the kernel from executing it
619 * if SMEP is enabled.
620 */
621 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
622 walker.pte_access &= ~ACC_EXEC_MASK;
623 }
624
567 spin_lock(&vcpu->kvm->mmu_lock); 625 spin_lock(&vcpu->kvm->mmu_lock);
568 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 626 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
569 goto out_unlock; 627 goto out_unlock;
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
572 kvm_mmu_free_some_pages(vcpu); 630 kvm_mmu_free_some_pages(vcpu);
573 if (!force_pt_level) 631 if (!force_pt_level)
574 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 632 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
575 r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 633 r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
576 level, pfn, map_writable, prefault); 634 level, pfn, map_writable, prefault);
577 ++vcpu->stat.pf_fixed; 635 ++vcpu->stat.pf_fixed;
578 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 636 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
747 805
748 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; 806 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
749 807
750 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 808 set_spte(vcpu, &sp->spt[i], pte_access,
751 PT_PAGE_TABLE_LEVEL, gfn, 809 PT_PAGE_TABLE_LEVEL, gfn,
752 spte_to_pfn(sp->spt[i]), true, false, 810 spte_to_pfn(sp->spt[i]), true, false,
753 host_writable); 811 host_writable);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c156..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3571 set_cr_intercept(svm, INTERCEPT_CR8_WRITE); 3571 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3572} 3572}
3573 3573
3574static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3575{
3576 return;
3577}
3578
3579static int svm_vm_has_apicv(struct kvm *kvm)
3580{
3581 return 0;
3582}
3583
3584static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
3585{
3586 return;
3587}
3588
3589static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
3590{
3591 return;
3592}
3593
3574static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3594static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3575{ 3595{
3576 struct vcpu_svm *svm = to_svm(vcpu); 3596 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
4290 .enable_nmi_window = enable_nmi_window, 4310 .enable_nmi_window = enable_nmi_window,
4291 .enable_irq_window = enable_irq_window, 4311 .enable_irq_window = enable_irq_window,
4292 .update_cr8_intercept = update_cr8_intercept, 4312 .update_cr8_intercept = update_cr8_intercept,
4313 .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4314 .vm_has_apicv = svm_vm_has_apicv,
4315 .load_eoi_exitmap = svm_load_eoi_exitmap,
4316 .hwapic_isr_update = svm_hwapic_isr_update,
4293 4317
4294 .set_tss_addr = svm_set_tss_addr, 4318 .set_tss_addr = svm_set_tss_addr,
4295 .get_tdp_level = get_npt_level, 4319 .get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e4..6667042714cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
84static bool __read_mostly fasteoi = 1; 84static bool __read_mostly fasteoi = 1;
85module_param(fasteoi, bool, S_IRUGO); 85module_param(fasteoi, bool, S_IRUGO);
86 86
87static bool __read_mostly enable_apicv_reg_vid;
88
87/* 89/*
88 * If nested=1, nested virtualization is supported, i.e., guests may use 90 * If nested=1, nested virtualization is supported, i.e., guests may use
89 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 91 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO);
92static bool __read_mostly nested = 0; 94static bool __read_mostly nested = 0;
93module_param(nested, bool, S_IRUGO); 95module_param(nested, bool, S_IRUGO);
94 96
95#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 97#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
96 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 98#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
97#define KVM_GUEST_CR0_MASK \
98 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
99#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
100 (X86_CR0_WP | X86_CR0_NE)
101#define KVM_VM_CR0_ALWAYS_ON \ 99#define KVM_VM_CR0_ALWAYS_ON \
102 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 100 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
103#define KVM_CR4_GUEST_OWNED_BITS \ 101#define KVM_CR4_GUEST_OWNED_BITS \
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
624 struct kvm_segment *var, int seg); 622 struct kvm_segment *var, int seg);
625static void vmx_get_segment(struct kvm_vcpu *vcpu, 623static void vmx_get_segment(struct kvm_vcpu *vcpu,
626 struct kvm_segment *var, int seg); 624 struct kvm_segment *var, int seg);
625static bool guest_state_valid(struct kvm_vcpu *vcpu);
626static u32 vmx_segment_access_rights(struct kvm_segment *var);
627 627
628static DEFINE_PER_CPU(struct vmcs *, vmxarea); 628static DEFINE_PER_CPU(struct vmcs *, vmxarea);
629static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 629static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a;
638static unsigned long *vmx_io_bitmap_b; 638static unsigned long *vmx_io_bitmap_b;
639static unsigned long *vmx_msr_bitmap_legacy; 639static unsigned long *vmx_msr_bitmap_legacy;
640static unsigned long *vmx_msr_bitmap_longmode; 640static unsigned long *vmx_msr_bitmap_longmode;
641static unsigned long *vmx_msr_bitmap_legacy_x2apic;
642static unsigned long *vmx_msr_bitmap_longmode_x2apic;
641 643
642static bool cpu_has_load_ia32_efer; 644static bool cpu_has_load_ia32_efer;
643static bool cpu_has_load_perf_global_ctrl; 645static bool cpu_has_load_perf_global_ctrl;
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
762 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 764 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
763} 765}
764 766
767static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
768{
769 return vmcs_config.cpu_based_2nd_exec_ctrl &
770 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
771}
772
773static inline bool cpu_has_vmx_apic_register_virt(void)
774{
775 return vmcs_config.cpu_based_2nd_exec_ctrl &
776 SECONDARY_EXEC_APIC_REGISTER_VIRT;
777}
778
779static inline bool cpu_has_vmx_virtual_intr_delivery(void)
780{
781 return vmcs_config.cpu_based_2nd_exec_ctrl &
782 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
783}
784
765static inline bool cpu_has_vmx_flexpriority(void) 785static inline bool cpu_has_vmx_flexpriority(void)
766{ 786{
767 return cpu_has_vmx_tpr_shadow() && 787 return cpu_has_vmx_tpr_shadow() &&
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1694static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1714static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1695{ 1715{
1696 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1716 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1697 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1698 to_vmx(vcpu)->rflags = rflags; 1717 to_vmx(vcpu)->rflags = rflags;
1699 if (to_vmx(vcpu)->rmode.vm86_active) { 1718 if (to_vmx(vcpu)->rmode.vm86_active) {
1700 to_vmx(vcpu)->rmode.save_rflags = rflags; 1719 to_vmx(vcpu)->rmode.save_rflags = rflags;
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1820 vmx->guest_msrs[from] = tmp; 1839 vmx->guest_msrs[from] = tmp;
1821} 1840}
1822 1841
1842static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
1843{
1844 unsigned long *msr_bitmap;
1845
1846 if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
1847 if (is_long_mode(vcpu))
1848 msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
1849 else
1850 msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
1851 } else {
1852 if (is_long_mode(vcpu))
1853 msr_bitmap = vmx_msr_bitmap_longmode;
1854 else
1855 msr_bitmap = vmx_msr_bitmap_legacy;
1856 }
1857
1858 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1859}
1860
1823/* 1861/*
1824 * Set up the vmcs to automatically save and restore system 1862 * Set up the vmcs to automatically save and restore system
1825 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 1863 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
1828static void setup_msrs(struct vcpu_vmx *vmx) 1866static void setup_msrs(struct vcpu_vmx *vmx)
1829{ 1867{
1830 int save_nmsrs, index; 1868 int save_nmsrs, index;
1831 unsigned long *msr_bitmap;
1832 1869
1833 save_nmsrs = 0; 1870 save_nmsrs = 0;
1834#ifdef CONFIG_X86_64 1871#ifdef CONFIG_X86_64
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)
1860 1897
1861 vmx->save_nmsrs = save_nmsrs; 1898 vmx->save_nmsrs = save_nmsrs;
1862 1899
1863 if (cpu_has_vmx_msr_bitmap()) { 1900 if (cpu_has_vmx_msr_bitmap())
1864 if (is_long_mode(&vmx->vcpu)) 1901 vmx_set_msr_bitmap(&vmx->vcpu);
1865 msr_bitmap = vmx_msr_bitmap_longmode;
1866 else
1867 msr_bitmap = vmx_msr_bitmap_legacy;
1868
1869 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
1870 }
1871} 1902}
1872 1903
1873/* 1904/*
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2533 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2564 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2534 min2 = 0; 2565 min2 = 0;
2535 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2566 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2567 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2536 SECONDARY_EXEC_WBINVD_EXITING | 2568 SECONDARY_EXEC_WBINVD_EXITING |
2537 SECONDARY_EXEC_ENABLE_VPID | 2569 SECONDARY_EXEC_ENABLE_VPID |
2538 SECONDARY_EXEC_ENABLE_EPT | 2570 SECONDARY_EXEC_ENABLE_EPT |
2539 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2571 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2540 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2572 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
2541 SECONDARY_EXEC_RDTSCP | 2573 SECONDARY_EXEC_RDTSCP |
2542 SECONDARY_EXEC_ENABLE_INVPCID; 2574 SECONDARY_EXEC_ENABLE_INVPCID |
2575 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2576 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
2543 if (adjust_vmx_controls(min2, opt2, 2577 if (adjust_vmx_controls(min2, opt2,
2544 MSR_IA32_VMX_PROCBASED_CTLS2, 2578 MSR_IA32_VMX_PROCBASED_CTLS2,
2545 &_cpu_based_2nd_exec_control) < 0) 2579 &_cpu_based_2nd_exec_control) < 0)
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2550 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2584 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2551 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2585 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2552#endif 2586#endif
2587
2588 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2589 _cpu_based_2nd_exec_control &= ~(
2590 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2591 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2592 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2593
2553 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2594 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
2554 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2595 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2555 enabled */ 2596 enabled */
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void)
2747 if (!cpu_has_vmx_ple()) 2788 if (!cpu_has_vmx_ple())
2748 ple_gap = 0; 2789 ple_gap = 0;
2749 2790
2791 if (!cpu_has_vmx_apic_register_virt() ||
2792 !cpu_has_vmx_virtual_intr_delivery())
2793 enable_apicv_reg_vid = 0;
2794
2795 if (enable_apicv_reg_vid)
2796 kvm_x86_ops->update_cr8_intercept = NULL;
2797 else
2798 kvm_x86_ops->hwapic_irr_update = NULL;
2799
2750 if (nested) 2800 if (nested)
2751 nested_vmx_setup_ctls_msrs(); 2801 nested_vmx_setup_ctls_msrs();
2752 2802
@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void)
2758 free_kvm_area(); 2808 free_kvm_area();
2759} 2809}
2760 2810
2761static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) 2811static bool emulation_required(struct kvm_vcpu *vcpu)
2762{ 2812{
2763 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2813 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2764 struct kvm_segment tmp = *save; 2814}
2765 2815
2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2816static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
2767 tmp.base = vmcs_readl(sf->base); 2817 struct kvm_segment *save)
2768 tmp.selector = vmcs_read16(sf->selector); 2818{
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; 2819 if (!emulate_invalid_guest_state) {
2770 tmp.s = 1; 2820 /*
2821 * CS and SS RPL should be equal during guest entry according
2822 * to VMX spec, but in reality it is not always so. Since vcpu
2823 * is in the middle of the transition from real mode to
2824 * protected mode it is safe to assume that RPL 0 is a good
2825 * default value.
2826 */
2827 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
2828 save->selector &= ~SELECTOR_RPL_MASK;
2829 save->dpl = save->selector & SELECTOR_RPL_MASK;
2830 save->s = 1;
2771 } 2831 }
2772 vmx_set_segment(vcpu, &tmp, seg); 2832 vmx_set_segment(vcpu, save, seg);
2773} 2833}
2774 2834
2775static void enter_pmode(struct kvm_vcpu *vcpu) 2835static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2777 unsigned long flags; 2837 unsigned long flags;
2778 struct vcpu_vmx *vmx = to_vmx(vcpu); 2838 struct vcpu_vmx *vmx = to_vmx(vcpu);
2779 2839
2780 vmx->emulation_required = 1; 2840 /*
2841 * Update real mode segment cache. It may be not up-to-date if sement
2842 * register was written while vcpu was in a guest mode.
2843 */
2844 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2845 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2846 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2847 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2848 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2849 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2850
2781 vmx->rmode.vm86_active = 0; 2851 vmx->rmode.vm86_active = 0;
2782 2852
2783 vmx_segment_cache_clear(vmx); 2853 vmx_segment_cache_clear(vmx);
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2794 2864
2795 update_exception_bitmap(vcpu); 2865 update_exception_bitmap(vcpu);
2796 2866
2797 if (emulate_invalid_guest_state) 2867 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2798 return; 2868 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2799 2869 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2800 fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2870 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2801 fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2871 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2802 fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2872 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2803 fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2804
2805 vmx_segment_cache_clear(vmx);
2806 2873
2807 vmcs_write16(GUEST_SS_SELECTOR, 0); 2874 /* CPL is always 0 when CPU enters protected mode */
2808 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 2875 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2809 2876 vmx->cpl = 0;
2810 vmcs_write16(GUEST_CS_SELECTOR,
2811 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
2812 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2813} 2877}
2814 2878
2815static gva_t rmode_tss_base(struct kvm *kvm) 2879static gva_t rmode_tss_base(struct kvm *kvm)
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2831static void fix_rmode_seg(int seg, struct kvm_segment *save) 2895static void fix_rmode_seg(int seg, struct kvm_segment *save)
2832{ 2896{
2833 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2897 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2834 2898 struct kvm_segment var = *save;
2835 vmcs_write16(sf->selector, save->base >> 4); 2899
2836 vmcs_write32(sf->base, save->base & 0xffff0); 2900 var.dpl = 0x3;
2837 vmcs_write32(sf->limit, 0xffff); 2901 if (seg == VCPU_SREG_CS)
2838 vmcs_write32(sf->ar_bytes, 0xf3); 2902 var.type = 0x3;
2839 if (save->base & 0xf) 2903
2840 printk_once(KERN_WARNING "kvm: segment base is not paragraph" 2904 if (!emulate_invalid_guest_state) {
2841 " aligned when entering protected mode (seg=%d)", 2905 var.selector = var.base >> 4;
2842 seg); 2906 var.base = var.base & 0xffff0;
2907 var.limit = 0xffff;
2908 var.g = 0;
2909 var.db = 0;
2910 var.present = 1;
2911 var.s = 1;
2912 var.l = 0;
2913 var.unusable = 0;
2914 var.type = 0x3;
2915 var.avl = 0;
2916 if (save->base & 0xf)
2917 printk_once(KERN_WARNING "kvm: segment base is not "
2918 "paragraph aligned when entering "
2919 "protected mode (seg=%d)", seg);
2920 }
2921
2922 vmcs_write16(sf->selector, var.selector);
2923 vmcs_write32(sf->base, var.base);
2924 vmcs_write32(sf->limit, var.limit);
2925 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
2843} 2926}
2844 2927
2845static void enter_rmode(struct kvm_vcpu *vcpu) 2928static void enter_rmode(struct kvm_vcpu *vcpu)
2846{ 2929{
2847 unsigned long flags; 2930 unsigned long flags;
2848 struct vcpu_vmx *vmx = to_vmx(vcpu); 2931 struct vcpu_vmx *vmx = to_vmx(vcpu);
2849 struct kvm_segment var;
2850
2851 if (enable_unrestricted_guest)
2852 return;
2853 2932
2854 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2933 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2855 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2934 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2856 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2935 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2857 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2936 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2858 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2937 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2938 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2939 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2859 2940
2860 vmx->emulation_required = 1;
2861 vmx->rmode.vm86_active = 1; 2941 vmx->rmode.vm86_active = 1;
2862 2942
2863
2864 /* 2943 /*
2865 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2944 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2866 * vcpu. Call it here with phys address pointing 16M below 4G. 2945 * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2888 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 2967 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
2889 update_exception_bitmap(vcpu); 2968 update_exception_bitmap(vcpu);
2890 2969
2891 if (emulate_invalid_guest_state) 2970 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2892 goto continue_rmode; 2971 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2893 2972 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2894 vmx_get_segment(vcpu, &var, VCPU_SREG_SS); 2973 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2895 vmx_set_segment(vcpu, &var, VCPU_SREG_SS); 2974 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2896 2975 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2897 vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
2898 vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
2899
2900 vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
2901 vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
2902
2903 vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
2904 vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
2905 2976
2906 vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
2907 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2908
2909 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2910 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2911
2912continue_rmode:
2913 kvm_mmu_reset_context(vcpu); 2977 kvm_mmu_reset_context(vcpu);
2914} 2978}
2915 2979
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3068 struct vcpu_vmx *vmx = to_vmx(vcpu); 3132 struct vcpu_vmx *vmx = to_vmx(vcpu);
3069 unsigned long hw_cr0; 3133 unsigned long hw_cr0;
3070 3134
3135 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3071 if (enable_unrestricted_guest) 3136 if (enable_unrestricted_guest)
3072 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) 3137 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3073 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3138 else {
3074 else 3139 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3075 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
3076 3140
3077 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3141 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3078 enter_pmode(vcpu); 3142 enter_pmode(vcpu);
3079 3143
3080 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3144 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3081 enter_rmode(vcpu); 3145 enter_rmode(vcpu);
3146 }
3082 3147
3083#ifdef CONFIG_X86_64 3148#ifdef CONFIG_X86_64
3084 if (vcpu->arch.efer & EFER_LME) { 3149 if (vcpu->arch.efer & EFER_LME) {
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3098 vmcs_writel(CR0_READ_SHADOW, cr0); 3163 vmcs_writel(CR0_READ_SHADOW, cr0);
3099 vmcs_writel(GUEST_CR0, hw_cr0); 3164 vmcs_writel(GUEST_CR0, hw_cr0);
3100 vcpu->arch.cr0 = cr0; 3165 vcpu->arch.cr0 = cr0;
3101 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3166
3167 /* depends on vcpu->arch.cr0 to be set to a new value */
3168 vmx->emulation_required = emulation_required(vcpu);
3102} 3169}
3103 3170
3104static u64 construct_eptp(unsigned long root_hpa) 3171static u64 construct_eptp(unsigned long root_hpa)
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3155 if (!is_paging(vcpu)) { 3222 if (!is_paging(vcpu)) {
3156 hw_cr4 &= ~X86_CR4_PAE; 3223 hw_cr4 &= ~X86_CR4_PAE;
3157 hw_cr4 |= X86_CR4_PSE; 3224 hw_cr4 |= X86_CR4_PSE;
3225 /*
3226 * SMEP is disabled if CPU is in non-paging mode in
3227 * hardware. However KVM always uses paging mode to
3228 * emulate guest non-paging mode with TDP.
3229 * To emulate this behavior, SMEP needs to be manually
3230 * disabled when guest switches to non-paging mode.
3231 */
3232 hw_cr4 &= ~X86_CR4_SMEP;
3158 } else if (!(cr4 & X86_CR4_PAE)) { 3233 } else if (!(cr4 & X86_CR4_PAE)) {
3159 hw_cr4 &= ~X86_CR4_PAE; 3234 hw_cr4 &= ~X86_CR4_PAE;
3160 } 3235 }
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3171 struct vcpu_vmx *vmx = to_vmx(vcpu); 3246 struct vcpu_vmx *vmx = to_vmx(vcpu);
3172 u32 ar; 3247 u32 ar;
3173 3248
3174 if (vmx->rmode.vm86_active 3249 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3175 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3176 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3177 || seg == VCPU_SREG_GS)) {
3178 *var = vmx->rmode.segs[seg]; 3250 *var = vmx->rmode.segs[seg];
3179 if (seg == VCPU_SREG_TR 3251 if (seg == VCPU_SREG_TR
3180 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3252 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3187 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3259 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3188 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3260 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3189 ar = vmx_read_guest_seg_ar(vmx, seg); 3261 ar = vmx_read_guest_seg_ar(vmx, seg);
3190 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3191 ar = 0;
3192 var->type = ar & 15; 3262 var->type = ar & 15;
3193 var->s = (ar >> 4) & 1; 3263 var->s = (ar >> 4) & 1;
3194 var->dpl = (ar >> 5) & 3; 3264 var->dpl = (ar >> 5) & 3;
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3211 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3281 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3212} 3282}
3213 3283
3214static int __vmx_get_cpl(struct kvm_vcpu *vcpu) 3284static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3215{ 3285{
3286 struct vcpu_vmx *vmx = to_vmx(vcpu);
3287
3216 if (!is_protmode(vcpu)) 3288 if (!is_protmode(vcpu))
3217 return 0; 3289 return 0;
3218 3290
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3220 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ 3292 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
3221 return 3; 3293 return 3;
3222 3294
3223 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
3224}
3225
3226static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3227{
3228 struct vcpu_vmx *vmx = to_vmx(vcpu);
3229
3230 /*
3231 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3232 * fail; use the cache instead.
3233 */
3234 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3235 return vmx->cpl;
3236 }
3237
3238 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3295 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3239 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3296 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3240 vmx->cpl = __vmx_get_cpl(vcpu); 3297 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3241 } 3298 }
3242 3299
3243 return vmx->cpl; 3300 return vmx->cpl;
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3269{ 3326{
3270 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3271 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3328 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3272 u32 ar;
3273 3329
3274 vmx_segment_cache_clear(vmx); 3330 vmx_segment_cache_clear(vmx);
3331 if (seg == VCPU_SREG_CS)
3332 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275 3333
3276 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3334 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3277 vmcs_write16(sf->selector, var->selector); 3335 vmx->rmode.segs[seg] = *var;
3278 vmx->rmode.segs[VCPU_SREG_TR] = *var; 3336 if (seg == VCPU_SREG_TR)
3279 return; 3337 vmcs_write16(sf->selector, var->selector);
3338 else if (var->s)
3339 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3340 goto out;
3280 } 3341 }
3342
3281 vmcs_writel(sf->base, var->base); 3343 vmcs_writel(sf->base, var->base);
3282 vmcs_write32(sf->limit, var->limit); 3344 vmcs_write32(sf->limit, var->limit);
3283 vmcs_write16(sf->selector, var->selector); 3345 vmcs_write16(sf->selector, var->selector);
3284 if (vmx->rmode.vm86_active && var->s) {
3285 vmx->rmode.segs[seg] = *var;
3286 /*
3287 * Hack real-mode segments into vm86 compatibility.
3288 */
3289 if (var->base == 0xffff0000 && var->selector == 0xf000)
3290 vmcs_writel(sf->base, 0xf0000);
3291 ar = 0xf3;
3292 } else
3293 ar = vmx_segment_access_rights(var);
3294 3346
3295 /* 3347 /*
3296 * Fix the "Accessed" bit in AR field of segment registers for older 3348 * Fix the "Accessed" bit in AR field of segment registers for older
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3304 * kvm hack. 3356 * kvm hack.
3305 */ 3357 */
3306 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3358 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3307 ar |= 0x1; /* Accessed */ 3359 var->type |= 0x1; /* Accessed */
3308 3360
3309 vmcs_write32(sf->ar_bytes, ar); 3361 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3310 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3311 3362
3312 /* 3363out:
3313 * Fix segments for real mode guest in hosts that don't have 3364 vmx->emulation_required |= emulation_required(vcpu);
3314 * "unrestricted_mode" or it was disabled.
3315 * This is done to allow migration of the guests from hosts with
3316 * unrestricted guest like Westmere to older host that don't have
3317 * unrestricted guest like Nehelem.
3318 */
3319 if (vmx->rmode.vm86_active) {
3320 switch (seg) {
3321 case VCPU_SREG_CS:
3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3323 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3324 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3325 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3326 vmcs_write16(GUEST_CS_SELECTOR,
3327 vmcs_readl(GUEST_CS_BASE) >> 4);
3328 break;
3329 case VCPU_SREG_ES:
3330 case VCPU_SREG_DS:
3331 case VCPU_SREG_GS:
3332 case VCPU_SREG_FS:
3333 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3334 break;
3335 case VCPU_SREG_SS:
3336 vmcs_write16(GUEST_SS_SELECTOR,
3337 vmcs_readl(GUEST_SS_BASE) >> 4);
3338 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3339 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3340 break;
3341 }
3342 }
3343} 3365}
3344 3366
3345static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3367static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3380 u32 ar; 3402 u32 ar;
3381 3403
3382 vmx_get_segment(vcpu, &var, seg); 3404 vmx_get_segment(vcpu, &var, seg);
3405 var.dpl = 0x3;
3406 if (seg == VCPU_SREG_CS)
3407 var.type = 0x3;
3383 ar = vmx_segment_access_rights(&var); 3408 ar = vmx_segment_access_rights(&var);
3384 3409
3385 if (var.base != (var.selector << 4)) 3410 if (var.base != (var.selector << 4))
3386 return false; 3411 return false;
3387 if (var.limit < 0xffff) 3412 if (var.limit != 0xffff)
3388 return false; 3413 return false;
3389 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) 3414 if (ar != 0xf3)
3390 return false; 3415 return false;
3391 3416
3392 return true; 3417 return true;
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3521 */ 3546 */
3522static bool guest_state_valid(struct kvm_vcpu *vcpu) 3547static bool guest_state_valid(struct kvm_vcpu *vcpu)
3523{ 3548{
3549 if (enable_unrestricted_guest)
3550 return true;
3551
3524 /* real mode guest state checks */ 3552 /* real mode guest state checks */
3525 if (!is_protmode(vcpu)) { 3553 if (!is_protmode(vcpu)) {
3526 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg)
3644 vmcs_write16(sf->selector, 0); 3672 vmcs_write16(sf->selector, 0);
3645 vmcs_writel(sf->base, 0); 3673 vmcs_writel(sf->base, 0);
3646 vmcs_write32(sf->limit, 0xffff); 3674 vmcs_write32(sf->limit, 0xffff);
3647 if (enable_unrestricted_guest) { 3675 ar = 0x93;
3648 ar = 0x93; 3676 if (seg == VCPU_SREG_CS)
3649 if (seg == VCPU_SREG_CS) 3677 ar |= 0x08; /* code segment */
3650 ar |= 0x08; /* code segment */
3651 } else
3652 ar = 0xf3;
3653 3678
3654 vmcs_write32(sf->ar_bytes, ar); 3679 vmcs_write32(sf->ar_bytes, ar);
3655} 3680}
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
3667 kvm_userspace_mem.flags = 0; 3692 kvm_userspace_mem.flags = 0;
3668 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3669 kvm_userspace_mem.memory_size = PAGE_SIZE; 3694 kvm_userspace_mem.memory_size = PAGE_SIZE;
3670 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3695 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3671 if (r) 3696 if (r)
3672 goto out; 3697 goto out;
3673 3698
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
3697 kvm_userspace_mem.guest_phys_addr = 3722 kvm_userspace_mem.guest_phys_addr =
3698 kvm->arch.ept_identity_map_addr; 3723 kvm->arch.ept_identity_map_addr;
3699 kvm_userspace_mem.memory_size = PAGE_SIZE; 3724 kvm_userspace_mem.memory_size = PAGE_SIZE;
3700 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3725 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3701 if (r) 3726 if (r)
3702 goto out; 3727 goto out;
3703 3728
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
3739 spin_unlock(&vmx_vpid_lock); 3764 spin_unlock(&vmx_vpid_lock);
3740} 3765}
3741 3766
3742static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 3767#define MSR_TYPE_R 1
3768#define MSR_TYPE_W 2
3769static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3770 u32 msr, int type)
3743{ 3771{
3744 int f = sizeof(unsigned long); 3772 int f = sizeof(unsigned long);
3745 3773
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
3752 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 3780 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3753 */ 3781 */
3754 if (msr <= 0x1fff) { 3782 if (msr <= 0x1fff) {
3755 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ 3783 if (type & MSR_TYPE_R)
3756 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ 3784 /* read-low */
3785 __clear_bit(msr, msr_bitmap + 0x000 / f);
3786
3787 if (type & MSR_TYPE_W)
3788 /* write-low */
3789 __clear_bit(msr, msr_bitmap + 0x800 / f);
3790
3757 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3791 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3758 msr &= 0x1fff; 3792 msr &= 0x1fff;
3759 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ 3793 if (type & MSR_TYPE_R)
3760 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ 3794 /* read-high */
3795 __clear_bit(msr, msr_bitmap + 0x400 / f);
3796
3797 if (type & MSR_TYPE_W)
3798 /* write-high */
3799 __clear_bit(msr, msr_bitmap + 0xc00 / f);
3800
3801 }
3802}
3803
3804static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3805 u32 msr, int type)
3806{
3807 int f = sizeof(unsigned long);
3808
3809 if (!cpu_has_vmx_msr_bitmap())
3810 return;
3811
3812 /*
3813 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3814 * have the write-low and read-high bitmap offsets the wrong way round.
3815 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3816 */
3817 if (msr <= 0x1fff) {
3818 if (type & MSR_TYPE_R)
3819 /* read-low */
3820 __set_bit(msr, msr_bitmap + 0x000 / f);
3821
3822 if (type & MSR_TYPE_W)
3823 /* write-low */
3824 __set_bit(msr, msr_bitmap + 0x800 / f);
3825
3826 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3827 msr &= 0x1fff;
3828 if (type & MSR_TYPE_R)
3829 /* read-high */
3830 __set_bit(msr, msr_bitmap + 0x400 / f);
3831
3832 if (type & MSR_TYPE_W)
3833 /* write-high */
3834 __set_bit(msr, msr_bitmap + 0xc00 / f);
3835
3761 } 3836 }
3762} 3837}
3763 3838
3764static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 3839static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3765{ 3840{
3766 if (!longmode_only) 3841 if (!longmode_only)
3767 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); 3842 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
3768 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); 3843 msr, MSR_TYPE_R | MSR_TYPE_W);
3844 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
3845 msr, MSR_TYPE_R | MSR_TYPE_W);
3846}
3847
3848static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
3849{
3850 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3851 msr, MSR_TYPE_R);
3852 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3853 msr, MSR_TYPE_R);
3854}
3855
3856static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
3857{
3858 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3859 msr, MSR_TYPE_R);
3860 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3861 msr, MSR_TYPE_R);
3862}
3863
3864static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
3865{
3866 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3867 msr, MSR_TYPE_W);
3868 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3869 msr, MSR_TYPE_W);
3769} 3870}
3770 3871
3771/* 3872/*
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3844 return exec_control; 3945 return exec_control;
3845} 3946}
3846 3947
3948static int vmx_vm_has_apicv(struct kvm *kvm)
3949{
3950 return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
3951}
3952
3847static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 3953static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3848{ 3954{
3849 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3955 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3861 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3967 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3862 if (!ple_gap) 3968 if (!ple_gap)
3863 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 3969 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3970 if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
3864 return exec_control; 3974 return exec_control;
3865} 3975}
3866 3976
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3905 vmx_secondary_exec_control(vmx)); 4015 vmx_secondary_exec_control(vmx));
3906 } 4016 }
3907 4017
4018 if (enable_apicv_reg_vid) {
4019 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4020 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4021 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4022 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4023
4024 vmcs_write16(GUEST_INTR_STATUS, 0);
4025 }
4026
3908 if (ple_gap) { 4027 if (ple_gap) {
3909 vmcs_write32(PLE_GAP, ple_gap); 4028 vmcs_write32(PLE_GAP, ple_gap);
3910 vmcs_write32(PLE_WINDOW, ple_window); 4029 vmcs_write32(PLE_WINDOW, ple_window);
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3990 vmx_segment_cache_clear(vmx); 4109 vmx_segment_cache_clear(vmx);
3991 4110
3992 seg_setup(VCPU_SREG_CS); 4111 seg_setup(VCPU_SREG_CS);
3993 /* 4112 if (kvm_vcpu_is_bsp(&vmx->vcpu))
3994 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
3995 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
3996 */
3997 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
3998 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4113 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
3999 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 4114 else {
4000 } else {
4001 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 4115 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
4002 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 4116 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
4003 } 4117 }
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4073 4187
4074 ret = 0; 4188 ret = 0;
4075 4189
4076 /* HACK: Don't enable emulation on guest boot/reset */
4077 vmx->emulation_required = 0;
4078
4079 return ret; 4190 return ret;
4080} 4191}
4081 4192
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4251 .flags = 0, 4362 .flags = 0,
4252 }; 4363 };
4253 4364
4254 ret = kvm_set_memory_region(kvm, &tss_mem, 0); 4365 ret = kvm_set_memory_region(kvm, &tss_mem, false);
4255 if (ret) 4366 if (ret)
4256 return ret; 4367 return ret;
4257 kvm->arch.tss_addr = addr; 4368 kvm->arch.tss_addr = addr;
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4261 return 0; 4372 return 0;
4262} 4373}
4263 4374
4264static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4375static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4265 int vec, u32 err_code)
4266{ 4376{
4267 /*
4268 * Instruction with address size override prefix opcode 0x67
4269 * Cause the #SS fault with 0 error code in VM86 mode.
4270 */
4271 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
4272 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
4273 return 1;
4274 /*
4275 * Forward all other exceptions that are valid in real mode.
4276 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4277 * the required debugging infrastructure rework.
4278 */
4279 switch (vec) { 4377 switch (vec) {
4280 case DB_VECTOR:
4281 if (vcpu->guest_debug &
4282 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4283 return 0;
4284 kvm_queue_exception(vcpu, vec);
4285 return 1;
4286 case BP_VECTOR: 4378 case BP_VECTOR:
4287 /* 4379 /*
4288 * Update instruction length as we may reinject the exception 4380 * Update instruction length as we may reinject the exception
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4291 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4383 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4384 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4293 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4385 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4294 return 0; 4386 return false;
4387 /* fall through */
4388 case DB_VECTOR:
4389 if (vcpu->guest_debug &
4390 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4391 return false;
4295 /* fall through */ 4392 /* fall through */
4296 case DE_VECTOR: 4393 case DE_VECTOR:
4297 case OF_VECTOR: 4394 case OF_VECTOR:
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4301 case SS_VECTOR: 4398 case SS_VECTOR:
4302 case GP_VECTOR: 4399 case GP_VECTOR:
4303 case MF_VECTOR: 4400 case MF_VECTOR:
4304 kvm_queue_exception(vcpu, vec); 4401 return true;
4305 return 1; 4402 break;
4306 } 4403 }
4307 return 0; 4404 return false;
4405}
4406
4407static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4408 int vec, u32 err_code)
4409{
4410 /*
4411 * Instruction with address size override prefix opcode 0x67
4412 * Cause the #SS fault with 0 error code in VM86 mode.
4413 */
4414 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4415 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4416 if (vcpu->arch.halt_request) {
4417 vcpu->arch.halt_request = 0;
4418 return kvm_emulate_halt(vcpu);
4419 }
4420 return 1;
4421 }
4422 return 0;
4423 }
4424
4425 /*
4426 * Forward all other exceptions that are valid in real mode.
4427 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4428 * the required debugging infrastructure rework.
4429 */
4430 kvm_queue_exception(vcpu, vec);
4431 return 1;
4308} 4432}
4309 4433
4310/* 4434/*
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4392 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 4516 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
4393 } 4517 }
4394 4518
4395 if (vmx->rmode.vm86_active &&
4396 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
4397 error_code)) {
4398 if (vcpu->arch.halt_request) {
4399 vcpu->arch.halt_request = 0;
4400 return kvm_emulate_halt(vcpu);
4401 }
4402 return 1;
4403 }
4404
4405 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4519 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4520
4521 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4522 return handle_rmode_exception(vcpu, ex_no, error_code);
4523
4406 switch (ex_no) { 4524 switch (ex_no) {
4407 case DB_VECTOR: 4525 case DB_VECTOR:
4408 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4526 dr6 = vmcs_readl(EXIT_QUALIFICATION);
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
4820 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4938 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4821} 4939}
4822 4940
4941static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
4942{
4943 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4944 int vector = exit_qualification & 0xff;
4945
4946 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
4947 kvm_apic_set_eoi_accelerated(vcpu, vector);
4948 return 1;
4949}
4950
4951static int handle_apic_write(struct kvm_vcpu *vcpu)
4952{
4953 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4954 u32 offset = exit_qualification & 0xfff;
4955
4956 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
4957 kvm_apic_write_nodecode(vcpu, offset);
4958 return 1;
4959}
4960
4823static int handle_task_switch(struct kvm_vcpu *vcpu) 4961static int handle_task_switch(struct kvm_vcpu *vcpu)
4824{ 4962{
4825 struct vcpu_vmx *vmx = to_vmx(vcpu); 4963 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5065 schedule(); 5203 schedule();
5066 } 5204 }
5067 5205
5068 vmx->emulation_required = !guest_state_valid(vcpu); 5206 vmx->emulation_required = emulation_required(vcpu);
5069out: 5207out:
5070 return ret; 5208 return ret;
5071} 5209}
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5754 [EXIT_REASON_VMON] = handle_vmon, 5892 [EXIT_REASON_VMON] = handle_vmon,
5755 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5893 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
5756 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5894 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
5895 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
5896 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5757 [EXIT_REASON_WBINVD] = handle_wbinvd, 5897 [EXIT_REASON_WBINVD] = handle_wbinvd,
5758 [EXIT_REASON_XSETBV] = handle_xsetbv, 5898 [EXIT_REASON_XSETBV] = handle_xsetbv,
5759 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5899 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5780 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 5920 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5781 gpa_t bitmap; 5921 gpa_t bitmap;
5782 5922
5783 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) 5923 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5784 return 1; 5924 return 1;
5785 5925
5786 /* 5926 /*
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6008 u32 vectoring_info = vmx->idt_vectoring_info; 6148 u32 vectoring_info = vmx->idt_vectoring_info;
6009 6149
6010 /* If guest state is invalid, start emulating */ 6150 /* If guest state is invalid, start emulating */
6011 if (vmx->emulation_required && emulate_invalid_guest_state) 6151 if (vmx->emulation_required)
6012 return handle_invalid_guest_state(vcpu); 6152 return handle_invalid_guest_state(vcpu);
6013 6153
6014 /* 6154 /*
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6103 vmcs_write32(TPR_THRESHOLD, irr); 6243 vmcs_write32(TPR_THRESHOLD, irr);
6104} 6244}
6105 6245
6246static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
6247{
6248 u32 sec_exec_control;
6249
6250 /*
6251 * There is not point to enable virtualize x2apic without enable
6252 * apicv
6253 */
6254 if (!cpu_has_vmx_virtualize_x2apic_mode() ||
6255 !vmx_vm_has_apicv(vcpu->kvm))
6256 return;
6257
6258 if (!vm_need_tpr_shadow(vcpu->kvm))
6259 return;
6260
6261 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6262
6263 if (set) {
6264 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6265 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6266 } else {
6267 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6268 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6269 }
6270 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
6271
6272 vmx_set_msr_bitmap(vcpu);
6273}
6274
6275static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
6276{
6277 u16 status;
6278 u8 old;
6279
6280 if (!vmx_vm_has_apicv(kvm))
6281 return;
6282
6283 if (isr == -1)
6284 isr = 0;
6285
6286 status = vmcs_read16(GUEST_INTR_STATUS);
6287 old = status >> 8;
6288 if (isr != old) {
6289 status &= 0xff;
6290 status |= isr << 8;
6291 vmcs_write16(GUEST_INTR_STATUS, status);
6292 }
6293}
6294
6295static void vmx_set_rvi(int vector)
6296{
6297 u16 status;
6298 u8 old;
6299
6300 status = vmcs_read16(GUEST_INTR_STATUS);
6301 old = (u8)status & 0xff;
6302 if ((u8)vector != old) {
6303 status &= ~0xff;
6304 status |= (u8)vector;
6305 vmcs_write16(GUEST_INTR_STATUS, status);
6306 }
6307}
6308
6309static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6310{
6311 if (max_irr == -1)
6312 return;
6313
6314 vmx_set_rvi(max_irr);
6315}
6316
6317static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6318{
6319 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6320 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6321 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6322 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6323}
6324
6106static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 6325static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6107{ 6326{
6108 u32 exit_intr_info; 6327 u32 exit_intr_info;
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6291 6510
6292 /* Don't enter VMX if guest state is invalid, let the exit handler 6511 /* Don't enter VMX if guest state is invalid, let the exit handler
6293 start emulation until we arrive back to a valid state */ 6512 start emulation until we arrive back to a valid state */
6294 if (vmx->emulation_required && emulate_invalid_guest_state) 6513 if (vmx->emulation_required)
6295 return; 6514 return;
6296 6515
6297 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6516 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
7366 .enable_nmi_window = enable_nmi_window, 7585 .enable_nmi_window = enable_nmi_window,
7367 .enable_irq_window = enable_irq_window, 7586 .enable_irq_window = enable_irq_window,
7368 .update_cr8_intercept = update_cr8_intercept, 7587 .update_cr8_intercept = update_cr8_intercept,
7588 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
7589 .vm_has_apicv = vmx_vm_has_apicv,
7590 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7591 .hwapic_irr_update = vmx_hwapic_irr_update,
7592 .hwapic_isr_update = vmx_hwapic_isr_update,
7369 7593
7370 .set_tss_addr = vmx_set_tss_addr, 7594 .set_tss_addr = vmx_set_tss_addr,
7371 .get_tdp_level = get_ept_level, 7595 .get_tdp_level = get_ept_level,
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7398 7622
7399static int __init vmx_init(void) 7623static int __init vmx_init(void)
7400{ 7624{
7401 int r, i; 7625 int r, i, msr;
7402 7626
7403 rdmsrl_safe(MSR_EFER, &host_efer); 7627 rdmsrl_safe(MSR_EFER, &host_efer);
7404 7628
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void)
7419 if (!vmx_msr_bitmap_legacy) 7643 if (!vmx_msr_bitmap_legacy)
7420 goto out1; 7644 goto out1;
7421 7645
7646 vmx_msr_bitmap_legacy_x2apic =
7647 (unsigned long *)__get_free_page(GFP_KERNEL);
7648 if (!vmx_msr_bitmap_legacy_x2apic)
7649 goto out2;
7422 7650
7423 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7651 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7424 if (!vmx_msr_bitmap_longmode) 7652 if (!vmx_msr_bitmap_longmode)
7425 goto out2; 7653 goto out3;
7426 7654
7655 vmx_msr_bitmap_longmode_x2apic =
7656 (unsigned long *)__get_free_page(GFP_KERNEL);
7657 if (!vmx_msr_bitmap_longmode_x2apic)
7658 goto out4;
7427 7659
7428 /* 7660 /*
7429 * Allow direct access to the PC debug port (it is often used for I/O 7661 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void)
7455 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 7687 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
7456 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 7688 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
7457 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7689 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7690 memcpy(vmx_msr_bitmap_legacy_x2apic,
7691 vmx_msr_bitmap_legacy, PAGE_SIZE);
7692 memcpy(vmx_msr_bitmap_longmode_x2apic,
7693 vmx_msr_bitmap_longmode, PAGE_SIZE);
7694
7695 if (enable_apicv_reg_vid) {
7696 for (msr = 0x800; msr <= 0x8ff; msr++)
7697 vmx_disable_intercept_msr_read_x2apic(msr);
7698
7699 /* According SDM, in x2apic mode, the whole id reg is used.
7700 * But in KVM, it only use the highest eight bits. Need to
7701 * intercept it */
7702 vmx_enable_intercept_msr_read_x2apic(0x802);
7703 /* TMCCT */
7704 vmx_enable_intercept_msr_read_x2apic(0x839);
7705 /* TPR */
7706 vmx_disable_intercept_msr_write_x2apic(0x808);
7707 /* EOI */
7708 vmx_disable_intercept_msr_write_x2apic(0x80b);
7709 /* SELF-IPI */
7710 vmx_disable_intercept_msr_write_x2apic(0x83f);
7711 }
7458 7712
7459 if (enable_ept) { 7713 if (enable_ept) {
7460 kvm_mmu_set_mask_ptes(0ull, 7714 kvm_mmu_set_mask_ptes(0ull,
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void)
7468 7722
7469 return 0; 7723 return 0;
7470 7724
7471out3: 7725out4:
7472 free_page((unsigned long)vmx_msr_bitmap_longmode); 7726 free_page((unsigned long)vmx_msr_bitmap_longmode);
7727out3:
7728 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7473out2: 7729out2:
7474 free_page((unsigned long)vmx_msr_bitmap_legacy); 7730 free_page((unsigned long)vmx_msr_bitmap_legacy);
7475out1: 7731out1:
@@ -7481,6 +7737,8 @@ out:
7481 7737
7482static void __exit vmx_exit(void) 7738static void __exit vmx_exit(void)
7483{ 7739{
7740 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7741 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7484 free_page((unsigned long)vmx_msr_bitmap_legacy); 7742 free_page((unsigned long)vmx_msr_bitmap_legacy);
7485 free_page((unsigned long)vmx_msr_bitmap_longmode); 7743 free_page((unsigned long)vmx_msr_bitmap_longmode);
7486 free_page((unsigned long)vmx_io_bitmap_b); 7744 free_page((unsigned long)vmx_io_bitmap_b);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37040079cd6b..f71500af1f81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
872 872
873 kvm_x86_ops->set_efer(vcpu, efer); 873 kvm_x86_ops->set_efer(vcpu, efer);
874 874
875 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
876
877 /* Update reserved bits */ 875 /* Update reserved bits */
878 if ((efer ^ old_efer) & EFER_NX) 876 if ((efer ^ old_efer) & EFER_NX)
879 kvm_mmu_reset_context(vcpu); 877 kvm_mmu_reset_context(vcpu);
@@ -2522,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2522 r = KVM_MAX_VCPUS; 2520 r = KVM_MAX_VCPUS;
2523 break; 2521 break;
2524 case KVM_CAP_NR_MEMSLOTS: 2522 case KVM_CAP_NR_MEMSLOTS:
2525 r = KVM_MEMORY_SLOTS; 2523 r = KVM_USER_MEM_SLOTS;
2526 break; 2524 break;
2527 case KVM_CAP_PV_MMU: /* obsolete */ 2525 case KVM_CAP_PV_MMU: /* obsolete */
2528 r = 0; 2526 r = 0;
@@ -3274,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3274 return -EINVAL; 3272 return -EINVAL;
3275 3273
3276 mutex_lock(&kvm->slots_lock); 3274 mutex_lock(&kvm->slots_lock);
3277 spin_lock(&kvm->mmu_lock);
3278 3275
3279 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3276 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3280 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3277 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3281 3278
3282 spin_unlock(&kvm->mmu_lock);
3283 mutex_unlock(&kvm->slots_lock); 3279 mutex_unlock(&kvm->slots_lock);
3284 return 0; 3280 return 0;
3285} 3281}
@@ -3439,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3439 mutex_lock(&kvm->slots_lock); 3435 mutex_lock(&kvm->slots_lock);
3440 3436
3441 r = -EINVAL; 3437 r = -EINVAL;
3442 if (log->slot >= KVM_MEMORY_SLOTS) 3438 if (log->slot >= KVM_USER_MEM_SLOTS)
3443 goto out; 3439 goto out;
3444 3440
3445 memslot = id_to_memslot(kvm->memslots, log->slot); 3441 memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4495,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4495 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4496 *selector = var.selector; 4492 *selector = var.selector;
4497 4493
4498 if (var.unusable) 4494 if (var.unusable) {
4495 memset(desc, 0, sizeof(*desc));
4499 return false; 4496 return false;
4497 }
4500 4498
4501 if (var.g) 4499 if (var.g)
4502 var.limit >>= 12; 4500 var.limit >>= 12;
@@ -4757,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4757 return r; 4755 return r;
4758} 4756}
4759 4757
4760static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4758static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
4759 bool write_fault_to_shadow_pgtable)
4761{ 4760{
4762 gpa_t gpa; 4761 gpa_t gpa = cr2;
4763 pfn_t pfn; 4762 pfn_t pfn;
4764 4763
4765 if (tdp_enabled) 4764 if (!vcpu->arch.mmu.direct_map) {
4766 return false; 4765 /*
4767 4766 * Write permission should be allowed since only
4768 /* 4767 * write access need to be emulated.
4769 * if emulation was due to access to shadowed page table 4768 */
4770 * and it failed try to unshadow page and re-enter the 4769 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4771 * guest to let CPU execute the instruction.
4772 */
4773 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4774 return true;
4775
4776 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
4777 4770
4778 if (gpa == UNMAPPED_GVA) 4771 /*
4779 return true; /* let cpu generate fault */ 4772 * If the mapping is invalid in guest, let cpu retry
4773 * it to generate fault.
4774 */
4775 if (gpa == UNMAPPED_GVA)
4776 return true;
4777 }
4780 4778
4781 /* 4779 /*
4782 * Do not retry the unhandleable instruction if it faults on the 4780 * Do not retry the unhandleable instruction if it faults on the
@@ -4785,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4785 * instruction -> ... 4783 * instruction -> ...
4786 */ 4784 */
4787 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4785 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4788 if (!is_error_noslot_pfn(pfn)) { 4786
4789 kvm_release_pfn_clean(pfn); 4787 /*
4788 * If the instruction failed on the error pfn, it can not be fixed,
4789 * report the error to userspace.
4790 */
4791 if (is_error_noslot_pfn(pfn))
4792 return false;
4793
4794 kvm_release_pfn_clean(pfn);
4795
4796 /* The instructions are well-emulated on direct mmu. */
4797 if (vcpu->arch.mmu.direct_map) {
4798 unsigned int indirect_shadow_pages;
4799
4800 spin_lock(&vcpu->kvm->mmu_lock);
4801 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
4802 spin_unlock(&vcpu->kvm->mmu_lock);
4803
4804 if (indirect_shadow_pages)
4805 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4806
4790 return true; 4807 return true;
4791 } 4808 }
4792 4809
4793 return false; 4810 /*
4811 * if emulation was due to access to shadowed page table
4812 * and it failed try to unshadow page and re-enter the
4813 * guest to let CPU execute the instruction.
4814 */
4815 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4816
4817 /*
4818 * If the access faults on its page table, it can not
4819 * be fixed by unprotecting shadow page and it should
4820 * be reported to userspace.
4821 */
4822 return !write_fault_to_shadow_pgtable;
4794} 4823}
4795 4824
4796static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 4825static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4832,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4832 if (!vcpu->arch.mmu.direct_map) 4861 if (!vcpu->arch.mmu.direct_map)
4833 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 4862 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4834 4863
4835 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4864 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4836 4865
4837 return true; 4866 return true;
4838} 4867}
@@ -4849,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4849 int r; 4878 int r;
4850 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4879 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4851 bool writeback = true; 4880 bool writeback = true;
4881 bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
4852 4882
4883 /*
4884 * Clear write_fault_to_shadow_pgtable here to ensure it is
4885 * never reused.
4886 */
4887 vcpu->arch.write_fault_to_shadow_pgtable = false;
4853 kvm_clear_exception_queue(vcpu); 4888 kvm_clear_exception_queue(vcpu);
4854 4889
4855 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4890 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4868,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4868 if (r != EMULATION_OK) { 4903 if (r != EMULATION_OK) {
4869 if (emulation_type & EMULTYPE_TRAP_UD) 4904 if (emulation_type & EMULTYPE_TRAP_UD)
4870 return EMULATE_FAIL; 4905 return EMULATE_FAIL;
4871 if (reexecute_instruction(vcpu, cr2)) 4906 if (reexecute_instruction(vcpu, cr2,
4907 write_fault_to_spt))
4872 return EMULATE_DONE; 4908 return EMULATE_DONE;
4873 if (emulation_type & EMULTYPE_SKIP) 4909 if (emulation_type & EMULTYPE_SKIP)
4874 return EMULATE_FAIL; 4910 return EMULATE_FAIL;
@@ -4898,7 +4934,7 @@ restart:
4898 return EMULATE_DONE; 4934 return EMULATE_DONE;
4899 4935
4900 if (r == EMULATION_FAILED) { 4936 if (r == EMULATION_FAILED) {
4901 if (reexecute_instruction(vcpu, cr2)) 4937 if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
4902 return EMULATE_DONE; 4938 return EMULATE_DONE;
4903 4939
4904 return handle_emulation_failure(vcpu); 4940 return handle_emulation_failure(vcpu);
@@ -5541,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5541 vcpu->arch.nmi_injected = true; 5577 vcpu->arch.nmi_injected = true;
5542 kvm_x86_ops->set_nmi(vcpu); 5578 kvm_x86_ops->set_nmi(vcpu);
5543 } 5579 }
5544 } else if (kvm_cpu_has_interrupt(vcpu)) { 5580 } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5545 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5581 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5546 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5582 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5547 false); 5583 false);
@@ -5609,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
5609#endif 5645#endif
5610} 5646}
5611 5647
5648static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
5649{
5650 u64 eoi_exit_bitmap[4];
5651
5652 memset(eoi_exit_bitmap, 0, 32);
5653
5654 kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
5655 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5656}
5657
5612static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5658static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5613{ 5659{
5614 int r; 5660 int r;
@@ -5662,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5662 kvm_handle_pmu_event(vcpu); 5708 kvm_handle_pmu_event(vcpu);
5663 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5709 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5664 kvm_deliver_pmi(vcpu); 5710 kvm_deliver_pmi(vcpu);
5711 if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
5712 update_eoi_exitmap(vcpu);
5665 } 5713 }
5666 5714
5667 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5715 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5670,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5670 /* enable NMI/IRQ window open exits if needed */ 5718 /* enable NMI/IRQ window open exits if needed */
5671 if (vcpu->arch.nmi_pending) 5719 if (vcpu->arch.nmi_pending)
5672 kvm_x86_ops->enable_nmi_window(vcpu); 5720 kvm_x86_ops->enable_nmi_window(vcpu);
5673 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5721 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5674 kvm_x86_ops->enable_irq_window(vcpu); 5722 kvm_x86_ops->enable_irq_window(vcpu);
5675 5723
5676 if (kvm_lapic_enabled(vcpu)) { 5724 if (kvm_lapic_enabled(vcpu)) {
5725 /*
5726 * Update architecture specific hints for APIC
5727 * virtual interrupt delivery.
5728 */
5729 if (kvm_x86_ops->hwapic_irr_update)
5730 kvm_x86_ops->hwapic_irr_update(vcpu,
5731 kvm_lapic_find_highest_irr(vcpu));
5677 update_cr8_intercept(vcpu); 5732 update_cr8_intercept(vcpu);
5678 kvm_lapic_sync_to_vapic(vcpu); 5733 kvm_lapic_sync_to_vapic(vcpu);
5679 } 5734 }
@@ -6853,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
6853 struct kvm_memory_slot *memslot, 6908 struct kvm_memory_slot *memslot,
6854 struct kvm_memory_slot old, 6909 struct kvm_memory_slot old,
6855 struct kvm_userspace_memory_region *mem, 6910 struct kvm_userspace_memory_region *mem,
6856 int user_alloc) 6911 bool user_alloc)
6857{ 6912{
6858 int npages = memslot->npages; 6913 int npages = memslot->npages;
6859 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6860 6914
6861 /* Prevent internal slot pages from being moved by fork()/COW. */ 6915 /*
6862 if (memslot->id >= KVM_MEMORY_SLOTS) 6916 * Only private memory slots need to be mapped here since
6863 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6917 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
6864
6865 /*To keep backward compatibility with older userspace,
6866 *x86 needs to handle !user_alloc case.
6867 */ 6918 */
6868 if (!user_alloc) { 6919 if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
6869 if (npages && !old.npages) { 6920 unsigned long userspace_addr;
6870 unsigned long userspace_addr;
6871 6921
6872 userspace_addr = vm_mmap(NULL, 0, 6922 /*
6873 npages * PAGE_SIZE, 6923 * MAP_SHARED to prevent internal slot pages from being moved
6874 PROT_READ | PROT_WRITE, 6924 * by fork()/COW.
6875 map_flags, 6925 */
6876 0); 6926 userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
6927 PROT_READ | PROT_WRITE,
6928 MAP_SHARED | MAP_ANONYMOUS, 0);
6877 6929
6878 if (IS_ERR((void *)userspace_addr)) 6930 if (IS_ERR((void *)userspace_addr))
6879 return PTR_ERR((void *)userspace_addr); 6931 return PTR_ERR((void *)userspace_addr);
6880 6932
6881 memslot->userspace_addr = userspace_addr; 6933 memslot->userspace_addr = userspace_addr;
6882 }
6883 } 6934 }
6884 6935
6885
6886 return 0; 6936 return 0;
6887} 6937}
6888 6938
6889void kvm_arch_commit_memory_region(struct kvm *kvm, 6939void kvm_arch_commit_memory_region(struct kvm *kvm,
6890 struct kvm_userspace_memory_region *mem, 6940 struct kvm_userspace_memory_region *mem,
6891 struct kvm_memory_slot old, 6941 struct kvm_memory_slot old,
6892 int user_alloc) 6942 bool user_alloc)
6893{ 6943{
6894 6944
6895 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6945 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6896 6946
6897 if (!user_alloc && !old.user_alloc && old.npages && !npages) { 6947 if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
6898 int ret; 6948 int ret;
6899 6949
6900 ret = vm_munmap(old.userspace_addr, 6950 ret = vm_munmap(old.userspace_addr,
@@ -6908,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6908 if (!kvm->arch.n_requested_mmu_pages) 6958 if (!kvm->arch.n_requested_mmu_pages)
6909 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 6959 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6910 6960
6911 spin_lock(&kvm->mmu_lock);
6912 if (nr_mmu_pages) 6961 if (nr_mmu_pages)
6913 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6962 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6914 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6963 /*
6915 spin_unlock(&kvm->mmu_lock); 6964 * Write protect all pages for dirty logging.
6965 * Existing largepage mappings are destroyed here and new ones will
6966 * not be created until the end of the logging.
6967 */
6968 if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
6969 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6916 /* 6970 /*
6917 * If memory slot is created, or moved, we need to clear all 6971 * If memory slot is created, or moved, we need to clear all
6918 * mmio sptes. 6972 * mmio sptes.