-rw-r--r--  Documentation/kernel-parameters.txt | 4
-rw-r--r--  Documentation/virtual/kvm/api.txt | 71
-rw-r--r--  arch/powerpc/include/asm/kvm.h | 13
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 40
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 30
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 1
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 13
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 10
-rw-r--r--  arch/powerpc/kvm/44x.c | 2
-rw-r--r--  arch/powerpc/kvm/Makefile | 4
-rw-r--r--  arch/powerpc/kvm/book3s_32_sr.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_64_slb.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 29
-rw-r--r--  arch/powerpc/kvm/book3s_exports.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 343
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 33
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 300
-rw-r--r--  arch/powerpc/kvm/book3s_interrupts.S | 129
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 58
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c | 158
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 54
-rw-r--r--  arch/powerpc/kvm/book3s_segment.S | 117
-rw-r--r--  arch/powerpc/kvm/booke.c | 10
-rw-r--r--  arch/powerpc/kvm/e500.c | 2
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 55
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 7
-rw-r--r--  arch/s390/kvm/interrupt.c | 30
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 20
-rw-r--r--  arch/s390/kvm/sigp.c | 45
-rw-r--r--  arch/x86/include/asm/apicdef.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 4
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 12
-rw-r--r--  arch/x86/kvm/emulate.c | 867
-rw-r--r--  arch/x86/kvm/i8254.c | 6
-rw-r--r--  arch/x86/kvm/i8259.c | 123
-rw-r--r--  arch/x86/kvm/irq.h | 4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 7
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 167
-rw-r--r--  arch/x86/kvm/lapic.h | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 5
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 24
-rw-r--r--  arch/x86/kvm/svm.c | 93
-rw-r--r--  arch/x86/kvm/trace.h | 118
-rw-r--r--  arch/x86/kvm/vmx.c | 131
-rw-r--r--  arch/x86/kvm/x86.c | 274
-rw-r--r--  include/linux/kvm.h | 6
-rw-r--r--  include/linux/kvm_host.h | 32
-rw-r--r--  virt/kvm/assigned-dev.c | 62
-rw-r--r--  virt/kvm/coalesced_mmio.c | 131
-rw-r--r--  virt/kvm/coalesced_mmio.h | 7
-rw-r--r--  virt/kvm/eventfd.c | 3
-rw-r--r--  virt/kvm/ioapic.c | 3
-rw-r--r--  virt/kvm/kvm_main.c | 112
60 files changed, 2449 insertions, 1369 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 93413ce96883..27e0488d54d2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1201,6 +1201,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1201 [KVM,Intel] Disable FlexPriority feature (TPR shadow). 1201 [KVM,Intel] Disable FlexPriority feature (TPR shadow).
1202 Default is 1 (enabled) 1202 Default is 1 (enabled)
1203 1203
1204 kvm-intel.nested=
1205 [KVM,Intel] Enable VMX nesting (nVMX).
1206 Default is 0 (disabled)
1207
1204 kvm-intel.unrestricted_guest= 1208 kvm-intel.unrestricted_guest=
1205 [KVM,Intel] Disable unrestricted guest feature 1209 [KVM,Intel] Disable unrestricted guest feature
1206 (virtualized real and unpaged mode) on capable 1210 (virtualized real and unpaged mode) on capable
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index b0e4b9cd6a66..7945b0bd35e2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -175,10 +175,30 @@ Parameters: vcpu id (apic id on x86)
175Returns: vcpu fd on success, -1 on error 175Returns: vcpu fd on success, -1 on error
176 176
177This API adds a vcpu to a virtual machine. The vcpu id is a small integer 177This API adds a vcpu to a virtual machine. The vcpu id is a small integer
178in the range [0, max_vcpus). You can use KVM_CAP_NR_VCPUS of the 178in the range [0, max_vcpus).
179KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. 179
180The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of
181the KVM_CHECK_EXTENSION ioctl() at run-time.
182The maximum possible value for max_vcpus can be retrieved using the
183KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time.
184
180If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 185If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
181cpus max. 186cpus max.
187If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is
188the same as the value returned from KVM_CAP_NR_VCPUS.
189
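
For illustration only, a minimal userspace sketch of this query (assuming
kernel headers recent enough to define KVM_CAP_MAX_VCPUS, and applying the
fallbacks described above when a capability is absent):

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        int main(void)
        {
                int kvm = open("/dev/kvm", O_RDWR);
                int nr, max;

                if (kvm < 0)
                        return 1;

                /* recommended number of vcpus (<= 0 means the capability is absent) */
                nr = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
                if (nr <= 0)
                        nr = 4;         /* no KVM_CAP_NR_VCPUS: assume 4 */

                /* hard maximum; fall back to the recommended value */
                max = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
                if (max <= 0)
                        max = nr;

                printf("recommended vcpus: %d, maximum vcpus: %d\n", nr, max);
                return 0;
        }
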
190On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
191threads in one or more virtual CPU cores. (This is because the
192hardware requires all the hardware threads in a CPU core to be in the
193same partition.) The KVM_CAP_PPC_SMT capability indicates the number
194of vcpus per virtual core (vcore). The vcore id is obtained by
195dividing the vcpu id by the number of vcpus per vcore. The vcpus in a
196given vcore will always be in the same physical core as each other
197(though that might be a different physical core from time to time).
198Userspace can control the threading (SMT) mode of the guest by its
199allocation of vcpu ids. For example, if userspace wants
200single-threaded guest vcpus, it should make all vcpu ids be a multiple
201of the number of vcpus per vcore.
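
A sketch of that id-allocation scheme, assuming an already-open /dev/kvm fd
(kvmfd), an already-created VM fd (vmfd), and a hypothetical NR_GUEST_CPUS
count:

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        #define NR_GUEST_CPUS 4         /* assumption: a 4-vcpu guest */

        /* One vcpu per vcore: every vcpu id is a multiple of the vcore size,
         * so each vcpu becomes thread 0 of its own vcore. */
        static int create_single_threaded_vcpus(int kvmfd, int vmfd, int *vcpufd)
        {
                int threads = ioctl(kvmfd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
                int i;

                if (threads <= 0)
                        threads = 1;    /* capability absent: ids are dense */

                for (i = 0; i < NR_GUEST_CPUS; i++) {
                        vcpufd[i] = ioctl(vmfd, KVM_CREATE_VCPU, i * threads);
                        if (vcpufd[i] < 0)
                                return -1;
                }
                return 0;
        }
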
182 202
183On powerpc using book3s_hv mode, the vcpus are mapped onto virtual 203On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
184threads in one or more virtual CPU cores. (This is because the 204threads in one or more virtual CPU cores. (This is because the
@@ -1633,3 +1653,50 @@ developer registration required to access it).
1633 char padding[256]; 1653 char padding[256];
1634 }; 1654 };
1635}; 1655};
1656
16576. Capabilities that can be enabled
1658
1659There are certain capabilities that change the behavior of the virtual CPU when
1660enabled. To enable them, please see section 4.37. Below you can find a list of
1661capabilities and what their effect on the vCPU is when enabling them.
1662
1663The following information is provided along with the description:
1664
1665 Architectures: which instruction set architectures provide this ioctl.
1666 x86 includes both i386 and x86_64.
1667
1668 Parameters: what parameters are accepted by the capability.
1669
1670 Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL)
1671 are not detailed, but errors with specific meanings are.
1672
16736.1 KVM_CAP_PPC_OSI
1674
1675Architectures: ppc
1676Parameters: none
1677Returns: 0 on success; -1 on error
1678
1679This capability enables interception of OSI hypercalls that otherwise would
1680be treated as normal system calls to be injected into the guest. OSI hypercalls
1681were invented by Mac-on-Linux to have a standardized communication mechanism
1682between the guest and the host.
1683
1684When this capability is enabled, KVM_EXIT_OSI can occur.
1685
16866.2 KVM_CAP_PPC_PAPR
1687
1688Architectures: ppc
1689Parameters: none
1690Returns: 0 on success; -1 on error
1691
1692This capability enables interception of PAPR hypercalls. PAPR hypercalls are
1693done using the hypercall instruction "sc 1".
1694
1695It also sets the guest privilege level to "supervisor" mode. Usually the guest
1696runs in "hypervisor" privilege mode with a few missing features.
1697
1698In addition to the above, it changes the semantics of SDR1. In this mode, the
1699HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the
1700HTAB invisible to the guest.
1701
1702When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur.
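
As a sketch, enabling one of these capabilities goes through the
KVM_ENABLE_CAP ioctl described in section 4.37; vcpufd below is an assumed,
already-created vcpu file descriptor, and KVM_CAP_PPC_OSI is enabled the same
way:

        #include <string.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static int enable_papr(int vcpufd)
        {
                struct kvm_enable_cap cap;

                memset(&cap, 0, sizeof(cap));
                cap.cap = KVM_CAP_PPC_PAPR;     /* no flags or args needed */

                return ioctl(vcpufd, KVM_ENABLE_CAP, &cap);
        }
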
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a4f6c85431f8..08fe69edcd10 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -149,6 +149,12 @@ struct kvm_regs {
149#define KVM_SREGS_E_UPDATE_DBSR (1 << 3) 149#define KVM_SREGS_E_UPDATE_DBSR (1 << 3)
150 150
151/* 151/*
152 * Book3S special bits to indicate contents in the struct by maintaining
153 * backwards compatibility with older structs. If adding a new field,
154 * please make sure to add a flag for that new field */
155#define KVM_SREGS_S_HIOR (1 << 0)
156
157/*
152 * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a 158 * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
153 * previous KVM_GET_REGS. 159 * previous KVM_GET_REGS.
154 * 160 *
@@ -173,6 +179,8 @@ struct kvm_sregs {
173 __u64 ibat[8]; 179 __u64 ibat[8];
174 __u64 dbat[8]; 180 __u64 dbat[8];
175 } ppc32; 181 } ppc32;
182 __u64 flags; /* KVM_SREGS_S_ */
183 __u64 hior;
176 } s; 184 } s;
177 struct { 185 struct {
178 union { 186 union {
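
A sketch of how userspace is expected to use the new flag and field; the
u.s member path follows the struct layout above, and vcpufd is an assumed
vcpu file descriptor:

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static int set_hior(int vcpufd, __u64 hior)
        {
                struct kvm_sregs sregs;

                if (ioctl(vcpufd, KVM_GET_SREGS, &sregs) < 0)
                        return -1;

                /* the flag tells the kernel that the hior field is valid */
                sregs.u.s.flags |= KVM_SREGS_S_HIOR;
                sregs.u.s.hior = hior;

                return ioctl(vcpufd, KVM_SET_SREGS, &sregs);
        }
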
@@ -276,6 +284,11 @@ struct kvm_guest_debug_arch {
276#define KVM_INTERRUPT_UNSET -2U 284#define KVM_INTERRUPT_UNSET -2U
277#define KVM_INTERRUPT_SET_LEVEL -3U 285#define KVM_INTERRUPT_SET_LEVEL -3U
278 286
287#define KVM_CPU_440 1
288#define KVM_CPU_E500V2 2
289#define KVM_CPU_3S_32 3
290#define KVM_CPU_3S_64 4
291
279/* for KVM_CAP_SPAPR_TCE */ 292/* for KVM_CAP_SPAPR_TCE */
280struct kvm_create_spapr_tce { 293struct kvm_create_spapr_tce {
281 __u64 liobn; 294 __u64 liobn;
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 98da010252a3..a384ffdf33de 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
90#endif 90#endif
91 int context_id[SID_CONTEXTS]; 91 int context_id[SID_CONTEXTS];
92 92
93 bool hior_sregs; /* HIOR is set by SREGS, not PVR */
94
93 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 95 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
94 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; 96 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
95 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 97 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -139,15 +141,14 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
139extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 141extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
140extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 142extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
141 143
142extern void kvmppc_handler_lowmem_trampoline(void); 144extern void kvmppc_entry_trampoline(void);
143extern void kvmppc_handler_trampoline_enter(void);
144extern void kvmppc_rmcall(ulong srr0, ulong srr1);
145extern void kvmppc_hv_entry_trampoline(void); 145extern void kvmppc_hv_entry_trampoline(void);
146extern void kvmppc_load_up_fpu(void); 146extern void kvmppc_load_up_fpu(void);
147extern void kvmppc_load_up_altivec(void); 147extern void kvmppc_load_up_altivec(void);
148extern void kvmppc_load_up_vsx(void); 148extern void kvmppc_load_up_vsx(void);
149extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst); 149extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
150extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst); 150extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
151extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
151 152
152static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) 153static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
153{ 154{
@@ -382,6 +383,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
382} 383}
383#endif 384#endif
384 385
386static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
387 unsigned long pte_index)
388{
389 unsigned long rb, va_low;
390
391 rb = (v & ~0x7fUL) << 16; /* AVA field */
392 va_low = pte_index >> 3;
393 if (v & HPTE_V_SECONDARY)
394 va_low = ~va_low;
395 /* xor vsid from AVA */
396 if (!(v & HPTE_V_1TB_SEG))
397 va_low ^= v >> 12;
398 else
399 va_low ^= v >> 24;
400 va_low &= 0x7ff;
401 if (v & HPTE_V_LARGE) {
402 rb |= 1; /* L field */
403 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
404 (r & 0xff000)) {
405 /* non-16MB large page, must be 64k */
406 /* (masks depend on page size) */
407 rb |= 0x1000; /* page encoding in LP field */
408 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
409 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
410 }
411 } else {
412 /* 4kB page */
413 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
414 }
415 rb |= (v >> 54) & 0x300; /* B field */
416 return rb;
417}
418
385/* Magic register values loaded into r3 and r4 before the 'sc' assembly 419/* Magic register values loaded into r3 and r4 before the 'sc' assembly
386 * instruction for the OSI hypercalls */ 420 * instruction for the OSI hypercalls */
387#define OSI_SC_MAGIC_R3 0x113724FA 421#define OSI_SC_MAGIC_R3 0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index ef7b3688c3b6..1f2f5b6156bd 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -75,6 +75,8 @@ struct kvmppc_host_state {
75 ulong scratch0; 75 ulong scratch0;
76 ulong scratch1; 76 ulong scratch1;
77 u8 in_guest; 77 u8 in_guest;
78 u8 restore_hid5;
79 u8 napping;
78 80
79#ifdef CONFIG_KVM_BOOK3S_64_HV 81#ifdef CONFIG_KVM_BOOK3S_64_HV
80 struct kvm_vcpu *kvm_vcpu; 82 struct kvm_vcpu *kvm_vcpu;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index cc22b282d755..bf8af5d5d5dc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -198,21 +198,29 @@ struct kvm_arch {
198 */ 198 */
199struct kvmppc_vcore { 199struct kvmppc_vcore {
200 int n_runnable; 200 int n_runnable;
201 int n_blocked; 201 int n_busy;
202 int num_threads; 202 int num_threads;
203 int entry_exit_count; 203 int entry_exit_count;
204 int n_woken; 204 int n_woken;
205 int nap_count; 205 int nap_count;
206 int napping_threads;
206 u16 pcpu; 207 u16 pcpu;
207 u8 vcore_running; 208 u8 vcore_state;
208 u8 in_guest; 209 u8 in_guest;
209 struct list_head runnable_threads; 210 struct list_head runnable_threads;
210 spinlock_t lock; 211 spinlock_t lock;
212 wait_queue_head_t wq;
211}; 213};
212 214
213#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) 215#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff)
214#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) 216#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8)
215 217
218/* Values for vcore_state */
219#define VCORE_INACTIVE 0
220#define VCORE_RUNNING 1
221#define VCORE_EXITING 2
222#define VCORE_SLEEPING 3
223
216struct kvmppc_pte { 224struct kvmppc_pte {
217 ulong eaddr; 225 ulong eaddr;
218 u64 vpage; 226 u64 vpage;
@@ -258,14 +266,6 @@ struct kvm_vcpu_arch {
258 ulong host_stack; 266 ulong host_stack;
259 u32 host_pid; 267 u32 host_pid;
260#ifdef CONFIG_PPC_BOOK3S 268#ifdef CONFIG_PPC_BOOK3S
261 ulong host_msr;
262 ulong host_r2;
263 void *host_retip;
264 ulong trampoline_lowmem;
265 ulong trampoline_enter;
266 ulong highmem_handler;
267 ulong rmcall;
268 ulong host_paca_phys;
269 struct kvmppc_slb slb[64]; 269 struct kvmppc_slb slb[64];
270 int slb_max; /* 1 + index of last valid entry in slb[] */ 270 int slb_max; /* 1 + index of last valid entry in slb[] */
271 int slb_nr; /* total number of entries in SLB */ 271 int slb_nr; /* total number of entries in SLB */
@@ -389,6 +389,9 @@ struct kvm_vcpu_arch {
389 u8 dcr_is_write; 389 u8 dcr_is_write;
390 u8 osi_needed; 390 u8 osi_needed;
391 u8 osi_enabled; 391 u8 osi_enabled;
392 u8 papr_enabled;
393 u8 sane;
394 u8 cpu_type;
392 u8 hcall_needed; 395 u8 hcall_needed;
393 396
394 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 397 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -408,11 +411,13 @@ struct kvm_vcpu_arch {
408 struct dtl *dtl; 411 struct dtl *dtl;
409 struct dtl *dtl_end; 412 struct dtl *dtl_end;
410 413
414 wait_queue_head_t *wqp;
411 struct kvmppc_vcore *vcore; 415 struct kvmppc_vcore *vcore;
412 int ret; 416 int ret;
413 int trap; 417 int trap;
414 int state; 418 int state;
415 int ptid; 419 int ptid;
420 bool timer_running;
416 wait_queue_head_t cpu_run; 421 wait_queue_head_t cpu_run;
417 422
418 struct kvm_vcpu_arch_shared *shared; 423 struct kvm_vcpu_arch_shared *shared;
@@ -428,8 +433,9 @@ struct kvm_vcpu_arch {
428#endif 433#endif
429}; 434};
430 435
431#define KVMPPC_VCPU_BUSY_IN_HOST 0 436/* Values for vcpu->arch.state */
432#define KVMPPC_VCPU_BLOCKED 1 437#define KVMPPC_VCPU_STOPPED 0
438#define KVMPPC_VCPU_BUSY_IN_HOST 1
433#define KVMPPC_VCPU_RUNNABLE 2 439#define KVMPPC_VCPU_RUNNABLE 2
434 440
435#endif /* __POWERPC_KVM_HOST_H__ */ 441#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index d121f49d62b8..46efd1a265c9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); 66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); 67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
69extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
69 70
70/* Core-specific hooks */ 71/* Core-specific hooks */
71 72
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5f078bc2063e..69f7ffe7f674 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -44,6 +44,7 @@
44#include <asm/compat.h> 44#include <asm/compat.h>
45#include <asm/mmu.h> 45#include <asm/mmu.h>
46#include <asm/hvcall.h> 46#include <asm/hvcall.h>
47#include <asm/xics.h>
47#endif 48#endif
48#ifdef CONFIG_PPC_ISERIES 49#ifdef CONFIG_PPC_ISERIES
49#include <asm/iseries/alpaca.h> 50#include <asm/iseries/alpaca.h>
@@ -449,8 +450,6 @@ int main(void)
449#ifdef CONFIG_PPC_BOOK3S 450#ifdef CONFIG_PPC_BOOK3S
450 DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); 451 DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
451 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); 452 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
452 DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
453 DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
454 DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); 453 DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
455 DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); 454 DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
456 DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); 455 DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
@@ -458,14 +457,12 @@ int main(void)
458 DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); 457 DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
459 DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); 458 DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
460 DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); 459 DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
461 DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
462 DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
463 DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
464 DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
465 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); 460 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
466 DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); 461 DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
467 DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); 462 DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
468 DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); 463 DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
464 DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
465 DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
469 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); 466 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
470 DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); 467 DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
471 DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); 468 DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
@@ -481,6 +478,7 @@ int main(void)
481 DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); 478 DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
482 DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); 479 DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
483 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); 480 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
481 DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
484 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - 482 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
485 offsetof(struct kvmppc_vcpu_book3s, vcpu)); 483 offsetof(struct kvmppc_vcpu_book3s, vcpu));
486 DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); 484 DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -537,6 +535,8 @@ int main(void)
537 HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); 535 HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
538 HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); 536 HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
539 HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); 537 HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
538 HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
539 HSTATE_FIELD(HSTATE_NAPPING, napping);
540 540
541#ifdef CONFIG_KVM_BOOK3S_64_HV 541#ifdef CONFIG_KVM_BOOK3S_64_HV
542 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); 542 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
@@ -549,6 +549,7 @@ int main(void)
549 HSTATE_FIELD(HSTATE_DSCR, host_dscr); 549 HSTATE_FIELD(HSTATE_DSCR, host_dscr);
550 HSTATE_FIELD(HSTATE_DABR, dabr); 550 HSTATE_FIELD(HSTATE_DABR, dabr);
551 HSTATE_FIELD(HSTATE_DECEXP, dec_expires); 551 HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
552 DEFINE(IPI_PRIORITY, IPI_PRIORITY);
552#endif /* CONFIG_KVM_BOOK3S_64_HV */ 553#endif /* CONFIG_KVM_BOOK3S_64_HV */
553 554
554#else /* CONFIG_PPC_BOOK3S */ 555#else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 41b02c792aa3..29ddd8b1c274 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -427,16 +427,6 @@ slb_miss_user_pseries:
427 b . /* prevent spec. execution */ 427 b . /* prevent spec. execution */
428#endif /* __DISABLED__ */ 428#endif /* __DISABLED__ */
429 429
430/* KVM's trampoline code needs to be close to the interrupt handlers */
431
432#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
433#ifdef CONFIG_KVM_BOOK3S_PR
434#include "../kvm/book3s_rmhandlers.S"
435#else
436#include "../kvm/book3s_hv_rmhandlers.S"
437#endif
438#endif
439
440 .align 7 430 .align 7
441 .globl __end_interrupts 431 .globl __end_interrupts
442__end_interrupts: 432__end_interrupts:
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index da3a1225c0ac..ca1f88b3dc59 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
78 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) 78 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
79 vcpu_44x->shadow_refs[i].gtlb_index = -1; 79 vcpu_44x->shadow_refs[i].gtlb_index = -1;
80 80
81 vcpu->arch.cpu_type = KVM_CPU_440;
82
81 return 0; 83 return 0;
82} 84}
83 85
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 08428e2c188d..3688aeecc4b2 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -43,18 +43,22 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
43 fpu.o \ 43 fpu.o \
44 book3s_paired_singles.o \ 44 book3s_paired_singles.o \
45 book3s_pr.o \ 45 book3s_pr.o \
46 book3s_pr_papr.o \
46 book3s_emulate.o \ 47 book3s_emulate.o \
47 book3s_interrupts.o \ 48 book3s_interrupts.o \
48 book3s_mmu_hpte.o \ 49 book3s_mmu_hpte.o \
49 book3s_64_mmu_host.o \ 50 book3s_64_mmu_host.o \
50 book3s_64_mmu.o \ 51 book3s_64_mmu.o \
51 book3s_32_mmu.o 52 book3s_32_mmu.o
53kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
54 book3s_rmhandlers.o
52 55
53kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 56kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
54 book3s_hv.o \ 57 book3s_hv.o \
55 book3s_hv_interrupts.o \ 58 book3s_hv_interrupts.o \
56 book3s_64_mmu_hv.o 59 book3s_64_mmu_hv.o
57kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 60kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
61 book3s_hv_rmhandlers.o \
58 book3s_hv_rm_mmu.o \ 62 book3s_hv_rm_mmu.o \
59 book3s_64_vio_hv.o \ 63 book3s_64_vio_hv.o \
60 book3s_hv_builtin.o 64 book3s_hv_builtin.o
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index 3608471ad2d8..7e06a6fc8d07 100644
--- a/arch/powerpc/kvm/book3s_32_sr.S
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -31,7 +31,7 @@
31 * R1 = host R1 31 * R1 = host R1
32 * R2 = host R2 32 * R2 = host R2
33 * R3 = shadow vcpu 33 * R3 = shadow vcpu
34 * all other volatile GPRS = free 34 * all other volatile GPRS = free except R4, R6
35 * SVCPU[CR] = guest CR 35 * SVCPU[CR] = guest CR
36 * SVCPU[XER] = guest XER 36 * SVCPU[XER] = guest XER
37 * SVCPU[CTR] = guest CTR 37 * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index c6d3e194b6b4..b871721c0050 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
128 dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", 128 dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
129 page, vcpu_book3s->sdr1, pteg, slbe->vsid); 129 page, vcpu_book3s->sdr1, pteg, slbe->vsid);
130 130
131 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); 131 /* When running a PAPR guest, SDR1 contains a HVA address instead
132 of a GPA */
133 if (vcpu_book3s->vcpu.arch.papr_enabled)
134 r = pteg;
135 else
136 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
137
132 if (kvm_is_error_hva(r)) 138 if (kvm_is_error_hva(r))
133 return r; 139 return r;
134 return r | (pteg & ~PAGE_MASK); 140 return r | (pteg & ~PAGE_MASK);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 04e7d3bbfe8b..f2e6e48ea463 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -53,7 +53,7 @@ slb_exit_skip_ ## num:
53 * R1 = host R1 53 * R1 = host R1
54 * R2 = host R2 54 * R2 = host R2
55 * R3 = shadow vcpu 55 * R3 = shadow vcpu
56 * all other volatile GPRS = free 56 * all other volatile GPRS = free except R4, R6
57 * SVCPU[CR] = guest CR 57 * SVCPU[CR] = guest CR
58 * SVCPU[XER] = guest XER 58 * SVCPU[XER] = guest XER
59 * SVCPU[CTR] = guest CTR 59 * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 466846557089..0c9dc62532d0 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -63,6 +63,25 @@
63 * function pointers, so let's just disable the define. */ 63 * function pointers, so let's just disable the define. */
64#undef mfsrin 64#undef mfsrin
65 65
66enum priv_level {
67 PRIV_PROBLEM = 0,
68 PRIV_SUPER = 1,
69 PRIV_HYPER = 2,
70};
71
72static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
73{
74 /* PAPR VMs only access supervisor SPRs */
75 if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
76 return false;
77
78 /* Limit user space to its own small SPR set */
79 if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM)
80 return false;
81
82 return true;
83}
84
66int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 85int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
67 unsigned int inst, int *advance) 86 unsigned int inst, int *advance)
68{ 87{
@@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
296 315
297 switch (sprn) { 316 switch (sprn) {
298 case SPRN_SDR1: 317 case SPRN_SDR1:
318 if (!spr_allowed(vcpu, PRIV_HYPER))
319 goto unprivileged;
299 to_book3s(vcpu)->sdr1 = spr_val; 320 to_book3s(vcpu)->sdr1 = spr_val;
300 break; 321 break;
301 case SPRN_DSISR: 322 case SPRN_DSISR:
@@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
390 case SPRN_PMC4_GEKKO: 411 case SPRN_PMC4_GEKKO:
391 case SPRN_WPAR_GEKKO: 412 case SPRN_WPAR_GEKKO:
392 break; 413 break;
414unprivileged:
393 default: 415 default:
394 printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); 416 printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
395#ifndef DEBUG_SPR 417#ifndef DEBUG_SPR
@@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
421 break; 443 break;
422 } 444 }
423 case SPRN_SDR1: 445 case SPRN_SDR1:
446 if (!spr_allowed(vcpu, PRIV_HYPER))
447 goto unprivileged;
424 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); 448 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
425 break; 449 break;
426 case SPRN_DSISR: 450 case SPRN_DSISR:
@@ -449,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
449 case SPRN_HID5: 473 case SPRN_HID5:
450 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); 474 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
451 break; 475 break;
476 case SPRN_CFAR:
477 case SPRN_PURR:
478 kvmppc_set_gpr(vcpu, rt, 0);
479 break;
452 case SPRN_GQR0: 480 case SPRN_GQR0:
453 case SPRN_GQR1: 481 case SPRN_GQR1:
454 case SPRN_GQR2: 482 case SPRN_GQR2:
@@ -476,6 +504,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
476 kvmppc_set_gpr(vcpu, rt, 0); 504 kvmppc_set_gpr(vcpu, rt, 0);
477 break; 505 break;
478 default: 506 default:
507unprivileged:
479 printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); 508 printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
480#ifndef DEBUG_SPR 509#ifndef DEBUG_SPR
481 emulated = EMULATE_FAIL; 510 emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 88c8f26add02..f7f63a00ab1f 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -23,9 +23,7 @@
23#ifdef CONFIG_KVM_BOOK3S_64_HV 23#ifdef CONFIG_KVM_BOOK3S_64_HV
24EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); 24EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
25#else 25#else
26EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter); 26EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
27EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
28EXPORT_SYMBOL_GPL(kvmppc_rmcall);
29EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); 27EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
30#ifdef CONFIG_ALTIVEC 28#ifdef CONFIG_ALTIVEC
31EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); 29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cc0d7f1b19ab..4644c7986d80 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -62,6 +62,8 @@
62/* #define EXIT_DEBUG_SIMPLE */ 62/* #define EXIT_DEBUG_SIMPLE */
63/* #define EXIT_DEBUG_INT */ 63/* #define EXIT_DEBUG_INT */
64 64
65static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
66
65void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 67void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
66{ 68{
67 local_paca->kvm_hstate.kvm_vcpu = vcpu; 69 local_paca->kvm_hstate.kvm_vcpu = vcpu;
@@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
72{ 74{
73} 75}
74 76
75static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
76static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
77
78void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
79{
80 u64 now;
81 unsigned long dec_nsec;
82
83 now = get_tb();
84 if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
85 kvmppc_core_queue_dec(vcpu);
86 if (vcpu->arch.pending_exceptions)
87 return;
88 if (vcpu->arch.dec_expires != ~(u64)0) {
89 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
90 tb_ticks_per_sec;
91 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
92 HRTIMER_MODE_REL);
93 }
94
95 kvmppc_vcpu_blocked(vcpu);
96
97 kvm_vcpu_block(vcpu);
98 vcpu->stat.halt_wakeup++;
99
100 if (vcpu->arch.dec_expires != ~(u64)0)
101 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
102
103 kvmppc_vcpu_unblocked(vcpu);
104}
105
106void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) 77void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
107{ 78{
108 vcpu->arch.shregs.msr = msr; 79 vcpu->arch.shregs.msr = msr;
80 kvmppc_end_cede(vcpu);
109} 81}
110 82
111void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) 83void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
257 229
258 switch (req) { 230 switch (req) {
259 case H_CEDE: 231 case H_CEDE:
260 vcpu->arch.shregs.msr |= MSR_EE;
261 vcpu->arch.ceded = 1;
262 smp_mb();
263 if (!vcpu->arch.prodded)
264 kvmppc_vcpu_block(vcpu);
265 else
266 vcpu->arch.prodded = 0;
267 smp_mb();
268 vcpu->arch.ceded = 0;
269 break; 232 break;
270 case H_PROD: 233 case H_PROD:
271 target = kvmppc_get_gpr(vcpu, 4); 234 target = kvmppc_get_gpr(vcpu, 4);
@@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
388 break; 351 break;
389 } 352 }
390 353
391
392 if (!(r & RESUME_HOST)) {
393 /* To avoid clobbering exit_reason, only check for signals if
394 * we aren't already exiting to userspace for some other
395 * reason. */
396 if (signal_pending(tsk)) {
397 vcpu->stat.signal_exits++;
398 run->exit_reason = KVM_EXIT_INTR;
399 r = -EINTR;
400 } else {
401 kvmppc_core_deliver_interrupts(vcpu);
402 }
403 }
404
405 return r; 354 return r;
406} 355}
407 356
@@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
479 kvmppc_mmu_book3s_hv_init(vcpu); 428 kvmppc_mmu_book3s_hv_init(vcpu);
480 429
481 /* 430 /*
482 * Some vcpus may start out in stopped state. If we initialize 431 * We consider the vcpu stopped until we see the first run ioctl for it.
483 * them to busy-in-host state they will stop other vcpus in the
484 * vcore from running. Instead we initialize them to blocked
485 * state, effectively considering them to be stopped until we
486 * see the first run ioctl for them.
487 */ 432 */
488 vcpu->arch.state = KVMPPC_VCPU_BLOCKED; 433 vcpu->arch.state = KVMPPC_VCPU_STOPPED;
489 434
490 init_waitqueue_head(&vcpu->arch.cpu_run); 435 init_waitqueue_head(&vcpu->arch.cpu_run);
491 436
@@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
496 if (vcore) { 441 if (vcore) {
497 INIT_LIST_HEAD(&vcore->runnable_threads); 442 INIT_LIST_HEAD(&vcore->runnable_threads);
498 spin_lock_init(&vcore->lock); 443 spin_lock_init(&vcore->lock);
444 init_waitqueue_head(&vcore->wq);
499 } 445 }
500 kvm->arch.vcores[core] = vcore; 446 kvm->arch.vcores[core] = vcore;
501 } 447 }
@@ -506,10 +452,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
506 452
507 spin_lock(&vcore->lock); 453 spin_lock(&vcore->lock);
508 ++vcore->num_threads; 454 ++vcore->num_threads;
509 ++vcore->n_blocked;
510 spin_unlock(&vcore->lock); 455 spin_unlock(&vcore->lock);
511 vcpu->arch.vcore = vcore; 456 vcpu->arch.vcore = vcore;
512 457
458 vcpu->arch.cpu_type = KVM_CPU_3S_64;
459 kvmppc_sanity_check(vcpu);
460
513 return vcpu; 461 return vcpu;
514 462
515free_vcpu: 463free_vcpu:
@@ -524,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
524 kfree(vcpu); 472 kfree(vcpu);
525} 473}
526 474
527static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) 475static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
528{ 476{
529 struct kvmppc_vcore *vc = vcpu->arch.vcore; 477 unsigned long dec_nsec, now;
530 478
531 spin_lock(&vc->lock); 479 now = get_tb();
532 vcpu->arch.state = KVMPPC_VCPU_BLOCKED; 480 if (now > vcpu->arch.dec_expires) {
533 ++vc->n_blocked; 481 /* decrementer has already gone negative */
534 if (vc->n_runnable > 0 && 482 kvmppc_core_queue_dec(vcpu);
535 vc->n_runnable + vc->n_blocked == vc->num_threads) { 483 kvmppc_core_deliver_interrupts(vcpu);
536 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, 484 return;
537 arch.run_list);
538 wake_up(&vcpu->arch.cpu_run);
539 } 485 }
540 spin_unlock(&vc->lock); 486 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
487 / tb_ticks_per_sec;
488 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
489 HRTIMER_MODE_REL);
490 vcpu->arch.timer_running = 1;
541} 491}
542 492
543static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) 493static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
544{ 494{
545 struct kvmppc_vcore *vc = vcpu->arch.vcore; 495 vcpu->arch.ceded = 0;
546 496 if (vcpu->arch.timer_running) {
547 spin_lock(&vc->lock); 497 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
548 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 498 vcpu->arch.timer_running = 0;
549 --vc->n_blocked; 499 }
550 spin_unlock(&vc->lock);
551} 500}
552 501
553extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 502extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -562,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
562 return; 511 return;
563 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 512 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
564 --vc->n_runnable; 513 --vc->n_runnable;
514 ++vc->n_busy;
565 /* decrement the physical thread id of each following vcpu */ 515 /* decrement the physical thread id of each following vcpu */
566 v = vcpu; 516 v = vcpu;
567 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) 517 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
@@ -575,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
575 struct paca_struct *tpaca; 525 struct paca_struct *tpaca;
576 struct kvmppc_vcore *vc = vcpu->arch.vcore; 526 struct kvmppc_vcore *vc = vcpu->arch.vcore;
577 527
528 if (vcpu->arch.timer_running) {
529 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
530 vcpu->arch.timer_running = 0;
531 }
578 cpu = vc->pcpu + vcpu->arch.ptid; 532 cpu = vc->pcpu + vcpu->arch.ptid;
579 tpaca = &paca[cpu]; 533 tpaca = &paca[cpu];
580 tpaca->kvm_hstate.kvm_vcpu = vcpu; 534 tpaca->kvm_hstate.kvm_vcpu = vcpu;
581 tpaca->kvm_hstate.kvm_vcore = vc; 535 tpaca->kvm_hstate.kvm_vcore = vc;
536 tpaca->kvm_hstate.napping = 0;
537 vcpu->cpu = vc->pcpu;
582 smp_wmb(); 538 smp_wmb();
583#ifdef CONFIG_PPC_ICP_NATIVE 539#ifdef CONFIG_PPC_ICP_NATIVE
584 if (vcpu->arch.ptid) { 540 if (vcpu->arch.ptid) {
585 tpaca->cpu_start = 0x80; 541 tpaca->cpu_start = 0x80;
586 tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
587 wmb(); 542 wmb();
588 xics_wake_cpu(cpu); 543 xics_wake_cpu(cpu);
589 ++vc->n_woken; 544 ++vc->n_woken;
@@ -631,9 +586,10 @@ static int on_primary_thread(void)
631 */ 586 */
632static int kvmppc_run_core(struct kvmppc_vcore *vc) 587static int kvmppc_run_core(struct kvmppc_vcore *vc)
633{ 588{
634 struct kvm_vcpu *vcpu, *vnext; 589 struct kvm_vcpu *vcpu, *vcpu0, *vnext;
635 long ret; 590 long ret;
636 u64 now; 591 u64 now;
592 int ptid;
637 593
638 /* don't start if any threads have a signal pending */ 594 /* don't start if any threads have a signal pending */
639 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 595 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -652,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
652 goto out; 608 goto out;
653 } 609 }
654 610
611 /*
612 * Assign physical thread IDs, first to non-ceded vcpus
613 * and then to ceded ones.
614 */
615 ptid = 0;
616 vcpu0 = NULL;
617 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
618 if (!vcpu->arch.ceded) {
619 if (!ptid)
620 vcpu0 = vcpu;
621 vcpu->arch.ptid = ptid++;
622 }
623 }
624 if (!vcpu0)
625 return 0; /* nothing to run */
626 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
627 if (vcpu->arch.ceded)
628 vcpu->arch.ptid = ptid++;
629
655 vc->n_woken = 0; 630 vc->n_woken = 0;
656 vc->nap_count = 0; 631 vc->nap_count = 0;
657 vc->entry_exit_count = 0; 632 vc->entry_exit_count = 0;
658 vc->vcore_running = 1; 633 vc->vcore_state = VCORE_RUNNING;
659 vc->in_guest = 0; 634 vc->in_guest = 0;
660 vc->pcpu = smp_processor_id(); 635 vc->pcpu = smp_processor_id();
636 vc->napping_threads = 0;
661 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 637 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
662 kvmppc_start_thread(vcpu); 638 kvmppc_start_thread(vcpu);
663 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
664 arch.run_list);
665 639
640 preempt_disable();
666 spin_unlock(&vc->lock); 641 spin_unlock(&vc->lock);
667 642
668 preempt_disable();
669 kvm_guest_enter(); 643 kvm_guest_enter();
670 __kvmppc_vcore_entry(NULL, vcpu); 644 __kvmppc_vcore_entry(NULL, vcpu0);
671 645
672 /* wait for secondary threads to finish writing their state to memory */
673 spin_lock(&vc->lock); 646 spin_lock(&vc->lock);
647 /* disable sending of IPIs on virtual external irqs */
648 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
649 vcpu->cpu = -1;
650 /* wait for secondary threads to finish writing their state to memory */
674 if (vc->nap_count < vc->n_woken) 651 if (vc->nap_count < vc->n_woken)
675 kvmppc_wait_for_nap(vc); 652 kvmppc_wait_for_nap(vc);
676 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 653 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
677 vc->vcore_running = 2; 654 vc->vcore_state = VCORE_EXITING;
678 spin_unlock(&vc->lock); 655 spin_unlock(&vc->lock);
679 656
680 /* make sure updates to secondary vcpu structs are visible now */ 657 /* make sure updates to secondary vcpu structs are visible now */
@@ -690,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
690 if (now < vcpu->arch.dec_expires && 667 if (now < vcpu->arch.dec_expires &&
691 kvmppc_core_pending_dec(vcpu)) 668 kvmppc_core_pending_dec(vcpu))
692 kvmppc_core_dequeue_dec(vcpu); 669 kvmppc_core_dequeue_dec(vcpu);
693 if (!vcpu->arch.trap) { 670
694 if (signal_pending(vcpu->arch.run_task)) { 671 ret = RESUME_GUEST;
695 vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; 672 if (vcpu->arch.trap)
696 vcpu->arch.ret = -EINTR; 673 ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
697 } 674 vcpu->arch.run_task);
698 continue; /* didn't get to run */ 675
699 }
700 ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
701 vcpu->arch.run_task);
702 vcpu->arch.ret = ret; 676 vcpu->arch.ret = ret;
703 vcpu->arch.trap = 0; 677 vcpu->arch.trap = 0;
678
679 if (vcpu->arch.ceded) {
680 if (ret != RESUME_GUEST)
681 kvmppc_end_cede(vcpu);
682 else
683 kvmppc_set_timer(vcpu);
684 }
704 } 685 }
705 686
706 spin_lock(&vc->lock); 687 spin_lock(&vc->lock);
707 out: 688 out:
708 vc->vcore_running = 0; 689 vc->vcore_state = VCORE_INACTIVE;
709 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 690 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
710 arch.run_list) { 691 arch.run_list) {
711 if (vcpu->arch.ret != RESUME_GUEST) { 692 if (vcpu->arch.ret != RESUME_GUEST) {
@@ -717,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
717 return 1; 698 return 1;
718} 699}
719 700
720static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 701/*
702 * Wait for some other vcpu thread to execute us, and
703 * wake us up when we need to handle something in the host.
704 */
705static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
721{ 706{
722 int ptid;
723 int wait_state;
724 struct kvmppc_vcore *vc;
725 DEFINE_WAIT(wait); 707 DEFINE_WAIT(wait);
726 708
727 /* No need to go into the guest when all we do is going out */ 709 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
728 if (signal_pending(current)) { 710 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
729 kvm_run->exit_reason = KVM_EXIT_INTR; 711 schedule();
730 return -EINTR; 712 finish_wait(&vcpu->arch.cpu_run, &wait);
713}
714
715/*
716 * All the vcpus in this vcore are idle, so wait for a decrementer
717 * or external interrupt to one of the vcpus. vc->lock is held.
718 */
719static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
720{
721 DEFINE_WAIT(wait);
722 struct kvm_vcpu *v;
723 int all_idle = 1;
724
725 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
726 vc->vcore_state = VCORE_SLEEPING;
727 spin_unlock(&vc->lock);
728 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
729 if (!v->arch.ceded || v->arch.pending_exceptions) {
730 all_idle = 0;
731 break;
732 }
731 } 733 }
734 if (all_idle)
735 schedule();
736 finish_wait(&vc->wq, &wait);
737 spin_lock(&vc->lock);
738 vc->vcore_state = VCORE_INACTIVE;
739}
732 740
733 /* On PPC970, check that we have an RMA region */ 741static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
734 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) 742{
735 return -EPERM; 743 int n_ceded;
744 int prev_state;
745 struct kvmppc_vcore *vc;
746 struct kvm_vcpu *v, *vn;
736 747
737 kvm_run->exit_reason = 0; 748 kvm_run->exit_reason = 0;
738 vcpu->arch.ret = RESUME_GUEST; 749 vcpu->arch.ret = RESUME_GUEST;
739 vcpu->arch.trap = 0; 750 vcpu->arch.trap = 0;
740 751
741 flush_fp_to_thread(current);
742 flush_altivec_to_thread(current);
743 flush_vsx_to_thread(current);
744
745 /* 752 /*
746 * Synchronize with other threads in this virtual core 753 * Synchronize with other threads in this virtual core
747 */ 754 */
748 vc = vcpu->arch.vcore; 755 vc = vcpu->arch.vcore;
749 spin_lock(&vc->lock); 756 spin_lock(&vc->lock);
750 /* This happens the first time this is called for a vcpu */ 757 vcpu->arch.ceded = 0;
751 if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
752 --vc->n_blocked;
753 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
754 ptid = vc->n_runnable;
755 vcpu->arch.run_task = current; 758 vcpu->arch.run_task = current;
756 vcpu->arch.kvm_run = kvm_run; 759 vcpu->arch.kvm_run = kvm_run;
757 vcpu->arch.ptid = ptid; 760 prev_state = vcpu->arch.state;
761 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
758 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); 762 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
759 ++vc->n_runnable; 763 ++vc->n_runnable;
760 764
761 wait_state = TASK_INTERRUPTIBLE; 765 /*
762 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 766 * This happens the first time this is called for a vcpu.
763 if (signal_pending(current)) { 767 * If the vcore is already running, we may be able to start
764 if (!vc->vcore_running) { 768 * this thread straight away and have it join in.
765 kvm_run->exit_reason = KVM_EXIT_INTR; 769 */
766 vcpu->arch.ret = -EINTR; 770 if (prev_state == KVMPPC_VCPU_STOPPED) {
767 break; 771 if (vc->vcore_state == VCORE_RUNNING &&
768 } 772 VCORE_EXIT_COUNT(vc) == 0) {
769 /* have to wait for vcore to stop executing guest */ 773 vcpu->arch.ptid = vc->n_runnable - 1;
770 wait_state = TASK_UNINTERRUPTIBLE; 774 kvmppc_start_thread(vcpu);
771 smp_send_reschedule(vc->pcpu);
772 } 775 }
773 776
774 if (!vc->vcore_running && 777 } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
775 vc->n_runnable + vc->n_blocked == vc->num_threads) { 778 --vc->n_busy;
776 /* we can run now */
777 if (kvmppc_run_core(vc))
778 continue;
779 }
780 779
781 if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) 780 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
782 kvmppc_start_thread(vcpu); 781 !signal_pending(current)) {
782 if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
783 spin_unlock(&vc->lock);
784 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
785 spin_lock(&vc->lock);
786 continue;
787 }
788 n_ceded = 0;
789 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
790 n_ceded += v->arch.ceded;
791 if (n_ceded == vc->n_runnable)
792 kvmppc_vcore_blocked(vc);
793 else
794 kvmppc_run_core(vc);
795
796 list_for_each_entry_safe(v, vn, &vc->runnable_threads,
797 arch.run_list) {
798 kvmppc_core_deliver_interrupts(v);
799 if (signal_pending(v->arch.run_task)) {
800 kvmppc_remove_runnable(vc, v);
801 v->stat.signal_exits++;
802 v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
803 v->arch.ret = -EINTR;
804 wake_up(&v->arch.cpu_run);
805 }
806 }
807 }
783 808
784 /* wait for other threads to come in, or wait for vcore */ 809 if (signal_pending(current)) {
785 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); 810 if (vc->vcore_state == VCORE_RUNNING ||
786 spin_unlock(&vc->lock); 811 vc->vcore_state == VCORE_EXITING) {
787 schedule(); 812 spin_unlock(&vc->lock);
788 finish_wait(&vcpu->arch.cpu_run, &wait); 813 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
789 spin_lock(&vc->lock); 814 spin_lock(&vc->lock);
815 }
816 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
817 kvmppc_remove_runnable(vc, vcpu);
818 vcpu->stat.signal_exits++;
819 kvm_run->exit_reason = KVM_EXIT_INTR;
820 vcpu->arch.ret = -EINTR;
821 }
790 } 822 }
791 823
792 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
793 kvmppc_remove_runnable(vc, vcpu);
794 spin_unlock(&vc->lock); 824 spin_unlock(&vc->lock);
795
796 return vcpu->arch.ret; 825 return vcpu->arch.ret;
797} 826}
798 827
@@ -800,6 +829,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
800{ 829{
801 int r; 830 int r;
802 831
832 if (!vcpu->arch.sane) {
833 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
834 return -EINVAL;
835 }
836
837 /* No need to go into the guest when all we'll do is come back out */
838 if (signal_pending(current)) {
839 run->exit_reason = KVM_EXIT_INTR;
840 return -EINTR;
841 }
842
843 /* On PPC970, check that we have an RMA region */
844 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
845 return -EPERM;
846
847 flush_fp_to_thread(current);
848 flush_altivec_to_thread(current);
849 flush_vsx_to_thread(current);
850 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
851
803 do { 852 do {
804 r = kvmppc_run_vcpu(run, vcpu); 853 r = kvmppc_run_vcpu(run, vcpu);
805 854
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fcfe6b055558..bacb0cfa3602 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -110,39 +110,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
110 return H_SUCCESS; 110 return H_SUCCESS;
111} 111}
112 112
113static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
114 unsigned long pte_index)
115{
116 unsigned long rb, va_low;
117
118 rb = (v & ~0x7fUL) << 16; /* AVA field */
119 va_low = pte_index >> 3;
120 if (v & HPTE_V_SECONDARY)
121 va_low = ~va_low;
122 /* xor vsid from AVA */
123 if (!(v & HPTE_V_1TB_SEG))
124 va_low ^= v >> 12;
125 else
126 va_low ^= v >> 24;
127 va_low &= 0x7ff;
128 if (v & HPTE_V_LARGE) {
129 rb |= 1; /* L field */
130 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
131 (r & 0xff000)) {
132 /* non-16MB large page, must be 64k */
133 /* (masks depend on page size) */
134 rb |= 0x1000; /* page encoding in LP field */
135 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
136 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
137 }
138 } else {
139 /* 4kB page */
140 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
141 }
142 rb |= (v >> 54) & 0x300; /* B field */
143 return rb;
144}
145
146#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 113#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
147 114
148static inline int try_lock_tlbie(unsigned int *lock) 115static inline int try_lock_tlbie(unsigned int *lock)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index de2950135e6e..f422231d9235 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -20,7 +20,10 @@
20#include <asm/ppc_asm.h> 20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
22#include <asm/reg.h> 22#include <asm/reg.h>
23#include <asm/mmu.h>
23#include <asm/page.h> 24#include <asm/page.h>
25#include <asm/ptrace.h>
26#include <asm/hvcall.h>
24#include <asm/asm-offsets.h> 27#include <asm/asm-offsets.h>
25#include <asm/exception-64s.h> 28#include <asm/exception-64s.h>
26 29
@@ -49,7 +52,7 @@ kvmppc_skip_Hinterrupt:
49 b . 52 b .
50 53
51/* 54/*
52 * Call kvmppc_handler_trampoline_enter in real mode. 55 * Call kvmppc_hv_entry in real mode.
53 * Must be called with interrupts hard-disabled. 56 * Must be called with interrupts hard-disabled.
54 * 57 *
55 * Input Registers: 58 * Input Registers:
@@ -89,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
89kvm_start_guest: 92kvm_start_guest:
90 ld r1,PACAEMERGSP(r13) 93 ld r1,PACAEMERGSP(r13)
91 subi r1,r1,STACK_FRAME_OVERHEAD 94 subi r1,r1,STACK_FRAME_OVERHEAD
95 ld r2,PACATOC(r13)
96
97 /* were we napping due to cede? */
98 lbz r0,HSTATE_NAPPING(r13)
99 cmpwi r0,0
100 bne kvm_end_cede
92 101
93 /* get vcpu pointer */ 102 /* get vcpu pointer */
94 ld r4, HSTATE_KVM_VCPU(r13) 103 ld r4, HSTATE_KVM_VCPU(r13)
@@ -276,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
276 cmpwi r0,0 285 cmpwi r0,0
277 beq 20b 286 beq 20b
278 287
279 /* Set LPCR. Set the MER bit if there is a pending external irq. */ 288 /* Set LPCR and RMOR. */
28010: ld r8,KVM_LPCR(r9) 28910: ld r8,KVM_LPCR(r9)
281 ld r0,VCPU_PENDING_EXC(r4) 290 mtspr SPRN_LPCR,r8
282 li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
283 oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
284 and. r0,r0,r7
285 beq 11f
286 ori r8,r8,LPCR_MER
28711: mtspr SPRN_LPCR,r8
288 ld r8,KVM_RMOR(r9) 291 ld r8,KVM_RMOR(r9)
289 mtspr SPRN_RMOR,r8 292 mtspr SPRN_RMOR,r8
290 isync 293 isync
@@ -448,19 +451,50 @@ toc_tlbie_lock:
448 mtctr r6 451 mtctr r6
449 mtxer r7 452 mtxer r7
450 453
451 /* Move SRR0 and SRR1 into the respective regs */ 454kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
452 ld r6, VCPU_SRR0(r4) 455 ld r6, VCPU_SRR0(r4)
453 ld r7, VCPU_SRR1(r4) 456 ld r7, VCPU_SRR1(r4)
454 mtspr SPRN_SRR0, r6
455 mtspr SPRN_SRR1, r7
456
457 ld r10, VCPU_PC(r4) 457 ld r10, VCPU_PC(r4)
458 ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */
458 459
459 ld r11, VCPU_MSR(r4) /* r10 = vcpu->arch.msr & ~MSR_HV */
460 rldicl r11, r11, 63 - MSR_HV_LG, 1 460 rldicl r11, r11, 63 - MSR_HV_LG, 1
461 rotldi r11, r11, 1 + MSR_HV_LG 461 rotldi r11, r11, 1 + MSR_HV_LG
462 ori r11, r11, MSR_ME 462 ori r11, r11, MSR_ME
463 463
464 /* Check if we can deliver an external or decrementer interrupt now */
465 ld r0,VCPU_PENDING_EXC(r4)
466 li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
467 oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
468 and r0,r0,r8
469 cmpdi cr1,r0,0
470 andi. r0,r11,MSR_EE
471 beq cr1,11f
472BEGIN_FTR_SECTION
473 mfspr r8,SPRN_LPCR
474 ori r8,r8,LPCR_MER
475 mtspr SPRN_LPCR,r8
476 isync
477END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
478 beq 5f
479 li r0,BOOK3S_INTERRUPT_EXTERNAL
48012: mr r6,r10
481 mr r10,r0
482 mr r7,r11
483 li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
484 rotldi r11,r11,63
485 b 5f
48611: beq 5f
487 mfspr r0,SPRN_DEC
488 cmpwi r0,0
489 li r0,BOOK3S_INTERRUPT_DECREMENTER
490 blt 12b
491
492 /* Move SRR0 and SRR1 into the respective regs */
4935: mtspr SPRN_SRR0, r6
494 mtspr SPRN_SRR1, r7
495 li r0,0
496 stb r0,VCPU_CEDED(r4) /* cancel cede */
497
464fast_guest_return: 498fast_guest_return:
465 mtspr SPRN_HSRR0,r10 499 mtspr SPRN_HSRR0,r10
466 mtspr SPRN_HSRR1,r11 500 mtspr SPRN_HSRR1,r11
@@ -574,21 +608,20 @@ kvmppc_interrupt:
574 /* See if this is something we can handle in real mode */ 608 /* See if this is something we can handle in real mode */
575 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 609 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
576 beq hcall_try_real_mode 610 beq hcall_try_real_mode
577hcall_real_cont:
578 611
579 /* Check for mediated interrupts (could be done earlier really ...) */ 612 /* Check for mediated interrupts (could be done earlier really ...) */
580BEGIN_FTR_SECTION 613BEGIN_FTR_SECTION
581 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL 614 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL
582 bne+ 1f 615 bne+ 1f
583 ld r5,VCPU_KVM(r9)
584 ld r5,KVM_LPCR(r5)
585 andi. r0,r11,MSR_EE 616 andi. r0,r11,MSR_EE
586 beq 1f 617 beq 1f
618 mfspr r5,SPRN_LPCR
587 andi. r0,r5,LPCR_MER 619 andi. r0,r5,LPCR_MER
588 bne bounce_ext_interrupt 620 bne bounce_ext_interrupt
5891: 6211:
590END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 622END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
591 623
624hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
592 /* Save DEC */ 625 /* Save DEC */
593 mfspr r5,SPRN_DEC 626 mfspr r5,SPRN_DEC
594 mftb r6 627 mftb r6
@@ -682,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
682 slbia 715 slbia
683 ptesync 716 ptesync
684 717
685hdec_soon: 718hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */
686BEGIN_FTR_SECTION 719BEGIN_FTR_SECTION
687 b 32f 720 b 32f
688END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 721END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
@@ -700,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
700 addi r0,r3,0x100 733 addi r0,r3,0x100
701 stwcx. r0,0,r6 734 stwcx. r0,0,r6
702 bne 41b 735 bne 41b
736 lwsync
703 737
704 /* 738 /*
705 * At this point we have an interrupt that we have to pass 739 * At this point we have an interrupt that we have to pass
@@ -713,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
713 * interrupt, since the other threads will already be on their 747 * interrupt, since the other threads will already be on their
714 * way here in that case. 748 * way here in that case.
715 */ 749 */
750 cmpwi r3,0x100 /* Are we the first here? */
751 bge 43f
752 cmpwi r3,1 /* Are any other threads in the guest? */
753 ble 43f
716 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 754 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
717 beq 40f 755 beq 40f
718 cmpwi r3,0x100 /* Are we the first here? */
719 bge 40f
720 cmpwi r3,1
721 ble 40f
722 li r0,0 756 li r0,0
723 mtspr SPRN_HDEC,r0 757 mtspr SPRN_HDEC,r0
72440: 75840:
759 /*
760 * Send an IPI to any napping threads, since an HDEC interrupt
761 * doesn't wake CPUs up from nap.
762 */
763 lwz r3,VCORE_NAPPING_THREADS(r5)
764 lwz r4,VCPU_PTID(r9)
765 li r0,1
766 sldi r0,r0,r4
767 andc. r3,r3,r0 /* no sense IPI'ing ourselves */
768 beq 43f
769 mulli r4,r4,PACA_SIZE /* get paca for thread 0 */
770 subf r6,r4,r13
77142: andi. r0,r3,1
772 beq 44f
773 ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
774 li r0,IPI_PRIORITY
775 li r7,XICS_QIRR
776 stbcix r0,r7,r8 /* trigger the IPI */
77744: srdi. r3,r3,1
778 addi r6,r6,PACA_SIZE
779 bne 42b
725 780
726 /* Secondary threads wait for primary to do partition switch */ 781 /* Secondary threads wait for primary to do partition switch */
727 ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ 78243: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */
728 ld r5,HSTATE_KVM_VCORE(r13) 783 ld r5,HSTATE_KVM_VCORE(r13)
729 lwz r3,VCPU_PTID(r9) 784 lwz r3,VCPU_PTID(r9)
730 cmpwi r3,0 785 cmpwi r3,0
@@ -1077,7 +1132,6 @@ hcall_try_real_mode:
1077hcall_real_fallback: 1132hcall_real_fallback:
1078 li r12,BOOK3S_INTERRUPT_SYSCALL 1133 li r12,BOOK3S_INTERRUPT_SYSCALL
1079 ld r9, HSTATE_KVM_VCPU(r13) 1134 ld r9, HSTATE_KVM_VCPU(r13)
1080 ld r11, VCPU_MSR(r9)
1081 1135
1082 b hcall_real_cont 1136 b hcall_real_cont
1083 1137
@@ -1139,7 +1193,7 @@ hcall_real_table:
1139 .long 0 /* 0xd4 */ 1193 .long 0 /* 0xd4 */
1140 .long 0 /* 0xd8 */ 1194 .long 0 /* 0xd8 */
1141 .long 0 /* 0xdc */ 1195 .long 0 /* 0xdc */
1142 .long 0 /* 0xe0 */ 1196 .long .kvmppc_h_cede - hcall_real_table
1143 .long 0 /* 0xe4 */ 1197 .long 0 /* 0xe4 */
1144 .long 0 /* 0xe8 */ 1198 .long 0 /* 0xe8 */
1145 .long 0 /* 0xec */ 1199 .long 0 /* 0xec */
@@ -1168,7 +1222,8 @@ bounce_ext_interrupt:
1168 mtspr SPRN_SRR0,r10 1222 mtspr SPRN_SRR0,r10
1169 mtspr SPRN_SRR1,r11 1223 mtspr SPRN_SRR1,r11
1170 li r10,BOOK3S_INTERRUPT_EXTERNAL 1224 li r10,BOOK3S_INTERRUPT_EXTERNAL
1171 LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); 1225 li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1226 rotldi r11,r11,63
1172 b fast_guest_return 1227 b fast_guest_return
1173 1228
1174_GLOBAL(kvmppc_h_set_dabr) 1229_GLOBAL(kvmppc_h_set_dabr)
@@ -1177,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr)
1177 li r3,0 1232 li r3,0
1178 blr 1233 blr
1179 1234
1235_GLOBAL(kvmppc_h_cede)
1236 ori r11,r11,MSR_EE
1237 std r11,VCPU_MSR(r3)
1238 li r0,1
1239 stb r0,VCPU_CEDED(r3)
1240 sync /* order setting ceded vs. testing prodded */
1241 lbz r5,VCPU_PRODDED(r3)
1242 cmpwi r5,0
1243 bne 1f
1244 li r0,0 /* set trap to 0 to say hcall is handled */
1245 stw r0,VCPU_TRAP(r3)
1246 li r0,H_SUCCESS
1247 std r0,VCPU_GPR(r3)(r3)
1248BEGIN_FTR_SECTION
1249 b 2f /* just send it up to host on 970 */
1250END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
1251
1252 /*
1253 * Set our bit in the bitmask of napping threads unless all the
1254 * other threads are already napping, in which case we send this
1255 * up to the host.
1256 */
1257 ld r5,HSTATE_KVM_VCORE(r13)
1258 lwz r6,VCPU_PTID(r3)
1259 lwz r8,VCORE_ENTRY_EXIT(r5)
1260 clrldi r8,r8,56
1261 li r0,1
1262 sld r0,r0,r6
1263 addi r6,r5,VCORE_NAPPING_THREADS
126431: lwarx r4,0,r6
1265 or r4,r4,r0
1266 popcntw r7,r4
1267 cmpw r7,r8
1268 bge 2f
1269 stwcx. r4,0,r6
1270 bne 31b
1271 li r0,1
1272 stb r0,HSTATE_NAPPING(r13)
1273 /* order napping_threads update vs testing entry_exit_count */
1274 lwsync
1275 mr r4,r3
1276 lwz r7,VCORE_ENTRY_EXIT(r5)
1277 cmpwi r7,0x100
1278 bge 33f /* another thread already exiting */
1279
1280/*
1281 * Although not specifically required by the architecture, POWER7
1282 * preserves the following registers in nap mode, even if an SMT mode
1283 * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
1284 * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
1285 */
1286 /* Save non-volatile GPRs */
1287 std r14, VCPU_GPR(r14)(r3)
1288 std r15, VCPU_GPR(r15)(r3)
1289 std r16, VCPU_GPR(r16)(r3)
1290 std r17, VCPU_GPR(r17)(r3)
1291 std r18, VCPU_GPR(r18)(r3)
1292 std r19, VCPU_GPR(r19)(r3)
1293 std r20, VCPU_GPR(r20)(r3)
1294 std r21, VCPU_GPR(r21)(r3)
1295 std r22, VCPU_GPR(r22)(r3)
1296 std r23, VCPU_GPR(r23)(r3)
1297 std r24, VCPU_GPR(r24)(r3)
1298 std r25, VCPU_GPR(r25)(r3)
1299 std r26, VCPU_GPR(r26)(r3)
1300 std r27, VCPU_GPR(r27)(r3)
1301 std r28, VCPU_GPR(r28)(r3)
1302 std r29, VCPU_GPR(r29)(r3)
1303 std r30, VCPU_GPR(r30)(r3)
1304 std r31, VCPU_GPR(r31)(r3)
1305
1306 /* save FP state */
1307 bl .kvmppc_save_fp
1308
1309 /*
1310 * Take a nap until a decrementer or external interrupt occurs,
1311 * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
1312 */
1313 li r0,0x80
1314 stb r0,PACAPROCSTART(r13)
1315 mfspr r5,SPRN_LPCR
1316 ori r5,r5,LPCR_PECE0 | LPCR_PECE1
1317 mtspr SPRN_LPCR,r5
1318 isync
1319 li r0, 0
1320 std r0, HSTATE_SCRATCH0(r13)
1321 ptesync
1322 ld r0, HSTATE_SCRATCH0(r13)
13231: cmpd r0, r0
1324 bne 1b
1325 nap
1326 b .
1327
1328kvm_end_cede:
1329 /* Woken by external or decrementer interrupt */
1330 ld r1, HSTATE_HOST_R1(r13)
1331 ld r2, PACATOC(r13)
1332
1333 /* If we're a secondary thread and we got here by an IPI, ack it */
1334 ld r4,HSTATE_KVM_VCPU(r13)
1335 lwz r3,VCPU_PTID(r4)
1336 cmpwi r3,0
1337 beq 27f
1338 mfspr r3,SPRN_SRR1
1339 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
1340 cmpwi r3,4 /* was it an external interrupt? */
1341 bne 27f
1342 ld r5, HSTATE_XICS_PHYS(r13)
1343 li r0,0xff
1344 li r6,XICS_QIRR
1345 li r7,XICS_XIRR
1346 lwzcix r8,r5,r7 /* ack the interrupt */
1347 sync
1348 stbcix r0,r5,r6 /* clear it */
1349 stwcix r8,r5,r7 /* EOI it */
135027:
1351 /* load up FP state */
1352 bl kvmppc_load_fp
1353
1354 /* Load NV GPRS */
1355 ld r14, VCPU_GPR(r14)(r4)
1356 ld r15, VCPU_GPR(r15)(r4)
1357 ld r16, VCPU_GPR(r16)(r4)
1358 ld r17, VCPU_GPR(r17)(r4)
1359 ld r18, VCPU_GPR(r18)(r4)
1360 ld r19, VCPU_GPR(r19)(r4)
1361 ld r20, VCPU_GPR(r20)(r4)
1362 ld r21, VCPU_GPR(r21)(r4)
1363 ld r22, VCPU_GPR(r22)(r4)
1364 ld r23, VCPU_GPR(r23)(r4)
1365 ld r24, VCPU_GPR(r24)(r4)
1366 ld r25, VCPU_GPR(r25)(r4)
1367 ld r26, VCPU_GPR(r26)(r4)
1368 ld r27, VCPU_GPR(r27)(r4)
1369 ld r28, VCPU_GPR(r28)(r4)
1370 ld r29, VCPU_GPR(r29)(r4)
1371 ld r30, VCPU_GPR(r30)(r4)
1372 ld r31, VCPU_GPR(r31)(r4)
1373
1374 /* clear our bit in vcore->napping_threads */
137533: ld r5,HSTATE_KVM_VCORE(r13)
1376 lwz r3,VCPU_PTID(r4)
1377 li r0,1
1378 sld r0,r0,r3
1379 addi r6,r5,VCORE_NAPPING_THREADS
138032: lwarx r7,0,r6
1381 andc r7,r7,r0
1382 stwcx. r7,0,r6
1383 bne 32b
1384 li r0,0
1385 stb r0,HSTATE_NAPPING(r13)
1386
1387 /* see if any other thread is already exiting */
1388 lwz r0,VCORE_ENTRY_EXIT(r5)
1389 cmpwi r0,0x100
1390 blt kvmppc_cede_reentry /* if not go back to guest */
1391
1392 /* some threads are exiting, so go to the guest exit path */
1393 b hcall_real_fallback
1394
1395 /* cede when already previously prodded case */
13961: li r0,0
1397 stb r0,VCPU_PRODDED(r3)
1398 sync /* order testing prodded vs. clearing ceded */
1399 stb r0,VCPU_CEDED(r3)
1400 li r3,H_SUCCESS
1401 blr
1402
1403 /* we've ceded but we want to give control to the host */
14042: li r3,H_TOO_HARD
1405 blr
1406
1180secondary_too_late: 1407secondary_too_late:
1181 ld r5,HSTATE_KVM_VCORE(r13) 1408 ld r5,HSTATE_KVM_VCORE(r13)
1182 HMT_LOW 1409 HMT_LOW
@@ -1194,14 +1421,20 @@ secondary_too_late:
1194 slbmte r6,r5 1421 slbmte r6,r5
11951: addi r11,r11,16 14221: addi r11,r11,16
1196 .endr 1423 .endr
1197 b 50f
1198 1424
1199secondary_nap: 1425secondary_nap:
1200 /* Clear any pending IPI */ 1426 /* Clear any pending IPI - assume we're a secondary thread */
120150: ld r5, HSTATE_XICS_PHYS(r13) 1427 ld r5, HSTATE_XICS_PHYS(r13)
1428 li r7, XICS_XIRR
1429 lwzcix r3, r5, r7 /* ack any pending interrupt */
1430 rlwinm. r0, r3, 0, 0xffffff /* any pending? */
1431 beq 37f
1432 sync
1202 li r0, 0xff 1433 li r0, 0xff
1203 li r6, XICS_QIRR 1434 li r6, XICS_QIRR
1204 stbcix r0, r5, r6 1435 stbcix r0, r5, r6 /* clear the IPI */
1436 stwcix r3, r5, r7 /* EOI it */
143737: sync
1205 1438
1206 /* increment the nap count and then go to nap mode */ 1439 /* increment the nap count and then go to nap mode */
1207 ld r4, HSTATE_KVM_VCORE(r13) 1440 ld r4, HSTATE_KVM_VCORE(r13)
@@ -1211,13 +1444,12 @@ secondary_nap:
1211 addi r3, r3, 1 1444 addi r3, r3, 1
1212 stwcx. r3, 0, r4 1445 stwcx. r3, 0, r4
1213 bne 51b 1446 bne 51b
1214 isync
1215 1447
1448 li r3, LPCR_PECE0
1216 mfspr r4, SPRN_LPCR 1449 mfspr r4, SPRN_LPCR
1217 li r0, LPCR_PECE 1450 rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
1218 andc r4, r4, r0
1219 ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */
1220 mtspr SPRN_LPCR, r4 1451 mtspr SPRN_LPCR, r4
1452 isync
1221 li r0, 0 1453 li r0, 0
1222 std r0, HSTATE_SCRATCH0(r13) 1454 std r0, HSTATE_SCRATCH0(r13)
1223 ptesync 1455 ptesync
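The H_CEDE handler added above tracks which hardware threads of a virtual core have ceded in vcore->napping_threads, updated inside an lwarx/stwcx. loop so the decision stays consistent against sibling threads. A minimal C sketch of that bookkeeping, with a hypothetical helper name and GCC atomics standing in for the reservation loop (illustration only, not part of the patch):

/* Sketch of the napping_threads update in kvmppc_h_cede: set our bit
 * unless that would leave every thread that entered the guest napping,
 * in which case the cede is handed up to the host.  Illustration only. */
static int try_set_napping(unsigned int *napping_threads,
                           int ptid, int threads_entered)
{
        unsigned int old, new;

        do {
                old = *napping_threads;
                new = old | (1u << ptid);
                if (__builtin_popcount(new) >= threads_entered)
                        return 0;       /* everyone would be asleep: go to host */
        } while (!__atomic_compare_exchange_n(napping_threads, &old, new, 0,
                                              __ATOMIC_SEQ_CST,
                                              __ATOMIC_SEQ_CST));
        return 1;                       /* safe to nap */
}

The same file also builds MSR_SF | MSR_ME without a 64-bit immediate: li r11,(MSR_ME << 1) | 1 loads 0x2001, and rotldi r11,r11,63 rotates it left by 63 bits (i.e. right by one), so the low 1 lands in bit 63 (MSR_SF) and 0x1000 (MSR_ME) remains.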
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index c54b0e30cf3f..0a8515a5c042 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -29,27 +29,11 @@
29#define ULONG_SIZE 8 29#define ULONG_SIZE 8
30#define FUNC(name) GLUE(.,name) 30#define FUNC(name) GLUE(.,name)
31 31
32#define GET_SHADOW_VCPU_R13
33
34#define DISABLE_INTERRUPTS \
35 mfmsr r0; \
36 rldicl r0,r0,48,1; \
37 rotldi r0,r0,16; \
38 mtmsrd r0,1; \
39
40#elif defined(CONFIG_PPC_BOOK3S_32) 32#elif defined(CONFIG_PPC_BOOK3S_32)
41 33
42#define ULONG_SIZE 4 34#define ULONG_SIZE 4
43#define FUNC(name) name 35#define FUNC(name) name
44 36
45#define GET_SHADOW_VCPU_R13 \
46 lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2)
47
48#define DISABLE_INTERRUPTS \
49 mfmsr r0; \
50 rlwinm r0,r0,0,17,15; \
51 mtmsr r0; \
52
53#endif /* CONFIG_PPC_BOOK3S_XX */ 37#endif /* CONFIG_PPC_BOOK3S_XX */
54 38
55 39
@@ -108,44 +92,17 @@ kvm_start_entry:
108 92
109kvm_start_lightweight: 93kvm_start_lightweight:
110 94
111 GET_SHADOW_VCPU_R13
112 PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4)
113 PPC_STL r3, HSTATE_VMHANDLER(r13)
114
115 PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
116
117 DISABLE_INTERRUPTS
118
119#ifdef CONFIG_PPC_BOOK3S_64 95#ifdef CONFIG_PPC_BOOK3S_64
120 /* Some guests may need to have dcbz set to 32 byte length.
121 *
122 * Usually we ensure that by patching the guest's instructions
123 * to trap on dcbz and emulate it in the hypervisor.
124 *
125 * If we can, we should tell the CPU to use 32 byte dcbz though,
126 * because that's a lot faster.
127 */
128
129 PPC_LL r3, VCPU_HFLAGS(r4) 96 PPC_LL r3, VCPU_HFLAGS(r4)
130 rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ 97 rldicl r3, r3, 0, 63 /* r3 &= 1 */
131 beq no_dcbz32_on 98 stb r3, HSTATE_RESTORE_HID5(r13)
132
133 mfspr r3,SPRN_HID5
134 ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */
135 mtspr SPRN_HID5,r3
136
137no_dcbz32_on:
138
139#endif /* CONFIG_PPC_BOOK3S_64 */ 99#endif /* CONFIG_PPC_BOOK3S_64 */
140 100
141 PPC_LL r6, VCPU_RMCALL(r4) 101 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
142 mtctr r6
143
144 PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4)
145 LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
146 102
147 /* Jump to segment patching handler and into our guest */ 103 /* Jump to segment patching handler and into our guest */
148 bctr 104 bl FUNC(kvmppc_entry_trampoline)
105 nop
149 106
150/* 107/*
151 * This is the handler in module memory. It gets jumped at from the 108 * This is the handler in module memory. It gets jumped at from the
@@ -170,21 +127,6 @@ kvmppc_handler_highmem:
170 /* R7 = vcpu */ 127 /* R7 = vcpu */
171 PPC_LL r7, GPR4(r1) 128 PPC_LL r7, GPR4(r1)
172 129
173#ifdef CONFIG_PPC_BOOK3S_64
174
175 PPC_LL r5, VCPU_HFLAGS(r7)
176 rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */
177 beq no_dcbz32_off
178
179 li r4, 0
180 mfspr r5,SPRN_HID5
181 rldimi r5,r4,6,56
182 mtspr SPRN_HID5,r5
183
184no_dcbz32_off:
185
186#endif /* CONFIG_PPC_BOOK3S_64 */
187
188 PPC_STL r14, VCPU_GPR(r14)(r7) 130 PPC_STL r14, VCPU_GPR(r14)(r7)
189 PPC_STL r15, VCPU_GPR(r15)(r7) 131 PPC_STL r15, VCPU_GPR(r15)(r7)
190 PPC_STL r16, VCPU_GPR(r16)(r7) 132 PPC_STL r16, VCPU_GPR(r16)(r7)
@@ -204,67 +146,6 @@ no_dcbz32_off:
204 PPC_STL r30, VCPU_GPR(r30)(r7) 146 PPC_STL r30, VCPU_GPR(r30)(r7)
205 PPC_STL r31, VCPU_GPR(r31)(r7) 147 PPC_STL r31, VCPU_GPR(r31)(r7)
206 148
207 /* Restore host msr -> SRR1 */
208 PPC_LL r6, VCPU_HOST_MSR(r7)
209
210 /*
211 * For some interrupts, we need to call the real Linux
212 * handler, so it can do work for us. This has to happen
213 * as if the interrupt arrived from the kernel though,
214 * so let's fake it here where most state is restored.
215 *
216 * Call Linux for hardware interrupts/decrementer
217 * r3 = address of interrupt handler (exit reason)
218 */
219
220 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
221 beq call_linux_handler
222 cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
223 beq call_linux_handler
224 cmpwi r12, BOOK3S_INTERRUPT_PERFMON
225 beq call_linux_handler
226
227 /* Back to EE=1 */
228 mtmsr r6
229 sync
230 b kvm_return_point
231
232call_linux_handler:
233
234 /*
235 * If we land here we need to jump back to the handler we
236 * came from.
237 *
238 * We have a page that we can access from real mode, so let's
239 * jump back to that and use it as a trampoline to get back into the
240 * interrupt handler!
241 *
242 * R3 still contains the exit code,
243 * R5 VCPU_HOST_RETIP and
244 * R6 VCPU_HOST_MSR
245 */
246
247 /* Restore host IP -> SRR0 */
248 PPC_LL r5, VCPU_HOST_RETIP(r7)
249
250 /* XXX Better move to a safe function?
251 * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
252
253 mtlr r12
254
255 PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7)
256 mtsrr0 r4
257 LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
258 mtsrr1 r3
259
260 RFI
261
262.global kvm_return_point
263kvm_return_point:
264
265 /* Jump back to lightweight entry if we're supposed to */
266 /* go back into the guest */
267
268 /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ 149 /* Pass the exit number as 3rd argument to kvmppc_handle_exit */
269 mr r5, r12 150 mr r5, r12
270 151
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 0c0d3f274437..d417511abfb1 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,16 +150,22 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
150#ifdef CONFIG_PPC_BOOK3S_64 150#ifdef CONFIG_PPC_BOOK3S_64
151 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { 151 if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
152 kvmppc_mmu_book3s_64_init(vcpu); 152 kvmppc_mmu_book3s_64_init(vcpu);
153 to_book3s(vcpu)->hior = 0xfff00000; 153 if (!to_book3s(vcpu)->hior_sregs)
154 to_book3s(vcpu)->hior = 0xfff00000;
154 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; 155 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
156 vcpu->arch.cpu_type = KVM_CPU_3S_64;
155 } else 157 } else
156#endif 158#endif
157 { 159 {
158 kvmppc_mmu_book3s_32_init(vcpu); 160 kvmppc_mmu_book3s_32_init(vcpu);
159 to_book3s(vcpu)->hior = 0; 161 if (!to_book3s(vcpu)->hior_sregs)
162 to_book3s(vcpu)->hior = 0;
160 to_book3s(vcpu)->msr_mask = 0xffffffffULL; 163 to_book3s(vcpu)->msr_mask = 0xffffffffULL;
164 vcpu->arch.cpu_type = KVM_CPU_3S_32;
161 } 165 }
162 166
167 kvmppc_sanity_check(vcpu);
168
163 /* If we are in hypervisor level on 970, we can tell the CPU to 169 /* If we are in hypervisor level on 970, we can tell the CPU to
164 * treat DCBZ as 32 bytes store */ 170 * treat DCBZ as 32 bytes store */
165 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; 171 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
@@ -646,7 +652,27 @@ program_interrupt:
646 break; 652 break;
647 } 653 }
648 case BOOK3S_INTERRUPT_SYSCALL: 654 case BOOK3S_INTERRUPT_SYSCALL:
649 if (vcpu->arch.osi_enabled && 655 if (vcpu->arch.papr_enabled &&
656 (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
657 !(vcpu->arch.shared->msr & MSR_PR)) {
658 /* SC 1 papr hypercalls */
659 ulong cmd = kvmppc_get_gpr(vcpu, 3);
660 int i;
661
662 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
663 r = RESUME_GUEST;
664 break;
665 }
666
667 run->papr_hcall.nr = cmd;
668 for (i = 0; i < 9; ++i) {
669 ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
670 run->papr_hcall.args[i] = gpr;
671 }
672 run->exit_reason = KVM_EXIT_PAPR_HCALL;
673 vcpu->arch.hcall_needed = 1;
674 r = RESUME_HOST;
675 } else if (vcpu->arch.osi_enabled &&
650 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && 676 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
651 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { 677 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
652 /* MOL hypercalls */ 678 /* MOL hypercalls */
@@ -770,6 +796,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
770 } 796 }
771 } 797 }
772 798
799 if (sregs->u.s.flags & KVM_SREGS_S_HIOR)
800 sregs->u.s.hior = to_book3s(vcpu)->hior;
801
773 return 0; 802 return 0;
774} 803}
775 804
@@ -806,6 +835,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
806 /* Flush the MMU after messing with the segments */ 835 /* Flush the MMU after messing with the segments */
807 kvmppc_mmu_pte_flush(vcpu, 0, 0); 836 kvmppc_mmu_pte_flush(vcpu, 0, 0);
808 837
838 if (sregs->u.s.flags & KVM_SREGS_S_HIOR) {
839 to_book3s(vcpu)->hior_sregs = true;
840 to_book3s(vcpu)->hior = sregs->u.s.hior;
841 }
842
809 return 0; 843 return 0;
810} 844}
811 845
@@ -841,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
841 if (!p) 875 if (!p)
842 goto uninit_vcpu; 876 goto uninit_vcpu;
843 877
844 vcpu->arch.host_retip = kvm_return_point;
845 vcpu->arch.host_msr = mfmsr();
846#ifdef CONFIG_PPC_BOOK3S_64 878#ifdef CONFIG_PPC_BOOK3S_64
847 /* default to book3s_64 (970fx) */ 879 /* default to book3s_64 (970fx) */
848 vcpu->arch.pvr = 0x3C0301; 880 vcpu->arch.pvr = 0x3C0301;
@@ -853,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
853 kvmppc_set_pvr(vcpu, vcpu->arch.pvr); 885 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
854 vcpu->arch.slb_nr = 64; 886 vcpu->arch.slb_nr = 64;
855 887
856 /* remember where some real-mode handlers are */
857 vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
858 vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
859 vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
860#ifdef CONFIG_PPC_BOOK3S_64
861 vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
862#else
863 vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
864#endif
865
866 vcpu->arch.shadow_msr = MSR_USER64; 888 vcpu->arch.shadow_msr = MSR_USER64;
867 889
868 err = kvmppc_mmu_init(vcpu); 890 err = kvmppc_mmu_init(vcpu);
@@ -908,6 +930,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
908#endif 930#endif
909 ulong ext_msr; 931 ulong ext_msr;
910 932
933 /* Check if we can run the vcpu at all */
934 if (!vcpu->arch.sane) {
935 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
936 return -EINVAL;
937 }
938
911 /* No need to go into the guest when all we do is going out */ 939 /* No need to go into the guest when all we do is going out */
912 if (signal_pending(current)) { 940 if (signal_pending(current)) {
913 kvm_run->exit_reason = KVM_EXIT_INTR; 941 kvm_run->exit_reason = KVM_EXIT_INTR;
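When PAPR mode is enabled and kvmppc_h_pr() cannot complete an sc 1 hypercall in the kernel, the exit path above hands it to userspace with r3 in run->papr_hcall.nr and r4-r12 in run->papr_hcall.args[]. A minimal sketch of the userspace side, assuming a hypothetical handle_papr_hcall() in the VMM and an already-mapped kvm_run structure:

#include <sys/ioctl.h>
#include <linux/kvm.h>

extern unsigned long handle_papr_hcall(unsigned long nr,
                                       const unsigned long long *args);

/* Run-loop fragment: service forwarded PAPR hypercalls.  The value
 * written to papr_hcall.ret is copied back into the guest's r3 by the
 * kernel on the next KVM_RUN. */
static void vcpu_run_loop(int vcpu_fd, struct kvm_run *run)
{
        while (ioctl(vcpu_fd, KVM_RUN, 0) == 0) {
                if (run->exit_reason != KVM_EXIT_PAPR_HCALL)
                        break;
                run->papr_hcall.ret = handle_papr_hcall(run->papr_hcall.nr,
                                                        run->papr_hcall.args);
        }
}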
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
new file mode 100644
index 000000000000..b9589324797b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2011. Freescale Inc. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 * Paul Mackerras <paulus@samba.org>
7 *
8 * Description:
9 *
10 * Hypercall handling for running PAPR guests in PR KVM on Book 3S
11 * processors.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License, version 2, as
15 * published by the Free Software Foundation.
16 */
17
18#include <asm/uaccess.h>
19#include <asm/kvm_ppc.h>
20#include <asm/kvm_book3s.h>
21
22static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
23{
24 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
25 unsigned long pteg_addr;
26
27 pte_index <<= 4;
28 pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
29 pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
30 pteg_addr |= pte_index;
31
32 return pteg_addr;
33}
34
35static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
36{
37 long flags = kvmppc_get_gpr(vcpu, 4);
38 long pte_index = kvmppc_get_gpr(vcpu, 5);
39 unsigned long pteg[2 * 8];
40 unsigned long pteg_addr, i, *hpte;
41
42 pte_index &= ~7UL;
43 pteg_addr = get_pteg_addr(vcpu, pte_index);
44
45 copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
46 hpte = pteg;
47
48 if (likely((flags & H_EXACT) == 0)) {
49 pte_index &= ~7UL;
50 for (i = 0; ; ++i) {
51 if (i == 8)
52 return H_PTEG_FULL;
53 if ((*hpte & HPTE_V_VALID) == 0)
54 break;
55 hpte += 2;
56 }
57 } else {
58 i = kvmppc_get_gpr(vcpu, 5) & 7UL;
59 hpte += i * 2;
60 }
61
62 hpte[0] = kvmppc_get_gpr(vcpu, 6);
63 hpte[1] = kvmppc_get_gpr(vcpu, 7);
64 copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
65 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
66 kvmppc_set_gpr(vcpu, 4, pte_index | i);
67
68 return EMULATE_DONE;
69}
70
71static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
72{
73 unsigned long flags= kvmppc_get_gpr(vcpu, 4);
74 unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
75 unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
76 unsigned long v = 0, pteg, rb;
77 unsigned long pte[2];
78
79 pteg = get_pteg_addr(vcpu, pte_index);
80 copy_from_user(pte, (void __user *)pteg, sizeof(pte));
81
82 if ((pte[0] & HPTE_V_VALID) == 0 ||
83 ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
84 ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) {
85 kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
86 return EMULATE_DONE;
87 }
88
89 copy_to_user((void __user *)pteg, &v, sizeof(v));
90
91 rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
92 vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
93
94 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
95 kvmppc_set_gpr(vcpu, 4, pte[0]);
96 kvmppc_set_gpr(vcpu, 5, pte[1]);
97
98 return EMULATE_DONE;
99}
100
101static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
102{
103 unsigned long flags = kvmppc_get_gpr(vcpu, 4);
104 unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
105 unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
106 unsigned long rb, pteg, r, v;
107 unsigned long pte[2];
108
109 pteg = get_pteg_addr(vcpu, pte_index);
110 copy_from_user(pte, (void __user *)pteg, sizeof(pte));
111
112 if ((pte[0] & HPTE_V_VALID) == 0 ||
113 ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) {
114 kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
115 return EMULATE_DONE;
116 }
117
118 v = pte[0];
119 r = pte[1];
120 r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
121 HPTE_R_KEY_LO);
122 r |= (flags << 55) & HPTE_R_PP0;
123 r |= (flags << 48) & HPTE_R_KEY_HI;
124 r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
125
126 pte[1] = r;
127
128 rb = compute_tlbie_rb(v, r, pte_index);
129 vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
130 copy_to_user((void __user *)pteg, pte, sizeof(pte));
131
132 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
133
134 return EMULATE_DONE;
135}
136
137int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
138{
139 switch (cmd) {
140 case H_ENTER:
141 return kvmppc_h_pr_enter(vcpu);
142 case H_REMOVE:
143 return kvmppc_h_pr_remove(vcpu);
144 case H_PROTECT:
145 return kvmppc_h_pr_protect(vcpu);
146 case H_BULK_REMOVE:
147 /* We just flush all PTEs, so user space can
148 handle the HPT modifications */
149 kvmppc_mmu_pte_flush(vcpu, 0, 0);
150 break;
151 case H_CEDE:
152 kvm_vcpu_block(vcpu);
153 vcpu->stat.halt_wakeup++;
154 return EMULATE_DONE;
155 }
156
157 return EMULATE_FAIL;
158}
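get_pteg_addr() above converts an H_ENTER/H_REMOVE/H_PROTECT PTE index into a byte address in the guest's hashed page table: each HPTE is 16 bytes, a PTEG holds eight of them (128 bytes), and SDR1's low five bits (HTABSIZE) decide how many PTEG-select bits survive the mask. A worked example with made-up values that mirrors the same arithmetic:

#include <stdio.h>

int main(void)
{
        /* Hypothetical values: HPT base 0x12340000, HTABSIZE = 0,
         * i.e. a 2^18-byte table holding 2^11 PTEGs. */
        unsigned long sdr1 = 0x12340000UL;
        unsigned long pte_index = 0x123;
        unsigned long off, mask;

        off  = pte_index << 4;                  /* 16 bytes per HPTE */
        mask = ((1UL << ((sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
        off &= mask;                            /* PTEG select | slot within group */

        /* Prints pteg_addr = 0x12341230 for these inputs */
        printf("pteg_addr = 0x%lx\n", (sdr1 & 0xfffffffffffc0000UL) | off);
        return 0;
}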
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index c1f877c4a884..34187585c507 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -20,6 +20,7 @@
20#include <asm/ppc_asm.h> 20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
22#include <asm/reg.h> 22#include <asm/reg.h>
23#include <asm/mmu.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
25 26
@@ -35,10 +36,10 @@
35 36
36#if defined(CONFIG_PPC_BOOK3S_64) 37#if defined(CONFIG_PPC_BOOK3S_64)
37 38
38#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg)
39#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR)
40#define FUNC(name) GLUE(.,name) 39#define FUNC(name) GLUE(.,name)
40#define MTMSR_EERI(reg) mtmsrd (reg),1
41 41
42 .globl kvmppc_skip_interrupt
42kvmppc_skip_interrupt: 43kvmppc_skip_interrupt:
43 /* 44 /*
44 * Here all GPRs are unchanged from when the interrupt happened 45 * Here all GPRs are unchanged from when the interrupt happened
@@ -51,6 +52,7 @@ kvmppc_skip_interrupt:
51 rfid 52 rfid
52 b . 53 b .
53 54
55 .globl kvmppc_skip_Hinterrupt
54kvmppc_skip_Hinterrupt: 56kvmppc_skip_Hinterrupt:
55 /* 57 /*
56 * Here all GPRs are unchanged from when the interrupt happened 58 * Here all GPRs are unchanged from when the interrupt happened
@@ -65,8 +67,8 @@ kvmppc_skip_Hinterrupt:
65 67
66#elif defined(CONFIG_PPC_BOOK3S_32) 68#elif defined(CONFIG_PPC_BOOK3S_32)
67 69
68#define MSR_NOIRQ MSR_KERNEL
69#define FUNC(name) name 70#define FUNC(name) name
71#define MTMSR_EERI(reg) mtmsr (reg)
70 72
71.macro INTERRUPT_TRAMPOLINE intno 73.macro INTERRUPT_TRAMPOLINE intno
72 74
@@ -167,40 +169,24 @@ kvmppc_handler_skip_ins:
167#endif 169#endif
168 170
169/* 171/*
170 * This trampoline brings us back to a real mode handler 172 * Call kvmppc_handler_trampoline_enter in real mode
171 *
172 * Input Registers:
173 *
174 * R5 = SRR0
175 * R6 = SRR1
176 * LR = real-mode IP
177 * 173 *
174 * On entry, r4 contains the guest shadow MSR
178 */ 175 */
179.global kvmppc_handler_lowmem_trampoline 176_GLOBAL(kvmppc_entry_trampoline)
180kvmppc_handler_lowmem_trampoline: 177 mfmsr r5
181 178 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
182 mtsrr0 r5 179 toreal(r7)
180
181 li r9, MSR_RI
182 ori r9, r9, MSR_EE
183 andc r9, r5, r9 /* Clear EE and RI in MSR value */
184 li r6, MSR_IR | MSR_DR
185 ori r6, r6, MSR_EE
186 andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */
187 MTMSR_EERI(r9) /* Clear EE and RI in MSR */
188 mtsrr0 r7 /* before we set srr0/1 */
183 mtsrr1 r6 189 mtsrr1 r6
184 blr
185kvmppc_handler_lowmem_trampoline_end:
186
187/*
188 * Call a function in real mode
189 *
190 * Input Registers:
191 *
192 * R3 = function
193 * R4 = MSR
194 * R5 = scratch register
195 *
196 */
197_GLOBAL(kvmppc_rmcall)
198 LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ)
199 mtmsr r5 /* Disable relocation and interrupts, so mtsrr
200 doesn't get interrupted */
201 sync
202 mtsrr0 r3
203 mtsrr1 r4
204 RFI 190 RFI
205 191
206#if defined(CONFIG_PPC_BOOK3S_32) 192#if defined(CONFIG_PPC_BOOK3S_32)
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index aed32e517212..0676ae249b9f 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -23,6 +23,7 @@
23 23
24#define GET_SHADOW_VCPU(reg) \ 24#define GET_SHADOW_VCPU(reg) \
25 mr reg, r13 25 mr reg, r13
26#define MTMSR_EERI(reg) mtmsrd (reg),1
26 27
27#elif defined(CONFIG_PPC_BOOK3S_32) 28#elif defined(CONFIG_PPC_BOOK3S_32)
28 29
@@ -30,6 +31,7 @@
30 tophys(reg, r2); \ 31 tophys(reg, r2); \
31 lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ 32 lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \
32 tophys(reg, reg) 33 tophys(reg, reg)
34#define MTMSR_EERI(reg) mtmsr (reg)
33 35
34#endif 36#endif
35 37
@@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter:
57 /* Required state: 59 /* Required state:
58 * 60 *
59 * MSR = ~IR|DR 61 * MSR = ~IR|DR
60 * R13 = PACA
61 * R1 = host R1 62 * R1 = host R1
62 * R2 = host R2 63 * R2 = host R2
63 * R10 = guest MSR 64 * R4 = guest shadow MSR
65 * R5 = normal host MSR
66 * R6 = current host MSR (EE, IR, DR off)
67 * LR = highmem guest exit code
64 * all other volatile GPRS = free 68 * all other volatile GPRS = free
65 * SVCPU[CR] = guest CR 69 * SVCPU[CR] = guest CR
66 * SVCPU[XER] = guest XER 70 * SVCPU[XER] = guest XER
@@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter:
71 /* r3 = shadow vcpu */ 75 /* r3 = shadow vcpu */
72 GET_SHADOW_VCPU(r3) 76 GET_SHADOW_VCPU(r3)
73 77
78 /* Save guest exit handler address and MSR */
79 mflr r0
80 PPC_STL r0, HSTATE_VMHANDLER(r3)
81 PPC_STL r5, HSTATE_HOST_MSR(r3)
82
74 /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ 83 /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
75 PPC_STL r1, HSTATE_HOST_R1(r3) 84 PPC_STL r1, HSTATE_HOST_R1(r3)
76 PPC_STL r2, HSTATE_HOST_R2(r3) 85 PPC_STL r2, HSTATE_HOST_R2(r3)
77 86
78 /* Move SRR0 and SRR1 into the respective regs */
79 PPC_LL r9, SVCPU_PC(r3)
80 mtsrr0 r9
81 mtsrr1 r10
82
83 /* Activate guest mode, so faults get handled by KVM */ 87 /* Activate guest mode, so faults get handled by KVM */
84 li r11, KVM_GUEST_MODE_GUEST 88 li r11, KVM_GUEST_MODE_GUEST
85 stb r11, HSTATE_IN_GUEST(r3) 89 stb r11, HSTATE_IN_GUEST(r3)
@@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter:
87 /* Switch to guest segment. This is subarch specific. */ 91 /* Switch to guest segment. This is subarch specific. */
88 LOAD_GUEST_SEGMENTS 92 LOAD_GUEST_SEGMENTS
89 93
94#ifdef CONFIG_PPC_BOOK3S_64
95 /* Some guests may need to have dcbz set to 32 byte length.
96 *
97 * Usually we ensure that by patching the guest's instructions
98 * to trap on dcbz and emulate it in the hypervisor.
99 *
100 * If we can, we should tell the CPU to use 32 byte dcbz though,
101 * because that's a lot faster.
102 */
103 lbz r0, HSTATE_RESTORE_HID5(r3)
104 cmpwi r0, 0
105 beq no_dcbz32_on
106
107 mfspr r0,SPRN_HID5
108 ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */
109 mtspr SPRN_HID5,r0
110no_dcbz32_on:
111
112#endif /* CONFIG_PPC_BOOK3S_64 */
113
90 /* Enter guest */ 114 /* Enter guest */
91 115
92 PPC_LL r4, SVCPU_CTR(r3) 116 PPC_LL r8, SVCPU_CTR(r3)
93 PPC_LL r5, SVCPU_LR(r3) 117 PPC_LL r9, SVCPU_LR(r3)
94 lwz r6, SVCPU_CR(r3) 118 lwz r10, SVCPU_CR(r3)
95 lwz r7, SVCPU_XER(r3) 119 lwz r11, SVCPU_XER(r3)
120
121 mtctr r8
122 mtlr r9
123 mtcr r10
124 mtxer r11
96 125
97 mtctr r4 126 /* Move SRR0 and SRR1 into the respective regs */
98 mtlr r5 127 PPC_LL r9, SVCPU_PC(r3)
99 mtcr r6 128 /* First clear RI in our current MSR value */
100 mtxer r7 129 li r0, MSR_RI
130 andc r6, r6, r0
131 MTMSR_EERI(r6)
132 mtsrr0 r9
133 mtsrr1 r4
101 134
102 PPC_LL r0, SVCPU_R0(r3) 135 PPC_LL r0, SVCPU_R0(r3)
103 PPC_LL r1, SVCPU_R1(r3) 136 PPC_LL r1, SVCPU_R1(r3)
@@ -213,11 +246,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
213 beq ld_last_inst 246 beq ld_last_inst
214 cmpwi r12, BOOK3S_INTERRUPT_PROGRAM 247 cmpwi r12, BOOK3S_INTERRUPT_PROGRAM
215 beq ld_last_inst 248 beq ld_last_inst
249 cmpwi r12, BOOK3S_INTERRUPT_SYSCALL
250 beq ld_last_prev_inst
216 cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT 251 cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT
217 beq- ld_last_inst 252 beq- ld_last_inst
218 253
219 b no_ld_last_inst 254 b no_ld_last_inst
220 255
256ld_last_prev_inst:
257 addi r3, r3, -4
258
221ld_last_inst: 259ld_last_inst:
222 /* Save off the guest instruction we're at */ 260 /* Save off the guest instruction we're at */
223 261
@@ -254,6 +292,43 @@ no_ld_last_inst:
254 /* Switch back to host MMU */ 292 /* Switch back to host MMU */
255 LOAD_HOST_SEGMENTS 293 LOAD_HOST_SEGMENTS
256 294
295#ifdef CONFIG_PPC_BOOK3S_64
296
297 lbz r5, HSTATE_RESTORE_HID5(r13)
298 cmpwi r5, 0
299 beq no_dcbz32_off
300
301 li r4, 0
302 mfspr r5,SPRN_HID5
303 rldimi r5,r4,6,56
304 mtspr SPRN_HID5,r5
305
306no_dcbz32_off:
307
308#endif /* CONFIG_PPC_BOOK3S_64 */
309
310 /*
311 * For some interrupts, we need to call the real Linux
312 * handler, so it can do work for us. This has to happen
313 * as if the interrupt arrived from the kernel though,
314 * so let's fake it here where most state is restored.
315 *
316 * Having set up SRR0/1 with the address where we want
317 * to continue with relocation on (potentially in module
318 * space), we either just go straight there with rfi[d],
319 * or we jump to an interrupt handler with bctr if there
320 * is an interrupt to be handled first. In the latter
321 * case, the rfi[d] at the end of the interrupt handler
322 * will get us back to where we want to continue.
323 */
324
325 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
326 beq 1f
327 cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
328 beq 1f
329 cmpwi r12, BOOK3S_INTERRUPT_PERFMON
3301: mtctr r12
331
257 /* Register usage at this point: 332 /* Register usage at this point:
258 * 333 *
259 * R1 = host R1 334 * R1 = host R1
@@ -264,13 +339,15 @@ no_ld_last_inst:
264 * 339 *
265 */ 340 */
266 341
267 /* RFI into the highmem handler */ 342 PPC_LL r6, HSTATE_HOST_MSR(r13)
268 mfmsr r7
269 ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */
270 mtsrr1 r7
271 /* Load highmem handler address */
272 PPC_LL r8, HSTATE_VMHANDLER(r13) 343 PPC_LL r8, HSTATE_VMHANDLER(r13)
344
345 /* Restore host msr -> SRR1 */
346 mtsrr1 r6
347 /* Load highmem handler address */
273 mtsrr0 r8 348 mtsrr0 r8
274 349
350 /* RFI into the highmem handler, or jump to interrupt handler */
351 beqctr
275 RFI 352 RFI
276kvmppc_handler_trampoline_exit_end: 353kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee45fa01220e..bb6c988f010a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
316{ 316{
317 int ret; 317 int ret;
318 318
319 if (!vcpu->arch.sane) {
320 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
321 return -EINVAL;
322 }
323
319 local_irq_disable(); 324 local_irq_disable();
320 kvm_guest_enter(); 325 kvm_guest_enter();
321 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 326 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
@@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
618int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 623int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
619{ 624{
620 int i; 625 int i;
626 int r;
621 627
622 vcpu->arch.pc = 0; 628 vcpu->arch.pc = 0;
623 vcpu->arch.shared->msr = 0; 629 vcpu->arch.shared->msr = 0;
@@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
634 640
635 kvmppc_init_timing_stats(vcpu); 641 kvmppc_init_timing_stats(vcpu);
636 642
637 return kvmppc_core_vcpu_setup(vcpu); 643 r = kvmppc_core_vcpu_setup(vcpu);
644 kvmppc_sanity_check(vcpu);
645 return r;
638} 646}
639 647
640int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 648int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 797a7447c268..26d20903f2bc 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
73 /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ 73 /* Since booke kvm only support one core, update all vcpus' PIR to 0 */
74 vcpu->vcpu_id = 0; 74 vcpu->vcpu_id = 0;
75 75
76 vcpu->arch.cpu_type = KVM_CPU_E500V2;
77
76 return 0; 78 return 0;
77} 79}
78 80
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a107c9be0fb1..0d843c6ba315 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -39,12 +39,8 @@
39 39
40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
41{ 41{
42#ifndef CONFIG_KVM_BOOK3S_64_HV
43 return !(v->arch.shared->msr & MSR_WE) || 42 return !(v->arch.shared->msr & MSR_WE) ||
44 !!(v->arch.pending_exceptions); 43 !!(v->arch.pending_exceptions);
45#else
46 return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
47#endif
48} 44}
49 45
50int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 46int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -95,6 +91,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
95 return r; 91 return r;
96} 92}
97 93
94int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
95{
96 int r = false;
97
98 /* We have to know what CPU to virtualize */
99 if (!vcpu->arch.pvr)
100 goto out;
101
102 /* PAPR only works with book3s_64 */
103 if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
104 goto out;
105
106#ifdef CONFIG_KVM_BOOK3S_64_HV
107 /* HV KVM can only do PAPR mode for now */
108 if (!vcpu->arch.papr_enabled)
109 goto out;
110#endif
111
112 r = true;
113
114out:
115 vcpu->arch.sane = r;
116 return r ? 0 : -EINVAL;
117}
118
98int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) 119int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
99{ 120{
100 enum emulation_result er; 121 enum emulation_result er;
@@ -188,6 +209,8 @@ int kvm_dev_ioctl_check_extension(long ext)
188 case KVM_CAP_PPC_BOOKE_SREGS: 209 case KVM_CAP_PPC_BOOKE_SREGS:
189#else 210#else
190 case KVM_CAP_PPC_SEGSTATE: 211 case KVM_CAP_PPC_SEGSTATE:
212 case KVM_CAP_PPC_HIOR:
213 case KVM_CAP_PPC_PAPR:
191#endif 214#endif
192 case KVM_CAP_PPC_UNSET_IRQ: 215 case KVM_CAP_PPC_UNSET_IRQ:
193 case KVM_CAP_PPC_IRQ_LEVEL: 216 case KVM_CAP_PPC_IRQ_LEVEL:
@@ -258,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
258{ 281{
259 struct kvm_vcpu *vcpu; 282 struct kvm_vcpu *vcpu;
260 vcpu = kvmppc_core_vcpu_create(kvm, id); 283 vcpu = kvmppc_core_vcpu_create(kvm, id);
284 vcpu->arch.wqp = &vcpu->wq;
261 if (!IS_ERR(vcpu)) 285 if (!IS_ERR(vcpu))
262 kvmppc_create_vcpu_debugfs(vcpu, id); 286 kvmppc_create_vcpu_debugfs(vcpu, id);
263 return vcpu; 287 return vcpu;
@@ -289,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data)
289 313
290 kvmppc_core_queue_dec(vcpu); 314 kvmppc_core_queue_dec(vcpu);
291 315
292 if (waitqueue_active(&vcpu->wq)) { 316 if (waitqueue_active(vcpu->arch.wqp)) {
293 wake_up_interruptible(&vcpu->wq); 317 wake_up_interruptible(vcpu->arch.wqp);
294 vcpu->stat.halt_wakeup++; 318 vcpu->stat.halt_wakeup++;
295 } 319 }
296} 320}
@@ -543,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
543 567
544int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 568int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
545{ 569{
546 if (irq->irq == KVM_INTERRUPT_UNSET) 570 if (irq->irq == KVM_INTERRUPT_UNSET) {
547 kvmppc_core_dequeue_external(vcpu, irq); 571 kvmppc_core_dequeue_external(vcpu, irq);
548 else 572 return 0;
549 kvmppc_core_queue_external(vcpu, irq); 573 }
574
575 kvmppc_core_queue_external(vcpu, irq);
550 576
551 if (waitqueue_active(&vcpu->wq)) { 577 if (waitqueue_active(vcpu->arch.wqp)) {
552 wake_up_interruptible(&vcpu->wq); 578 wake_up_interruptible(vcpu->arch.wqp);
553 vcpu->stat.halt_wakeup++; 579 vcpu->stat.halt_wakeup++;
554 } else if (vcpu->cpu != -1) { 580 } else if (vcpu->cpu != -1) {
555 smp_send_reschedule(vcpu->cpu); 581 smp_send_reschedule(vcpu->cpu);
@@ -571,11 +597,18 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
571 r = 0; 597 r = 0;
572 vcpu->arch.osi_enabled = true; 598 vcpu->arch.osi_enabled = true;
573 break; 599 break;
600 case KVM_CAP_PPC_PAPR:
601 r = 0;
602 vcpu->arch.papr_enabled = true;
603 break;
574 default: 604 default:
575 r = -EINVAL; 605 r = -EINVAL;
576 break; 606 break;
577 } 607 }
578 608
609 if (!r)
610 r = kvmppc_sanity_check(vcpu);
611
579 return r; 612 return r;
580} 613}
581 614
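kvmppc_sanity_check() above is re-run after every successful KVM_ENABLE_CAP, so an unsupported combination is reported when the capability is set, and vcpu->arch.sane also blocks KVM_RUN later. A minimal sketch of the userspace call that sets vcpu->arch.papr_enabled (vcpu_fd assumed to come from KVM_CREATE_VCPU elsewhere):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns 0 on success; fails with EINVAL if the sanity check rejects
 * the combination, e.g. PAPR requested on a non-book3s_64 vcpu. */
static int enable_papr_mode(int vcpu_fd)
{
        struct kvm_enable_cap cap;

        memset(&cap, 0, sizeof(cap));
        cap.cap = KVM_CAP_PPC_PAPR;
        return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}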
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 00ff00dfb24c..1ca5de07ac36 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -119,6 +119,7 @@ struct kvm_vcpu_stat {
119 u32 instruction_lctlg; 119 u32 instruction_lctlg;
120 u32 exit_program_interruption; 120 u32 exit_program_interruption;
121 u32 exit_instr_and_program; 121 u32 exit_instr_and_program;
122 u32 deliver_external_call;
122 u32 deliver_emergency_signal; 123 u32 deliver_emergency_signal;
123 u32 deliver_service_signal; 124 u32 deliver_service_signal;
124 u32 deliver_virtio_interrupt; 125 u32 deliver_virtio_interrupt;
@@ -138,6 +139,7 @@ struct kvm_vcpu_stat {
138 u32 instruction_stfl; 139 u32 instruction_stfl;
139 u32 instruction_tprot; 140 u32 instruction_tprot;
140 u32 instruction_sigp_sense; 141 u32 instruction_sigp_sense;
142 u32 instruction_sigp_external_call;
141 u32 instruction_sigp_emergency; 143 u32 instruction_sigp_emergency;
142 u32 instruction_sigp_stop; 144 u32 instruction_sigp_stop;
143 u32 instruction_sigp_arch; 145 u32 instruction_sigp_arch;
@@ -174,6 +176,10 @@ struct kvm_s390_prefix_info {
174 __u32 address; 176 __u32 address;
175}; 177};
176 178
179struct kvm_s390_extcall_info {
180 __u16 code;
181};
182
177struct kvm_s390_emerg_info { 183struct kvm_s390_emerg_info {
178 __u16 code; 184 __u16 code;
179}; 185};
@@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info {
186 struct kvm_s390_ext_info ext; 192 struct kvm_s390_ext_info ext;
187 struct kvm_s390_pgm_info pgm; 193 struct kvm_s390_pgm_info pgm;
188 struct kvm_s390_emerg_info emerg; 194 struct kvm_s390_emerg_info emerg;
195 struct kvm_s390_extcall_info extcall;
189 struct kvm_s390_prefix_info prefix; 196 struct kvm_s390_prefix_info prefix;
190 }; 197 };
191}; 198};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index c9aeb4b4d0b8..87c16705b381 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
38 struct kvm_s390_interrupt_info *inti) 38 struct kvm_s390_interrupt_info *inti)
39{ 39{
40 switch (inti->type) { 40 switch (inti->type) {
41 case KVM_S390_INT_EXTERNAL_CALL:
42 if (psw_extint_disabled(vcpu))
43 return 0;
44 if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
45 return 1;
41 case KVM_S390_INT_EMERGENCY: 46 case KVM_S390_INT_EMERGENCY:
42 if (psw_extint_disabled(vcpu)) 47 if (psw_extint_disabled(vcpu))
43 return 0; 48 return 0;
@@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
98 struct kvm_s390_interrupt_info *inti) 103 struct kvm_s390_interrupt_info *inti)
99{ 104{
100 switch (inti->type) { 105 switch (inti->type) {
106 case KVM_S390_INT_EXTERNAL_CALL:
101 case KVM_S390_INT_EMERGENCY: 107 case KVM_S390_INT_EMERGENCY:
102 case KVM_S390_INT_SERVICE: 108 case KVM_S390_INT_SERVICE:
103 case KVM_S390_INT_VIRTIO: 109 case KVM_S390_INT_VIRTIO:
@@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
143 exception = 1; 149 exception = 1;
144 break; 150 break;
145 151
152 case KVM_S390_INT_EXTERNAL_CALL:
153 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
154 vcpu->stat.deliver_external_call++;
155 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
156 if (rc == -EFAULT)
157 exception = 1;
158
159 rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
160 if (rc == -EFAULT)
161 exception = 1;
162
163 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
164 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
165 if (rc == -EFAULT)
166 exception = 1;
167
168 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
169 __LC_EXT_NEW_PSW, sizeof(psw_t));
170 if (rc == -EFAULT)
171 exception = 1;
172 break;
173
146 case KVM_S390_INT_SERVICE: 174 case KVM_S390_INT_SERVICE:
147 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", 175 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
148 inti->ext.ext_params); 176 inti->ext.ext_params);
@@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
522 break; 550 break;
523 case KVM_S390_PROGRAM_INT: 551 case KVM_S390_PROGRAM_INT:
524 case KVM_S390_SIGP_STOP: 552 case KVM_S390_SIGP_STOP:
553 case KVM_S390_INT_EXTERNAL_CALL:
525 case KVM_S390_INT_EMERGENCY: 554 case KVM_S390_INT_EMERGENCY:
526 default: 555 default:
527 kfree(inti); 556 kfree(inti);
@@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
581 break; 610 break;
582 case KVM_S390_SIGP_STOP: 611 case KVM_S390_SIGP_STOP:
583 case KVM_S390_RESTART: 612 case KVM_S390_RESTART:
613 case KVM_S390_INT_EXTERNAL_CALL:
584 case KVM_S390_INT_EMERGENCY: 614 case KVM_S390_INT_EMERGENCY:
585 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 615 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
586 inti->type = s390int->type; 616 inti->type = s390int->type;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index dc2b580e27bc..9610ba41b974 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
46 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, 46 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
47 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 47 { "instruction_lctl", VCPU_STAT(instruction_lctl) },
48 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, 48 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
49 { "deliver_external_call", VCPU_STAT(deliver_external_call) },
49 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, 50 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
50 { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) }, 51 { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
51 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, 52 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
64 { "instruction_stfl", VCPU_STAT(instruction_stfl) }, 65 { "instruction_stfl", VCPU_STAT(instruction_stfl) },
65 { "instruction_tprot", VCPU_STAT(instruction_tprot) }, 66 { "instruction_tprot", VCPU_STAT(instruction_tprot) },
66 { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, 67 { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
68 { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
67 { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, 69 { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
68 { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, 70 { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
69 { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, 71 { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
@@ -175,6 +177,8 @@ int kvm_arch_init_vm(struct kvm *kvm)
175 if (rc) 177 if (rc)
176 goto out_err; 178 goto out_err;
177 179
180 rc = -ENOMEM;
181
178 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); 182 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
179 if (!kvm->arch.sca) 183 if (!kvm->arch.sca)
180 goto out_err; 184 goto out_err;
@@ -312,11 +316,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
312struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 316struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
313 unsigned int id) 317 unsigned int id)
314{ 318{
315 struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 319 struct kvm_vcpu *vcpu;
316 int rc = -ENOMEM; 320 int rc = -EINVAL;
321
322 if (id >= KVM_MAX_VCPUS)
323 goto out;
317 324
325 rc = -ENOMEM;
326
327 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
318 if (!vcpu) 328 if (!vcpu)
319 goto out_nomem; 329 goto out;
320 330
321 vcpu->arch.sie_block = (struct kvm_s390_sie_block *) 331 vcpu->arch.sie_block = (struct kvm_s390_sie_block *)
322 get_zeroed_page(GFP_KERNEL); 332 get_zeroed_page(GFP_KERNEL);
@@ -352,7 +362,7 @@ out_free_sie_block:
352 free_page((unsigned long)(vcpu->arch.sie_block)); 362 free_page((unsigned long)(vcpu->arch.sie_block));
353out_free_cpu: 363out_free_cpu:
354 kfree(vcpu); 364 kfree(vcpu);
355out_nomem: 365out:
356 return ERR_PTR(rc); 366 return ERR_PTR(rc);
357} 367}
358 368
@@ -386,6 +396,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
386{ 396{
387 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 397 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
388 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 398 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
399 restore_access_regs(vcpu->arch.guest_acrs);
389 return 0; 400 return 0;
390} 401}
391 402
@@ -401,6 +412,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
401{ 412{
402 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 413 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
403 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 414 vcpu->arch.guest_fpregs.fpc = fpu->fpc;
415 restore_fp_regs(&vcpu->arch.guest_fpregs);
404 return 0; 416 return 0;
405} 417}
406 418
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index d6a50c1fb2e6..f815118835f3 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
87 return -ENOMEM; 87 return -ENOMEM;
88 88
89 inti->type = KVM_S390_INT_EMERGENCY; 89 inti->type = KVM_S390_INT_EMERGENCY;
90 inti->emerg.code = vcpu->vcpu_id;
90 91
91 spin_lock(&fi->lock); 92 spin_lock(&fi->lock);
92 li = fi->local_int[cpu_addr]; 93 li = fi->local_int[cpu_addr];
@@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
103 wake_up_interruptible(&li->wq); 104 wake_up_interruptible(&li->wq);
104 spin_unlock_bh(&li->lock); 105 spin_unlock_bh(&li->lock);
105 rc = 0; /* order accepted */ 106 rc = 0; /* order accepted */
107 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
108unlock:
109 spin_unlock(&fi->lock);
110 return rc;
111}
112
113static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
114{
115 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
116 struct kvm_s390_local_interrupt *li;
117 struct kvm_s390_interrupt_info *inti;
118 int rc;
119
120 if (cpu_addr >= KVM_MAX_VCPUS)
121 return 3; /* not operational */
122
123 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
124 if (!inti)
125 return -ENOMEM;
126
127 inti->type = KVM_S390_INT_EXTERNAL_CALL;
128 inti->extcall.code = vcpu->vcpu_id;
129
130 spin_lock(&fi->lock);
131 li = fi->local_int[cpu_addr];
132 if (li == NULL) {
133 rc = 3; /* not operational */
134 kfree(inti);
135 goto unlock;
136 }
137 spin_lock_bh(&li->lock);
138 list_add_tail(&inti->list, &li->list);
139 atomic_set(&li->active, 1);
140 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
141 if (waitqueue_active(&li->wq))
142 wake_up_interruptible(&li->wq);
143 spin_unlock_bh(&li->lock);
144 rc = 0; /* order accepted */
145 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
106unlock: 146unlock:
107 spin_unlock(&fi->lock); 147 spin_unlock(&fi->lock);
108 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
109 return rc; 148 return rc;
110} 149}
111 150
@@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
267 rc = __sigp_sense(vcpu, cpu_addr, 306 rc = __sigp_sense(vcpu, cpu_addr,
268 &vcpu->arch.guest_gprs[r1]); 307 &vcpu->arch.guest_gprs[r1]);
269 break; 308 break;
309 case SIGP_EXTERNAL_CALL:
310 vcpu->stat.instruction_sigp_external_call++;
311 rc = __sigp_external_call(vcpu, cpu_addr);
312 break;
270 case SIGP_EMERGENCY: 313 case SIGP_EMERGENCY:
271 vcpu->stat.instruction_sigp_emergency++; 314 vcpu->stat.instruction_sigp_emergency++;
272 rc = __sigp_emergency(vcpu, cpu_addr); 315 rc = __sigp_emergency(vcpu, cpu_addr);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 34595d5e1038..3925d8007864 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -100,7 +100,9 @@
100#define APIC_TIMER_BASE_CLKIN 0x0 100#define APIC_TIMER_BASE_CLKIN 0x0
101#define APIC_TIMER_BASE_TMBASE 0x1 101#define APIC_TIMER_BASE_TMBASE 0x1
102#define APIC_TIMER_BASE_DIV 0x2 102#define APIC_TIMER_BASE_DIV 0x2
103#define APIC_LVT_TIMER_ONESHOT (0 << 17)
103#define APIC_LVT_TIMER_PERIODIC (1 << 17) 104#define APIC_LVT_TIMER_PERIODIC (1 << 17)
105#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
104#define APIC_LVT_MASKED (1 << 16) 106#define APIC_LVT_MASKED (1 << 16)
105#define APIC_LVT_LEVEL_TRIGGER (1 << 15) 107#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
106#define APIC_LVT_REMOTE_IRR (1 << 14) 108#define APIC_LVT_REMOTE_IRR (1 << 14)
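With the TSC-deadline mode added above, bits 18:17 of the local APIC LVT timer register form a two-bit mode field rather than a single periodic flag. A small self-contained decode sketch (the constants are restated locally and the mask macro is the example's own, not from the header):

#define LVT_TIMER_ONESHOT      (0 << 17)
#define LVT_TIMER_PERIODIC     (1 << 17)
#define LVT_TIMER_TSCDEADLINE  (2 << 17)
#define LVT_TIMER_MODE_MASK    (3 << 17)        /* bits 18:17 */

static int lvtt_mode(unsigned int lvt_timer)
{
        switch (lvt_timer & LVT_TIMER_MODE_MASK) {
        case LVT_TIMER_ONESHOT:     return 0;   /* one-shot */
        case LVT_TIMER_PERIODIC:    return 1;   /* periodic */
        case LVT_TIMER_TSCDEADLINE: return 2;   /* TSC deadline */
        default:                    return -1;  /* reserved encoding */
        }
}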
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aa6a488cd075..2f84a433b6a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -121,6 +121,7 @@
121#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 121#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
122#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ 122#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
123#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ 123#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
124#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */
124#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 125#define X86_FEATURE_AES (4*32+25) /* AES instructions */
125#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 126#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
126#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 127#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
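X86_FEATURE_TSC_DEADLINE_TIMER sits in feature word 4, i.e. CPUID leaf 1, ECX bit 24. A quick userspace check using GCC's cpuid.h instead of the kernel's cpu_has machinery:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 24)))
                puts("TSC-deadline timer supported");
        else
                puts("TSC-deadline timer not supported");
        return 0;
}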
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 6040d115ef51..a026507893e9 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -262,7 +262,7 @@ struct x86_emulate_ctxt {
262 struct operand dst; 262 struct operand dst;
263 bool has_seg_override; 263 bool has_seg_override;
264 u8 seg_override; 264 u8 seg_override;
265 unsigned int d; 265 u64 d;
266 int (*execute)(struct x86_emulate_ctxt *ctxt); 266 int (*execute)(struct x86_emulate_ctxt *ctxt);
267 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 267 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
268 /* modrm */ 268 /* modrm */
@@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
275 unsigned long _eip; 275 unsigned long _eip;
276 /* Fields above regs are cleared together. */ 276 /* Fields above regs are cleared together. */
277 unsigned long regs[NR_VCPU_REGS]; 277 unsigned long regs[NR_VCPU_REGS];
278 struct operand memop;
279 struct operand *memopp;
278 struct fetch_cache fetch; 280 struct fetch_cache fetch;
279 struct read_cache io_read; 281 struct read_cache io_read;
280 struct read_cache mem_read; 282 struct read_cache mem_read;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd51c83aa5de..b4973f4dab98 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,7 +26,8 @@
26#include <asm/mtrr.h> 26#include <asm/mtrr.h>
27#include <asm/msr-index.h> 27#include <asm/msr-index.h>
28 28
29#define KVM_MAX_VCPUS 64 29#define KVM_MAX_VCPUS 254
30#define KVM_SOFT_MAX_VCPUS 64
30#define KVM_MEMORY_SLOTS 32 31#define KVM_MEMORY_SLOTS 32
31/* memory slots that does not exposed to userspace */ 32/* memory slots that does not exposed to userspace */
32#define KVM_PRIVATE_MEM_SLOTS 4 33#define KVM_PRIVATE_MEM_SLOTS 4
@@ -264,6 +265,7 @@ struct kvm_mmu {
264 void (*new_cr3)(struct kvm_vcpu *vcpu); 265 void (*new_cr3)(struct kvm_vcpu *vcpu);
265 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 266 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
266 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 267 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
268 u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
267 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, 269 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
268 bool prefault); 270 bool prefault);
269 void (*inject_page_fault)(struct kvm_vcpu *vcpu, 271 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
@@ -411,8 +413,9 @@ struct kvm_vcpu_arch {
411 u32 tsc_catchup_mult; 413 u32 tsc_catchup_mult;
412 s8 tsc_catchup_shift; 414 s8 tsc_catchup_shift;
413 415
414 bool nmi_pending; 416 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
415 bool nmi_injected; 417 unsigned nmi_pending; /* NMI queued after currently running handler */
418 bool nmi_injected; /* Trying to inject an NMI this entry */
416 419
417 struct mtrr_state_type mtrr_state; 420 struct mtrr_state_type mtrr_state;
418 u32 pat; 421 u32 pat;
@@ -628,14 +631,13 @@ struct kvm_x86_ops {
628 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 631 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
629 632
630 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 633 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
634 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
631 635
632 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 636 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
633 637
634 int (*check_intercept)(struct kvm_vcpu *vcpu, 638 int (*check_intercept)(struct kvm_vcpu *vcpu,
635 struct x86_instruction_info *info, 639 struct x86_instruction_info *info,
636 enum x86_intercept_stage stage); 640 enum x86_intercept_stage stage);
637
638 const struct trace_print_flags *exit_reasons_str;
639}; 641};
640 642
641struct kvm_arch_async_pf { 643struct kvm_arch_async_pf {
@@ -672,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
672 674
673extern bool tdp_enabled; 675extern bool tdp_enabled;
674 676
677u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
678
675/* control of guest tsc rate supported? */ 679/* control of guest tsc rate supported? */
676extern bool kvm_has_tsc_control; 680extern bool kvm_has_tsc_control;
677/* minimum supported tsc_khz for guests */ 681/* minimum supported tsc_khz for guests */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d52609aeeab8..a6962d9161a0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -229,6 +229,8 @@
229#define MSR_IA32_APICBASE_ENABLE (1<<11) 229#define MSR_IA32_APICBASE_ENABLE (1<<11)
230#define MSR_IA32_APICBASE_BASE (0xfffff<<12) 230#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
231 231
232#define MSR_IA32_TSCDEADLINE 0x000006e0
233
232#define MSR_IA32_UCODE_WRITE 0x00000079 234#define MSR_IA32_UCODE_WRITE 0x00000079
233#define MSR_IA32_UCODE_REV 0x0000008b 235#define MSR_IA32_UCODE_REV 0x0000008b
234 236
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2caf290e9895..31f180c21ce9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -350,6 +350,18 @@ enum vmcs_field {
350#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ 350#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
351 351
352 352
353/*
354 * Exit Qualifications for APIC-Access
355 */
356#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */
357#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */
358#define TYPE_LINEAR_APIC_INST_READ (0 << 12)
359#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12)
360#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12)
361#define TYPE_LINEAR_APIC_EVENT (3 << 12)
362#define TYPE_PHYSICAL_APIC_EVENT (10 << 12)
363#define TYPE_PHYSICAL_APIC_INST (15 << 12)
364
353/* segment AR */ 365/* segment AR */
354#define SEGMENT_AR_L_MASK (1 << 13) 366#define SEGMENT_AR_L_MASK (1 << 13)
355 367
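
The APIC-access exit-qualification constants added above split the qualification word into a 12-bit offset within the APIC page and a 4-bit access type. A minimal standalone sketch of that decode, assuming nothing beyond the masks themselves; the helper and the sample value are illustrative, not part of this patch:

#include <stdio.h>

/* Masks as defined above. */
#define APIC_ACCESS_OFFSET 0xfff        /* 11:0, offset within the APIC page */
#define APIC_ACCESS_TYPE   0xf000       /* 15:12, access type */
#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12)

/* Illustrative helper: pull the two fields out of an exit qualification. */
static void decode_apic_access(unsigned long qual)
{
        unsigned long offset = qual & APIC_ACCESS_OFFSET;
        unsigned long type   = qual & APIC_ACCESS_TYPE;

        printf("offset=0x%03lx type=%lu%s\n", offset, type >> 12,
               type == TYPE_LINEAR_APIC_INST_WRITE ? " (linear write)" : "");
}

int main(void)
{
        decode_apic_access(0x1080);     /* hypothetical: linear write at APIC offset 0x80 (the TPR) */
        return 0;
}
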
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8b4cc5f067de..f1e3be18a08f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,39 @@
29#include "tss.h" 29#include "tss.h"
30 30
31/* 31/*
32 * Operand types
33 */
34#define OpNone 0ull
35#define OpImplicit 1ull /* No generic decode */
36#define OpReg 2ull /* Register */
37#define OpMem 3ull /* Memory */
38#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
39#define OpDI 5ull /* ES:DI/EDI/RDI */
40#define OpMem64 6ull /* Memory, 64-bit */
41#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
42#define OpDX 8ull /* DX register */
43#define OpCL 9ull /* CL register (for shifts) */
44#define OpImmByte 10ull /* 8-bit sign extended immediate */
45#define OpOne 11ull /* Implied 1 */
46#define OpImm 12ull /* Sign extended immediate */
47#define OpMem16 13ull /* Memory operand (16-bit). */
48#define OpMem32 14ull /* Memory operand (32-bit). */
49#define OpImmU 15ull /* Immediate operand, zero extended */
50#define OpSI 16ull /* SI/ESI/RSI */
51#define OpImmFAddr 17ull /* Immediate far address */
52#define OpMemFAddr 18ull /* Far address in memory */
53#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
54#define OpES 20ull /* ES */
55#define OpCS 21ull /* CS */
56#define OpSS 22ull /* SS */
57#define OpDS 23ull /* DS */
58#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */
60
61#define OpBits 5 /* Width of operand field */
62#define OpMask ((1ull << OpBits) - 1)
63
64/*
32 * Opcode effective-address decode tables. 65 * Opcode effective-address decode tables.
33 * Note that we only emulate instructions that have at least one memory 66 * Note that we only emulate instructions that have at least one memory
34 * operand (excluding implicit stack references). We assume that stack 67 * operand (excluding implicit stack references). We assume that stack
@@ -40,37 +73,35 @@
40/* Operand sizes: 8-bit operands or specified/overridden size. */ 73/* Operand sizes: 8-bit operands or specified/overridden size. */
41#define ByteOp (1<<0) /* 8-bit operands. */ 74#define ByteOp (1<<0) /* 8-bit operands. */
42/* Destination operand type. */ 75/* Destination operand type. */
43#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 76#define DstShift 1
44#define DstReg (2<<1) /* Register operand. */ 77#define ImplicitOps (OpImplicit << DstShift)
45#define DstMem (3<<1) /* Memory operand. */ 78#define DstReg (OpReg << DstShift)
46#define DstAcc (4<<1) /* Destination Accumulator */ 79#define DstMem (OpMem << DstShift)
47#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 80#define DstAcc (OpAcc << DstShift)
48#define DstMem64 (6<<1) /* 64bit memory operand */ 81#define DstDI (OpDI << DstShift)
49#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ 82#define DstMem64 (OpMem64 << DstShift)
50#define DstDX (8<<1) /* Destination is in DX register */ 83#define DstImmUByte (OpImmUByte << DstShift)
51#define DstMask (0xf<<1) 84#define DstDX (OpDX << DstShift)
85#define DstMask (OpMask << DstShift)
52/* Source operand type. */ 86/* Source operand type. */
53#define SrcNone (0<<5) /* No source operand. */ 87#define SrcShift 6
54#define SrcReg (1<<5) /* Register operand. */ 88#define SrcNone (OpNone << SrcShift)
55#define SrcMem (2<<5) /* Memory operand. */ 89#define SrcReg (OpReg << SrcShift)
56#define SrcMem16 (3<<5) /* Memory operand (16-bit). */ 90#define SrcMem (OpMem << SrcShift)
57#define SrcMem32 (4<<5) /* Memory operand (32-bit). */ 91#define SrcMem16 (OpMem16 << SrcShift)
58#define SrcImm (5<<5) /* Immediate operand. */ 92#define SrcMem32 (OpMem32 << SrcShift)
59#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */ 93#define SrcImm (OpImm << SrcShift)
60#define SrcOne (7<<5) /* Implied '1' */ 94#define SrcImmByte (OpImmByte << SrcShift)
61#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */ 95#define SrcOne (OpOne << SrcShift)
62#define SrcImmU (9<<5) /* Immediate operand, unsigned */ 96#define SrcImmUByte (OpImmUByte << SrcShift)
63#define SrcSI (0xa<<5) /* Source is in the DS:RSI */ 97#define SrcImmU (OpImmU << SrcShift)
64#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */ 98#define SrcSI (OpSI << SrcShift)
65#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */ 99#define SrcImmFAddr (OpImmFAddr << SrcShift)
66#define SrcAcc (0xd<<5) /* Source Accumulator */ 100#define SrcMemFAddr (OpMemFAddr << SrcShift)
67#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ 101#define SrcAcc (OpAcc << SrcShift)
68#define SrcDX (0xf<<5) /* Source is in DX register */ 102#define SrcImmU16 (OpImmU16 << SrcShift)
69#define SrcMask (0xf<<5) 103#define SrcDX (OpDX << SrcShift)
70/* Generic ModRM decode. */ 104#define SrcMask (OpMask << SrcShift)
71#define ModRM (1<<9)
72/* Destination is only written; never read. */
73#define Mov (1<<10)
74#define BitOp (1<<11) 105#define BitOp (1<<11)
75#define MemAbs (1<<12) /* Memory operand is absolute displacement */ 106#define MemAbs (1<<12) /* Memory operand is absolute displacement */
76#define String (1<<13) /* String instruction (rep capable) */ 107#define String (1<<13) /* String instruction (rep capable) */
@@ -81,6 +112,10 @@
81#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ 112#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
82#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 113#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
83#define Sse (1<<18) /* SSE Vector instruction */ 114#define Sse (1<<18) /* SSE Vector instruction */
115/* Generic ModRM decode. */
116#define ModRM (1<<19)
117/* Destination is only written; never read. */
118#define Mov (1<<20)
84/* Misc flags */ 119/* Misc flags */
85#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ 120#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
86#define VendorSpecific (1<<22) /* Vendor specific instruction */ 121#define VendorSpecific (1<<22) /* Vendor specific instruction */
@@ -91,12 +126,19 @@
91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
92#define No64 (1<<28) 127#define No64 (1<<28)
93/* Source 2 operand type */ 128/* Source 2 operand type */
94#define Src2None (0<<29) 129#define Src2Shift (29)
95#define Src2CL (1<<29) 130#define Src2None (OpNone << Src2Shift)
96#define Src2ImmByte (2<<29) 131#define Src2CL (OpCL << Src2Shift)
97#define Src2One (3<<29) 132#define Src2ImmByte (OpImmByte << Src2Shift)
98#define Src2Imm (4<<29) 133#define Src2One (OpOne << Src2Shift)
99#define Src2Mask (7<<29) 134#define Src2Imm (OpImm << Src2Shift)
135#define Src2ES (OpES << Src2Shift)
136#define Src2CS (OpCS << Src2Shift)
137#define Src2SS (OpSS << Src2Shift)
138#define Src2DS (OpDS << Src2Shift)
139#define Src2FS (OpFS << Src2Shift)
140#define Src2GS (OpGS << Src2Shift)
141#define Src2Mask (OpMask << Src2Shift)
100 142
101#define X2(x...) x, x 143#define X2(x...) x, x
102#define X3(x...) X2(x), x 144#define X3(x...) X2(x), x
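
With this rewrite each operand slot of an instruction descriptor carries a 5-bit operand type (OpBits wide), packed into the 64-bit d word at DstShift, SrcShift and Src2Shift. A small self-contained sketch of how such a packed descriptor is built and unpacked again; the constants mirror the definitions above, while the sample entry and the main() driver are purely illustrative:

#include <stdio.h>

/* Mirrors the encoding above: 5 bits per operand type. */
#define OpReg     2ull
#define OpMem     3ull
#define OpCL      9ull
#define OpImm    12ull

#define OpBits    5
#define OpMask    ((1ull << OpBits) - 1)

#define DstShift  1
#define SrcShift  6
#define Src2Shift 29

int main(void)
{
        /* Hypothetical entry: memory destination, immediate source, CL as src2. */
        unsigned long long d = (OpMem << DstShift) | (OpImm << SrcShift) |
                               (OpCL << Src2Shift);

        printf("dst=%llu src=%llu src2=%llu\n",
               (d >> DstShift) & OpMask,        /*  3 -> OpMem */
               (d >> SrcShift) & OpMask,        /* 12 -> OpImm */
               (d >> Src2Shift) & OpMask);      /*  9 -> OpCL  */
        return 0;
}
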
@@ -108,8 +150,8 @@
108#define X16(x...) X8(x), X8(x) 150#define X16(x...) X8(x), X8(x)
109 151
110struct opcode { 152struct opcode {
111 u32 flags; 153 u64 flags : 56;
112 u8 intercept; 154 u64 intercept : 8;
113 union { 155 union {
114 int (*execute)(struct x86_emulate_ctxt *ctxt); 156 int (*execute)(struct x86_emulate_ctxt *ctxt);
115 struct opcode *group; 157 struct opcode *group;
@@ -205,105 +247,100 @@ struct gprefix {
205#define ON64(x) 247#define ON64(x)
206#endif 248#endif
207 249
208#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ 250#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
209 do { \ 251 do { \
210 __asm__ __volatile__ ( \ 252 __asm__ __volatile__ ( \
211 _PRE_EFLAGS("0", "4", "2") \ 253 _PRE_EFLAGS("0", "4", "2") \
212 _op _suffix " %"_x"3,%1; " \ 254 _op _suffix " %"_x"3,%1; " \
213 _POST_EFLAGS("0", "4", "2") \ 255 _POST_EFLAGS("0", "4", "2") \
214 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ 256 : "=m" ((ctxt)->eflags), \
257 "+q" (*(_dsttype*)&(ctxt)->dst.val), \
215 "=&r" (_tmp) \ 258 "=&r" (_tmp) \
216 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 259 : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
217 } while (0) 260 } while (0)
218 261
219 262
220/* Raw emulation: instruction has two explicit operands. */ 263/* Raw emulation: instruction has two explicit operands. */
221#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ 264#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
222 do { \ 265 do { \
223 unsigned long _tmp; \ 266 unsigned long _tmp; \
224 \ 267 \
225 switch ((_dst).bytes) { \ 268 switch ((ctxt)->dst.bytes) { \
226 case 2: \ 269 case 2: \
227 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ 270 ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
228 break; \ 271 break; \
229 case 4: \ 272 case 4: \
230 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ 273 ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
231 break; \ 274 break; \
232 case 8: \ 275 case 8: \
233 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ 276 ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
234 break; \ 277 break; \
235 } \ 278 } \
236 } while (0) 279 } while (0)
237 280
238#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 281#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
239 do { \ 282 do { \
240 unsigned long _tmp; \ 283 unsigned long _tmp; \
241 switch ((_dst).bytes) { \ 284 switch ((ctxt)->dst.bytes) { \
242 case 1: \ 285 case 1: \
243 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ 286 ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
244 break; \ 287 break; \
245 default: \ 288 default: \
246 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 289 __emulate_2op_nobyte(ctxt, _op, \
247 _wx, _wy, _lx, _ly, _qx, _qy); \ 290 _wx, _wy, _lx, _ly, _qx, _qy); \
248 break; \ 291 break; \
249 } \ 292 } \
250 } while (0) 293 } while (0)
251 294
252/* Source operand is byte-sized and may be restricted to just %cl. */ 295/* Source operand is byte-sized and may be restricted to just %cl. */
253#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ 296#define emulate_2op_SrcB(ctxt, _op) \
254 __emulate_2op(_op, _src, _dst, _eflags, \ 297 __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
255 "b", "c", "b", "c", "b", "c", "b", "c")
256 298
257/* Source operand is byte, word, long or quad sized. */ 299/* Source operand is byte, word, long or quad sized. */
258#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ 300#define emulate_2op_SrcV(ctxt, _op) \
259 __emulate_2op(_op, _src, _dst, _eflags, \ 301 __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
260 "b", "q", "w", "r", _LO32, "r", "", "r")
261 302
262/* Source operand is word, long or quad sized. */ 303/* Source operand is word, long or quad sized. */
263#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ 304#define emulate_2op_SrcV_nobyte(ctxt, _op) \
264 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 305 __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
265 "w", "r", _LO32, "r", "", "r")
266 306
267/* Instruction has three operands and one operand is stored in ECX register */ 307/* Instruction has three operands and one operand is stored in ECX register */
268#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 308#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
269 do { \ 309 do { \
270 unsigned long _tmp; \ 310 unsigned long _tmp; \
271 _type _clv = (_cl).val; \ 311 _type _clv = (ctxt)->src2.val; \
272 _type _srcv = (_src).val; \ 312 _type _srcv = (ctxt)->src.val; \
273 _type _dstv = (_dst).val; \ 313 _type _dstv = (ctxt)->dst.val; \
274 \ 314 \
275 __asm__ __volatile__ ( \ 315 __asm__ __volatile__ ( \
276 _PRE_EFLAGS("0", "5", "2") \ 316 _PRE_EFLAGS("0", "5", "2") \
277 _op _suffix " %4,%1 \n" \ 317 _op _suffix " %4,%1 \n" \
278 _POST_EFLAGS("0", "5", "2") \ 318 _POST_EFLAGS("0", "5", "2") \
279 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 319 : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
280 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 320 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
281 ); \ 321 ); \
282 \ 322 \
283 (_cl).val = (unsigned long) _clv; \ 323 (ctxt)->src2.val = (unsigned long) _clv; \
284 (_src).val = (unsigned long) _srcv; \ 324 (ctxt)->src.val = (unsigned long) _srcv; \
285 (_dst).val = (unsigned long) _dstv; \ 325 (ctxt)->dst.val = (unsigned long) _dstv; \
286 } while (0) 326 } while (0)
287 327
288#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 328#define emulate_2op_cl(ctxt, _op) \
289 do { \ 329 do { \
290 switch ((_dst).bytes) { \ 330 switch ((ctxt)->dst.bytes) { \
291 case 2: \ 331 case 2: \
292 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 332 __emulate_2op_cl(ctxt, _op, "w", u16); \
293 "w", unsigned short); \
294 break; \ 333 break; \
295 case 4: \ 334 case 4: \
296 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 335 __emulate_2op_cl(ctxt, _op, "l", u32); \
297 "l", unsigned int); \
298 break; \ 336 break; \
299 case 8: \ 337 case 8: \
300 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 338 ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
301 "q", unsigned long)); \
302 break; \ 339 break; \
303 } \ 340 } \
304 } while (0) 341 } while (0)
305 342
306#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 343#define __emulate_1op(ctxt, _op, _suffix) \
307 do { \ 344 do { \
308 unsigned long _tmp; \ 345 unsigned long _tmp; \
309 \ 346 \
@@ -311,39 +348,27 @@ struct gprefix {
311 _PRE_EFLAGS("0", "3", "2") \ 348 _PRE_EFLAGS("0", "3", "2") \
312 _op _suffix " %1; " \ 349 _op _suffix " %1; " \
313 _POST_EFLAGS("0", "3", "2") \ 350 _POST_EFLAGS("0", "3", "2") \
314 : "=m" (_eflags), "+m" ((_dst).val), \ 351 : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
315 "=&r" (_tmp) \ 352 "=&r" (_tmp) \
316 : "i" (EFLAGS_MASK)); \ 353 : "i" (EFLAGS_MASK)); \
317 } while (0) 354 } while (0)
318 355
319/* Instruction has only one explicit operand (no source operand). */ 356/* Instruction has only one explicit operand (no source operand). */
320#define emulate_1op(_op, _dst, _eflags) \ 357#define emulate_1op(ctxt, _op) \
321 do { \ 358 do { \
322 switch ((_dst).bytes) { \ 359 switch ((ctxt)->dst.bytes) { \
323 case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ 360 case 1: __emulate_1op(ctxt, _op, "b"); break; \
324 case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ 361 case 2: __emulate_1op(ctxt, _op, "w"); break; \
325 case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ 362 case 4: __emulate_1op(ctxt, _op, "l"); break; \
326 case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ 363 case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
327 } \ 364 } \
328 } while (0) 365 } while (0)
329 366
330#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ 367#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
331 do { \
332 unsigned long _tmp; \
333 \
334 __asm__ __volatile__ ( \
335 _PRE_EFLAGS("0", "4", "1") \
336 _op _suffix " %5; " \
337 _POST_EFLAGS("0", "4", "1") \
338 : "=m" (_eflags), "=&r" (_tmp), \
339 "+a" (_rax), "+d" (_rdx) \
340 : "i" (EFLAGS_MASK), "m" ((_src).val), \
341 "a" (_rax), "d" (_rdx)); \
342 } while (0)
343
344#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
345 do { \ 368 do { \
346 unsigned long _tmp; \ 369 unsigned long _tmp; \
370 ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \
371 ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \
347 \ 372 \
348 __asm__ __volatile__ ( \ 373 __asm__ __volatile__ ( \
349 _PRE_EFLAGS("0", "5", "1") \ 374 _PRE_EFLAGS("0", "5", "1") \
@@ -356,53 +381,27 @@ struct gprefix {
356 "jmp 2b \n\t" \ 381 "jmp 2b \n\t" \
357 ".popsection \n\t" \ 382 ".popsection \n\t" \
358 _ASM_EXTABLE(1b, 3b) \ 383 _ASM_EXTABLE(1b, 3b) \
359 : "=m" (_eflags), "=&r" (_tmp), \ 384 : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
360 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ 385 "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
361 : "i" (EFLAGS_MASK), "m" ((_src).val), \ 386 : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \
362 "a" (_rax), "d" (_rdx)); \ 387 "a" (*rax), "d" (*rdx)); \
363 } while (0) 388 } while (0)
364 389
365/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ 390/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
366#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ 391#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
367 do { \ 392 do { \
368 switch((_src).bytes) { \ 393 switch((ctxt)->src.bytes) { \
369 case 1: \ 394 case 1: \
370 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 395 __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
371 _eflags, "b"); \
372 break; \ 396 break; \
373 case 2: \ 397 case 2: \
374 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 398 __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
375 _eflags, "w"); \
376 break; \ 399 break; \
377 case 4: \ 400 case 4: \
378 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 401 __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
379 _eflags, "l"); \
380 break; \
381 case 8: \
382 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
383 _eflags, "q")); \
384 break; \
385 } \
386 } while (0)
387
388#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
389 do { \
390 switch((_src).bytes) { \
391 case 1: \
392 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
393 _eflags, "b", _ex); \
394 break; \
395 case 2: \
396 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
397 _eflags, "w", _ex); \
398 break; \
399 case 4: \
400 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
401 _eflags, "l", _ex); \
402 break; \ 402 break; \
403 case 8: ON64( \ 403 case 8: ON64( \
404 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ 404 __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
405 _eflags, "q", _ex)); \
406 break; \ 405 break; \
407 } \ 406 } \
408 } while (0) 407 } while (0)
@@ -651,41 +650,50 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
651 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); 650 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
652} 651}
653 652
654static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, 653/*
655 unsigned long eip, u8 *dest) 654 * Fetch the next byte of the instruction being emulated which is pointed to
655 * by ctxt->_eip, then increment ctxt->_eip.
656 *
657 * Also prefetch the remaining bytes of the instruction without crossing a page
658 * boundary if they are not in fetch_cache yet.
659 */
660static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
656{ 661{
657 struct fetch_cache *fc = &ctxt->fetch; 662 struct fetch_cache *fc = &ctxt->fetch;
658 int rc; 663 int rc;
659 int size, cur_size; 664 int size, cur_size;
660 665
661 if (eip == fc->end) { 666 if (ctxt->_eip == fc->end) {
662 unsigned long linear; 667 unsigned long linear;
663 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; 668 struct segmented_address addr = { .seg = VCPU_SREG_CS,
669 .ea = ctxt->_eip };
664 cur_size = fc->end - fc->start; 670 cur_size = fc->end - fc->start;
665 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 671 size = min(15UL - cur_size,
672 PAGE_SIZE - offset_in_page(ctxt->_eip));
666 rc = __linearize(ctxt, addr, size, false, true, &linear); 673 rc = __linearize(ctxt, addr, size, false, true, &linear);
667 if (rc != X86EMUL_CONTINUE) 674 if (unlikely(rc != X86EMUL_CONTINUE))
668 return rc; 675 return rc;
669 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, 676 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
670 size, &ctxt->exception); 677 size, &ctxt->exception);
671 if (rc != X86EMUL_CONTINUE) 678 if (unlikely(rc != X86EMUL_CONTINUE))
672 return rc; 679 return rc;
673 fc->end += size; 680 fc->end += size;
674 } 681 }
675 *dest = fc->data[eip - fc->start]; 682 *dest = fc->data[ctxt->_eip - fc->start];
683 ctxt->_eip++;
676 return X86EMUL_CONTINUE; 684 return X86EMUL_CONTINUE;
677} 685}
678 686
679static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 687static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
680 unsigned long eip, void *dest, unsigned size) 688 void *dest, unsigned size)
681{ 689{
682 int rc; 690 int rc;
683 691
684 /* x86 instructions are limited to 15 bytes. */ 692 /* x86 instructions are limited to 15 bytes. */
685 if (eip + size - ctxt->eip > 15) 693 if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
686 return X86EMUL_UNHANDLEABLE; 694 return X86EMUL_UNHANDLEABLE;
687 while (size--) { 695 while (size--) {
688 rc = do_insn_fetch_byte(ctxt, eip++, dest++); 696 rc = do_insn_fetch_byte(ctxt, dest++);
689 if (rc != X86EMUL_CONTINUE) 697 if (rc != X86EMUL_CONTINUE)
690 return rc; 698 return rc;
691 } 699 }
@@ -693,20 +701,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
693} 701}
694 702
695/* Fetch next part of the instruction being emulated. */ 703/* Fetch next part of the instruction being emulated. */
696#define insn_fetch(_type, _size, _eip) \ 704#define insn_fetch(_type, _ctxt) \
697({ unsigned long _x; \ 705({ unsigned long _x; \
698 rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ 706 rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
699 if (rc != X86EMUL_CONTINUE) \ 707 if (rc != X86EMUL_CONTINUE) \
700 goto done; \ 708 goto done; \
701 (_eip) += (_size); \
702 (_type)_x; \ 709 (_type)_x; \
703}) 710})
704 711
705#define insn_fetch_arr(_arr, _size, _eip) \ 712#define insn_fetch_arr(_arr, _size, _ctxt) \
706({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ 713({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
707 if (rc != X86EMUL_CONTINUE) \ 714 if (rc != X86EMUL_CONTINUE) \
708 goto done; \ 715 goto done; \
709 (_eip) += (_size); \
710}) 716})
711 717
712/* 718/*
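
do_insn_fetch_byte() now works entirely off ctxt->_eip: it refills the fetch cache from guest memory, at most up to the 15-byte instruction limit and never across a page boundary, then hands back one byte and advances _eip. A simplified, self-contained sketch of that caching pattern; the read_bytes() stub, the fixed 4 KiB page size and the struct layout are assumptions of the sketch, not the kernel's API:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE  4096UL
#define INSN_LIMIT 15UL

struct fetch_cache {
        unsigned char data[INSN_LIMIT];
        unsigned long start;    /* address of data[0] */
        unsigned long end;      /* first address not yet cached */
};

/* Stand-in for the real "read guest memory" hook. */
static void read_bytes(unsigned long addr, void *dst, unsigned long len)
{
        memset(dst, 0x90, len); /* pretend the guest code is all NOPs */
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Return the byte at *ip, refilling the cache without crossing a page. */
static int fetch_byte(struct fetch_cache *fc, unsigned long *ip, unsigned char *dest)
{
        if (*ip == fc->end) {
                unsigned long cached = fc->end - fc->start;
                unsigned long size = min_ul(INSN_LIMIT - cached,
                                            PAGE_SIZE - (*ip & (PAGE_SIZE - 1)));

                if (!size)
                        return -1;      /* would exceed 15 bytes or cross a page */
                read_bytes(*ip, fc->data + cached, size);
                fc->end += size;
        }
        *dest = fc->data[*ip - fc->start];
        (*ip)++;
        return 0;
}

int main(void)
{
        struct fetch_cache fc = { .start = 0x1000, .end = 0x1000 };
        unsigned long ip = 0x1000;
        unsigned char b;

        if (!fetch_byte(&fc, &ip, &b))
                printf("fetched 0x%02x, ip now 0x%lx\n", b, ip);
        return 0;
}
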
@@ -894,7 +900,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
894 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ 900 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
895 } 901 }
896 902
897 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 903 ctxt->modrm = insn_fetch(u8, ctxt);
898 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; 904 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
899 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; 905 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
900 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 906 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -928,13 +934,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
928 switch (ctxt->modrm_mod) { 934 switch (ctxt->modrm_mod) {
929 case 0: 935 case 0:
930 if (ctxt->modrm_rm == 6) 936 if (ctxt->modrm_rm == 6)
931 modrm_ea += insn_fetch(u16, 2, ctxt->_eip); 937 modrm_ea += insn_fetch(u16, ctxt);
932 break; 938 break;
933 case 1: 939 case 1:
934 modrm_ea += insn_fetch(s8, 1, ctxt->_eip); 940 modrm_ea += insn_fetch(s8, ctxt);
935 break; 941 break;
936 case 2: 942 case 2:
937 modrm_ea += insn_fetch(u16, 2, ctxt->_eip); 943 modrm_ea += insn_fetch(u16, ctxt);
938 break; 944 break;
939 } 945 }
940 switch (ctxt->modrm_rm) { 946 switch (ctxt->modrm_rm) {
@@ -971,13 +977,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
971 } else { 977 } else {
972 /* 32/64-bit ModR/M decode. */ 978 /* 32/64-bit ModR/M decode. */
973 if ((ctxt->modrm_rm & 7) == 4) { 979 if ((ctxt->modrm_rm & 7) == 4) {
974 sib = insn_fetch(u8, 1, ctxt->_eip); 980 sib = insn_fetch(u8, ctxt);
975 index_reg |= (sib >> 3) & 7; 981 index_reg |= (sib >> 3) & 7;
976 base_reg |= sib & 7; 982 base_reg |= sib & 7;
977 scale = sib >> 6; 983 scale = sib >> 6;
978 984
979 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 985 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
980 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 986 modrm_ea += insn_fetch(s32, ctxt);
981 else 987 else
982 modrm_ea += ctxt->regs[base_reg]; 988 modrm_ea += ctxt->regs[base_reg];
983 if (index_reg != 4) 989 if (index_reg != 4)
@@ -990,13 +996,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
990 switch (ctxt->modrm_mod) { 996 switch (ctxt->modrm_mod) {
991 case 0: 997 case 0:
992 if (ctxt->modrm_rm == 5) 998 if (ctxt->modrm_rm == 5)
993 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 999 modrm_ea += insn_fetch(s32, ctxt);
994 break; 1000 break;
995 case 1: 1001 case 1:
996 modrm_ea += insn_fetch(s8, 1, ctxt->_eip); 1002 modrm_ea += insn_fetch(s8, ctxt);
997 break; 1003 break;
998 case 2: 1004 case 2:
999 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 1005 modrm_ea += insn_fetch(s32, ctxt);
1000 break; 1006 break;
1001 } 1007 }
1002 } 1008 }
@@ -1013,13 +1019,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
1013 op->type = OP_MEM; 1019 op->type = OP_MEM;
1014 switch (ctxt->ad_bytes) { 1020 switch (ctxt->ad_bytes) {
1015 case 2: 1021 case 2:
1016 op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); 1022 op->addr.mem.ea = insn_fetch(u16, ctxt);
1017 break; 1023 break;
1018 case 4: 1024 case 4:
1019 op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); 1025 op->addr.mem.ea = insn_fetch(u32, ctxt);
1020 break; 1026 break;
1021 case 8: 1027 case 8:
1022 op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); 1028 op->addr.mem.ea = insn_fetch(u64, ctxt);
1023 break; 1029 break;
1024 } 1030 }
1025done: 1031done:
@@ -1452,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1452 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1458 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1453} 1459}
1454 1460
1455static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1461static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1456{ 1462{
1463 int seg = ctxt->src2.val;
1464
1457 ctxt->src.val = get_segment_selector(ctxt, seg); 1465 ctxt->src.val = get_segment_selector(ctxt, seg);
1458 1466
1459 return em_push(ctxt); 1467 return em_push(ctxt);
1460} 1468}
1461 1469
1462static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1470static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
1463{ 1471{
1472 int seg = ctxt->src2.val;
1464 unsigned long selector; 1473 unsigned long selector;
1465 int rc; 1474 int rc;
1466 1475
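
em_push_sreg()/em_pop_sreg() no longer take the segment as a parameter: the Src2ES…Src2GS decode flags deposit the segment index in ctxt->src2.val, so a single handler serves every segment push/pop (em_lseg below uses the same trick for les/lds/lss/lfs/lgs). A tiny illustrative sketch of the pattern, with a made-up context struct and segment constants that only mirror the VCPU_SREG_* ordering:

#include <stdio.h>

/* Illustrative segment indices, mirroring the VCPU_SREG_* ordering. */
enum { SREG_ES, SREG_CS, SREG_SS, SREG_DS, SREG_FS, SREG_GS };

struct ctxt {
        int src2_val;   /* filled in by generic operand decode (OpES..OpGS) */
};

/* One handler for every "push <seg>" opcode: the segment comes from decode. */
static int em_push_sreg(struct ctxt *ctxt)
{
        printf("push segment register %d\n", ctxt->src2_val);
        return 0;
}

int main(void)
{
        struct ctxt c = { .src2_val = SREG_FS };        /* as if decoded from 0x0f 0xa0 (push fs) */
        return em_push_sreg(&c);
}
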
@@ -1674,64 +1683,74 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
1674{ 1683{
1675 switch (ctxt->modrm_reg) { 1684 switch (ctxt->modrm_reg) {
1676 case 0: /* rol */ 1685 case 0: /* rol */
1677 emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); 1686 emulate_2op_SrcB(ctxt, "rol");
1678 break; 1687 break;
1679 case 1: /* ror */ 1688 case 1: /* ror */
1680 emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); 1689 emulate_2op_SrcB(ctxt, "ror");
1681 break; 1690 break;
1682 case 2: /* rcl */ 1691 case 2: /* rcl */
1683 emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); 1692 emulate_2op_SrcB(ctxt, "rcl");
1684 break; 1693 break;
1685 case 3: /* rcr */ 1694 case 3: /* rcr */
1686 emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); 1695 emulate_2op_SrcB(ctxt, "rcr");
1687 break; 1696 break;
1688 case 4: /* sal/shl */ 1697 case 4: /* sal/shl */
1689 case 6: /* sal/shl */ 1698 case 6: /* sal/shl */
1690 emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); 1699 emulate_2op_SrcB(ctxt, "sal");
1691 break; 1700 break;
1692 case 5: /* shr */ 1701 case 5: /* shr */
1693 emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); 1702 emulate_2op_SrcB(ctxt, "shr");
1694 break; 1703 break;
1695 case 7: /* sar */ 1704 case 7: /* sar */
1696 emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); 1705 emulate_2op_SrcB(ctxt, "sar");
1697 break; 1706 break;
1698 } 1707 }
1699 return X86EMUL_CONTINUE; 1708 return X86EMUL_CONTINUE;
1700} 1709}
1701 1710
1702static int em_grp3(struct x86_emulate_ctxt *ctxt) 1711static int em_not(struct x86_emulate_ctxt *ctxt)
1712{
1713 ctxt->dst.val = ~ctxt->dst.val;
1714 return X86EMUL_CONTINUE;
1715}
1716
1717static int em_neg(struct x86_emulate_ctxt *ctxt)
1718{
1719 emulate_1op(ctxt, "neg");
1720 return X86EMUL_CONTINUE;
1721}
1722
1723static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
1724{
1725 u8 ex = 0;
1726
1727 emulate_1op_rax_rdx(ctxt, "mul", ex);
1728 return X86EMUL_CONTINUE;
1729}
1730
1731static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
1732{
1733 u8 ex = 0;
1734
1735 emulate_1op_rax_rdx(ctxt, "imul", ex);
1736 return X86EMUL_CONTINUE;
1737}
1738
1739static int em_div_ex(struct x86_emulate_ctxt *ctxt)
1703{ 1740{
1704 unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
1705 unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
1706 u8 de = 0; 1741 u8 de = 0;
1707 1742
1708 switch (ctxt->modrm_reg) { 1743 emulate_1op_rax_rdx(ctxt, "div", de);
1709 case 0 ... 1: /* test */ 1744 if (de)
1710 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); 1745 return emulate_de(ctxt);
1711 break; 1746 return X86EMUL_CONTINUE;
1712 case 2: /* not */ 1747}
1713 ctxt->dst.val = ~ctxt->dst.val; 1748
1714 break; 1749static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
1715 case 3: /* neg */ 1750{
1716 emulate_1op("neg", ctxt->dst, ctxt->eflags); 1751 u8 de = 0;
1717 break; 1752
1718 case 4: /* mul */ 1753 emulate_1op_rax_rdx(ctxt, "idiv", de);
1719 emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
1720 break;
1721 case 5: /* imul */
1722 emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
1723 break;
1724 case 6: /* div */
1725 emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
1726 ctxt->eflags, de);
1727 break;
1728 case 7: /* idiv */
1729 emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
1730 ctxt->eflags, de);
1731 break;
1732 default:
1733 return X86EMUL_UNHANDLEABLE;
1734 }
1735 if (de) 1754 if (de)
1736 return emulate_de(ctxt); 1755 return emulate_de(ctxt);
1737 return X86EMUL_CONTINUE; 1756 return X86EMUL_CONTINUE;
@@ -1743,10 +1762,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1743 1762
1744 switch (ctxt->modrm_reg) { 1763 switch (ctxt->modrm_reg) {
1745 case 0: /* inc */ 1764 case 0: /* inc */
1746 emulate_1op("inc", ctxt->dst, ctxt->eflags); 1765 emulate_1op(ctxt, "inc");
1747 break; 1766 break;
1748 case 1: /* dec */ 1767 case 1: /* dec */
1749 emulate_1op("dec", ctxt->dst, ctxt->eflags); 1768 emulate_1op(ctxt, "dec");
1750 break; 1769 break;
1751 case 2: /* call near abs */ { 1770 case 2: /* call near abs */ {
1752 long int old_eip; 1771 long int old_eip;
@@ -1812,8 +1831,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1812 return rc; 1831 return rc;
1813} 1832}
1814 1833
1815static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) 1834static int em_lseg(struct x86_emulate_ctxt *ctxt)
1816{ 1835{
1836 int seg = ctxt->src2.val;
1817 unsigned short sel; 1837 unsigned short sel;
1818 int rc; 1838 int rc;
1819 1839
@@ -2452,7 +2472,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2452 ctxt->src.type = OP_IMM; 2472 ctxt->src.type = OP_IMM;
2453 ctxt->src.val = 0; 2473 ctxt->src.val = 0;
2454 ctxt->src.bytes = 1; 2474 ctxt->src.bytes = 1;
2455 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); 2475 emulate_2op_SrcV(ctxt, "or");
2456 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2476 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2457 if (cf) 2477 if (cf)
2458 ctxt->eflags |= X86_EFLAGS_CF; 2478 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2502,49 +2522,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2502 2522
2503static int em_add(struct x86_emulate_ctxt *ctxt) 2523static int em_add(struct x86_emulate_ctxt *ctxt)
2504{ 2524{
2505 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); 2525 emulate_2op_SrcV(ctxt, "add");
2506 return X86EMUL_CONTINUE; 2526 return X86EMUL_CONTINUE;
2507} 2527}
2508 2528
2509static int em_or(struct x86_emulate_ctxt *ctxt) 2529static int em_or(struct x86_emulate_ctxt *ctxt)
2510{ 2530{
2511 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); 2531 emulate_2op_SrcV(ctxt, "or");
2512 return X86EMUL_CONTINUE; 2532 return X86EMUL_CONTINUE;
2513} 2533}
2514 2534
2515static int em_adc(struct x86_emulate_ctxt *ctxt) 2535static int em_adc(struct x86_emulate_ctxt *ctxt)
2516{ 2536{
2517 emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); 2537 emulate_2op_SrcV(ctxt, "adc");
2518 return X86EMUL_CONTINUE; 2538 return X86EMUL_CONTINUE;
2519} 2539}
2520 2540
2521static int em_sbb(struct x86_emulate_ctxt *ctxt) 2541static int em_sbb(struct x86_emulate_ctxt *ctxt)
2522{ 2542{
2523 emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); 2543 emulate_2op_SrcV(ctxt, "sbb");
2524 return X86EMUL_CONTINUE; 2544 return X86EMUL_CONTINUE;
2525} 2545}
2526 2546
2527static int em_and(struct x86_emulate_ctxt *ctxt) 2547static int em_and(struct x86_emulate_ctxt *ctxt)
2528{ 2548{
2529 emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); 2549 emulate_2op_SrcV(ctxt, "and");
2530 return X86EMUL_CONTINUE; 2550 return X86EMUL_CONTINUE;
2531} 2551}
2532 2552
2533static int em_sub(struct x86_emulate_ctxt *ctxt) 2553static int em_sub(struct x86_emulate_ctxt *ctxt)
2534{ 2554{
2535 emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); 2555 emulate_2op_SrcV(ctxt, "sub");
2536 return X86EMUL_CONTINUE; 2556 return X86EMUL_CONTINUE;
2537} 2557}
2538 2558
2539static int em_xor(struct x86_emulate_ctxt *ctxt) 2559static int em_xor(struct x86_emulate_ctxt *ctxt)
2540{ 2560{
2541 emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); 2561 emulate_2op_SrcV(ctxt, "xor");
2542 return X86EMUL_CONTINUE; 2562 return X86EMUL_CONTINUE;
2543} 2563}
2544 2564
2545static int em_cmp(struct x86_emulate_ctxt *ctxt) 2565static int em_cmp(struct x86_emulate_ctxt *ctxt)
2546{ 2566{
2547 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); 2567 emulate_2op_SrcV(ctxt, "cmp");
2548 /* Disable writeback. */ 2568 /* Disable writeback. */
2549 ctxt->dst.type = OP_NONE; 2569 ctxt->dst.type = OP_NONE;
2550 return X86EMUL_CONTINUE; 2570 return X86EMUL_CONTINUE;
@@ -2552,7 +2572,9 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt)
2552 2572
2553static int em_test(struct x86_emulate_ctxt *ctxt) 2573static int em_test(struct x86_emulate_ctxt *ctxt)
2554{ 2574{
2555 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); 2575 emulate_2op_SrcV(ctxt, "test");
2576 /* Disable writeback. */
2577 ctxt->dst.type = OP_NONE;
2556 return X86EMUL_CONTINUE; 2578 return X86EMUL_CONTINUE;
2557} 2579}
2558 2580
@@ -2570,7 +2592,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
2570 2592
2571static int em_imul(struct x86_emulate_ctxt *ctxt) 2593static int em_imul(struct x86_emulate_ctxt *ctxt)
2572{ 2594{
2573 emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); 2595 emulate_2op_SrcV_nobyte(ctxt, "imul");
2574 return X86EMUL_CONTINUE; 2596 return X86EMUL_CONTINUE;
2575} 2597}
2576 2598
@@ -3025,9 +3047,14 @@ static struct opcode group1A[] = {
3025}; 3047};
3026 3048
3027static struct opcode group3[] = { 3049static struct opcode group3[] = {
3028 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), 3050 I(DstMem | SrcImm | ModRM, em_test),
3029 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), 3051 I(DstMem | SrcImm | ModRM, em_test),
3030 X4(D(SrcMem | ModRM)), 3052 I(DstMem | SrcNone | ModRM | Lock, em_not),
3053 I(DstMem | SrcNone | ModRM | Lock, em_neg),
3054 I(SrcMem | ModRM, em_mul_ex),
3055 I(SrcMem | ModRM, em_imul_ex),
3056 I(SrcMem | ModRM, em_div_ex),
3057 I(SrcMem | ModRM, em_idiv_ex),
3031}; 3058};
3032 3059
3033static struct opcode group4[] = { 3060static struct opcode group4[] = {
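
Group 3 (opcodes F6/F7) is now dispatched entirely through the opcode table: each ModRM /reg encoding gets its own em_*() callback instead of a switch inside em_grp3(). A minimal sketch of that table-of-handlers idea, with illustrative handler names and a trivial context struct standing in for the real emulator state:

#include <stdio.h>

struct ctxt { int modrm_reg; };

static int em_test(struct ctxt *c) { (void)c; puts("test"); return 0; }
static int em_not(struct ctxt *c)  { (void)c; puts("not");  return 0; }
static int em_neg(struct ctxt *c)  { (void)c; puts("neg");  return 0; }
static int em_mul(struct ctxt *c)  { (void)c; puts("mul");  return 0; }
static int em_imul(struct ctxt *c) { (void)c; puts("imul"); return 0; }
static int em_div(struct ctxt *c)  { (void)c; puts("div");  return 0; }
static int em_idiv(struct ctxt *c) { (void)c; puts("idiv"); return 0; }

/* One entry per ModRM /reg value of opcode group 3, as in the table above. */
static int (*const group3[8])(struct ctxt *) = {
        em_test, em_test, em_not, em_neg, em_mul, em_imul, em_div, em_idiv,
};

int main(void)
{
        struct ctxt c = { .modrm_reg = 3 };     /* F7 /3 == neg */
        return group3[c.modrm_reg](&c);
}
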
@@ -3090,16 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = {
3090static struct opcode opcode_table[256] = { 3117static struct opcode opcode_table[256] = {
3091 /* 0x00 - 0x07 */ 3118 /* 0x00 - 0x07 */
3092 I6ALU(Lock, em_add), 3119 I6ALU(Lock, em_add),
3093 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3120 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3121 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3094 /* 0x08 - 0x0F */ 3122 /* 0x08 - 0x0F */
3095 I6ALU(Lock, em_or), 3123 I6ALU(Lock, em_or),
3096 D(ImplicitOps | Stack | No64), N, 3124 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3125 N,
3097 /* 0x10 - 0x17 */ 3126 /* 0x10 - 0x17 */
3098 I6ALU(Lock, em_adc), 3127 I6ALU(Lock, em_adc),
3099 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3128 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
3129 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
3100 /* 0x18 - 0x1F */ 3130 /* 0x18 - 0x1F */
3101 I6ALU(Lock, em_sbb), 3131 I6ALU(Lock, em_sbb),
3102 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3132 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3133 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3103 /* 0x20 - 0x27 */ 3134 /* 0x20 - 0x27 */
3104 I6ALU(Lock, em_and), N, N, 3135 I6ALU(Lock, em_and), N, N,
3105 /* 0x28 - 0x2F */ 3136 /* 0x28 - 0x2F */
@@ -3167,7 +3198,8 @@ static struct opcode opcode_table[256] = {
3167 D2bv(DstMem | SrcImmByte | ModRM), 3198 D2bv(DstMem | SrcImmByte | ModRM),
3168 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3199 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3169 I(ImplicitOps | Stack, em_ret), 3200 I(ImplicitOps | Stack, em_ret),
3170 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), 3201 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
3202 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3171 G(ByteOp, group11), G(0, group11), 3203 G(ByteOp, group11), G(0, group11),
3172 /* 0xC8 - 0xCF */ 3204 /* 0xC8 - 0xCF */
3173 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3205 N, N, N, I(ImplicitOps | Stack, em_ret_far),
@@ -3242,20 +3274,22 @@ static struct opcode twobyte_table[256] = {
3242 /* 0x90 - 0x9F */ 3274 /* 0x90 - 0x9F */
3243 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3275 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3244 /* 0xA0 - 0xA7 */ 3276 /* 0xA0 - 0xA7 */
3245 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3277 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3246 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), 3278 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
3247 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3279 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3248 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3280 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3249 /* 0xA8 - 0xAF */ 3281 /* 0xA8 - 0xAF */
3250 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3282 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3251 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3283 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3252 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3284 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3253 D(DstMem | SrcReg | Src2CL | ModRM), 3285 D(DstMem | SrcReg | Src2CL | ModRM),
3254 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 3286 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
3255 /* 0xB0 - 0xB7 */ 3287 /* 0xB0 - 0xB7 */
3256 D2bv(DstMem | SrcReg | ModRM | Lock), 3288 D2bv(DstMem | SrcReg | ModRM | Lock),
3257 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3289 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3258 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), 3290 D(DstMem | SrcReg | ModRM | BitOp | Lock),
3291 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3292 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3259 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3293 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3260 /* 0xB8 - 0xBF */ 3294 /* 0xB8 - 0xBF */
3261 N, N, 3295 N, N,
@@ -3309,13 +3343,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3309 /* NB. Immediates are sign-extended as necessary. */ 3343 /* NB. Immediates are sign-extended as necessary. */
3310 switch (op->bytes) { 3344 switch (op->bytes) {
3311 case 1: 3345 case 1:
3312 op->val = insn_fetch(s8, 1, ctxt->_eip); 3346 op->val = insn_fetch(s8, ctxt);
3313 break; 3347 break;
3314 case 2: 3348 case 2:
3315 op->val = insn_fetch(s16, 2, ctxt->_eip); 3349 op->val = insn_fetch(s16, ctxt);
3316 break; 3350 break;
3317 case 4: 3351 case 4:
3318 op->val = insn_fetch(s32, 4, ctxt->_eip); 3352 op->val = insn_fetch(s32, ctxt);
3319 break; 3353 break;
3320 } 3354 }
3321 if (!sign_extension) { 3355 if (!sign_extension) {
@@ -3335,6 +3369,125 @@ done:
3335 return rc; 3369 return rc;
3336} 3370}
3337 3371
3372static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3373 unsigned d)
3374{
3375 int rc = X86EMUL_CONTINUE;
3376
3377 switch (d) {
3378 case OpReg:
3379 decode_register_operand(ctxt, op,
3380 op == &ctxt->dst &&
3381 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3382 break;
3383 case OpImmUByte:
3384 rc = decode_imm(ctxt, op, 1, false);
3385 break;
3386 case OpMem:
3387 ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3388 mem_common:
3389 *op = ctxt->memop;
3390 ctxt->memopp = op;
3391 if ((ctxt->d & BitOp) && op == &ctxt->dst)
3392 fetch_bit_operand(ctxt);
3393 op->orig_val = op->val;
3394 break;
3395 case OpMem64:
3396 ctxt->memop.bytes = 8;
3397 goto mem_common;
3398 case OpAcc:
3399 op->type = OP_REG;
3400 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3401 op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3402 fetch_register_operand(op);
3403 op->orig_val = op->val;
3404 break;
3405 case OpDI:
3406 op->type = OP_MEM;
3407 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3408 op->addr.mem.ea =
3409 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3410 op->addr.mem.seg = VCPU_SREG_ES;
3411 op->val = 0;
3412 break;
3413 case OpDX:
3414 op->type = OP_REG;
3415 op->bytes = 2;
3416 op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3417 fetch_register_operand(op);
3418 break;
3419 case OpCL:
3420 op->bytes = 1;
3421 op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
3422 break;
3423 case OpImmByte:
3424 rc = decode_imm(ctxt, op, 1, true);
3425 break;
3426 case OpOne:
3427 op->bytes = 1;
3428 op->val = 1;
3429 break;
3430 case OpImm:
3431 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
3432 break;
3433 case OpMem16:
3434 ctxt->memop.bytes = 2;
3435 goto mem_common;
3436 case OpMem32:
3437 ctxt->memop.bytes = 4;
3438 goto mem_common;
3439 case OpImmU16:
3440 rc = decode_imm(ctxt, op, 2, false);
3441 break;
3442 case OpImmU:
3443 rc = decode_imm(ctxt, op, imm_size(ctxt), false);
3444 break;
3445 case OpSI:
3446 op->type = OP_MEM;
3447 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3448 op->addr.mem.ea =
3449 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3450 op->addr.mem.seg = seg_override(ctxt);
3451 op->val = 0;
3452 break;
3453 case OpImmFAddr:
3454 op->type = OP_IMM;
3455 op->addr.mem.ea = ctxt->_eip;
3456 op->bytes = ctxt->op_bytes + 2;
3457 insn_fetch_arr(op->valptr, op->bytes, ctxt);
3458 break;
3459 case OpMemFAddr:
3460 ctxt->memop.bytes = ctxt->op_bytes + 2;
3461 goto mem_common;
3462 case OpES:
3463 op->val = VCPU_SREG_ES;
3464 break;
3465 case OpCS:
3466 op->val = VCPU_SREG_CS;
3467 break;
3468 case OpSS:
3469 op->val = VCPU_SREG_SS;
3470 break;
3471 case OpDS:
3472 op->val = VCPU_SREG_DS;
3473 break;
3474 case OpFS:
3475 op->val = VCPU_SREG_FS;
3476 break;
3477 case OpGS:
3478 op->val = VCPU_SREG_GS;
3479 break;
3480 case OpImplicit:
3481 /* Special instructions do their own operand decoding. */
3482 default:
3483 op->type = OP_NONE; /* Disable writeback. */
3484 break;
3485 }
3486
3487done:
3488 return rc;
3489}
3490
3338int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) 3491int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3339{ 3492{
3340 int rc = X86EMUL_CONTINUE; 3493 int rc = X86EMUL_CONTINUE;
@@ -3342,8 +3495,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3342 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 3495 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
3343 bool op_prefix = false; 3496 bool op_prefix = false;
3344 struct opcode opcode; 3497 struct opcode opcode;
3345 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3346 3498
3499 ctxt->memop.type = OP_NONE;
3500 ctxt->memopp = NULL;
3347 ctxt->_eip = ctxt->eip; 3501 ctxt->_eip = ctxt->eip;
3348 ctxt->fetch.start = ctxt->_eip; 3502 ctxt->fetch.start = ctxt->_eip;
3349 ctxt->fetch.end = ctxt->fetch.start + insn_len; 3503 ctxt->fetch.end = ctxt->fetch.start + insn_len;
@@ -3366,7 +3520,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3366 break; 3520 break;
3367#endif 3521#endif
3368 default: 3522 default:
3369 return -1; 3523 return EMULATION_FAILED;
3370 } 3524 }
3371 3525
3372 ctxt->op_bytes = def_op_bytes; 3526 ctxt->op_bytes = def_op_bytes;
@@ -3374,7 +3528,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3374 3528
3375 /* Legacy prefixes. */ 3529 /* Legacy prefixes. */
3376 for (;;) { 3530 for (;;) {
3377 switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { 3531 switch (ctxt->b = insn_fetch(u8, ctxt)) {
3378 case 0x66: /* operand-size override */ 3532 case 0x66: /* operand-size override */
3379 op_prefix = true; 3533 op_prefix = true;
3380 /* switch between 2/4 bytes */ 3534 /* switch between 2/4 bytes */
@@ -3430,7 +3584,7 @@ done_prefixes:
3430 /* Two-byte opcode? */ 3584 /* Two-byte opcode? */
3431 if (ctxt->b == 0x0f) { 3585 if (ctxt->b == 0x0f) {
3432 ctxt->twobyte = 1; 3586 ctxt->twobyte = 1;
3433 ctxt->b = insn_fetch(u8, 1, ctxt->_eip); 3587 ctxt->b = insn_fetch(u8, ctxt);
3434 opcode = twobyte_table[ctxt->b]; 3588 opcode = twobyte_table[ctxt->b];
3435 } 3589 }
3436 ctxt->d = opcode.flags; 3590 ctxt->d = opcode.flags;
@@ -3438,13 +3592,13 @@ done_prefixes:
3438 while (ctxt->d & GroupMask) { 3592 while (ctxt->d & GroupMask) {
3439 switch (ctxt->d & GroupMask) { 3593 switch (ctxt->d & GroupMask) {
3440 case Group: 3594 case Group:
3441 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 3595 ctxt->modrm = insn_fetch(u8, ctxt);
3442 --ctxt->_eip; 3596 --ctxt->_eip;
3443 goffset = (ctxt->modrm >> 3) & 7; 3597 goffset = (ctxt->modrm >> 3) & 7;
3444 opcode = opcode.u.group[goffset]; 3598 opcode = opcode.u.group[goffset];
3445 break; 3599 break;
3446 case GroupDual: 3600 case GroupDual:
3447 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 3601 ctxt->modrm = insn_fetch(u8, ctxt);
3448 --ctxt->_eip; 3602 --ctxt->_eip;
3449 goffset = (ctxt->modrm >> 3) & 7; 3603 goffset = (ctxt->modrm >> 3) & 7;
3450 if ((ctxt->modrm >> 6) == 3) 3604 if ((ctxt->modrm >> 6) == 3)
@@ -3458,7 +3612,7 @@ done_prefixes:
3458 break; 3612 break;
3459 case Prefix: 3613 case Prefix:
3460 if (ctxt->rep_prefix && op_prefix) 3614 if (ctxt->rep_prefix && op_prefix)
3461 return X86EMUL_UNHANDLEABLE; 3615 return EMULATION_FAILED;
3462 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; 3616 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
3463 switch (simd_prefix) { 3617 switch (simd_prefix) {
3464 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 3618 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
@@ -3468,10 +3622,10 @@ done_prefixes:
3468 } 3622 }
3469 break; 3623 break;
3470 default: 3624 default:
3471 return X86EMUL_UNHANDLEABLE; 3625 return EMULATION_FAILED;
3472 } 3626 }
3473 3627
3474 ctxt->d &= ~GroupMask; 3628 ctxt->d &= ~(u64)GroupMask;
3475 ctxt->d |= opcode.flags; 3629 ctxt->d |= opcode.flags;
3476 } 3630 }
3477 3631
@@ -3481,10 +3635,10 @@ done_prefixes:
3481 3635
3482 /* Unrecognised? */ 3636 /* Unrecognised? */
3483 if (ctxt->d == 0 || (ctxt->d & Undefined)) 3637 if (ctxt->d == 0 || (ctxt->d & Undefined))
3484 return -1; 3638 return EMULATION_FAILED;
3485 3639
3486 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 3640 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3487 return -1; 3641 return EMULATION_FAILED;
3488 3642
3489 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) 3643 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
3490 ctxt->op_bytes = 8; 3644 ctxt->op_bytes = 8;
@@ -3501,96 +3655,27 @@ done_prefixes:
3501 3655
3502 /* ModRM and SIB bytes. */ 3656 /* ModRM and SIB bytes. */
3503 if (ctxt->d & ModRM) { 3657 if (ctxt->d & ModRM) {
3504 rc = decode_modrm(ctxt, &memop); 3658 rc = decode_modrm(ctxt, &ctxt->memop);
3505 if (!ctxt->has_seg_override) 3659 if (!ctxt->has_seg_override)
3506 set_seg_override(ctxt, ctxt->modrm_seg); 3660 set_seg_override(ctxt, ctxt->modrm_seg);
3507 } else if (ctxt->d & MemAbs) 3661 } else if (ctxt->d & MemAbs)
3508 rc = decode_abs(ctxt, &memop); 3662 rc = decode_abs(ctxt, &ctxt->memop);
3509 if (rc != X86EMUL_CONTINUE) 3663 if (rc != X86EMUL_CONTINUE)
3510 goto done; 3664 goto done;
3511 3665
3512 if (!ctxt->has_seg_override) 3666 if (!ctxt->has_seg_override)
3513 set_seg_override(ctxt, VCPU_SREG_DS); 3667 set_seg_override(ctxt, VCPU_SREG_DS);
3514 3668
3515 memop.addr.mem.seg = seg_override(ctxt); 3669 ctxt->memop.addr.mem.seg = seg_override(ctxt);
3516 3670
3517 if (memop.type == OP_MEM && ctxt->ad_bytes != 8) 3671 if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
3518 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3672 ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
3519 3673
3520 /* 3674 /*
3521 * Decode and fetch the source operand: register, memory 3675 * Decode and fetch the source operand: register, memory
3522 * or immediate. 3676 * or immediate.
3523 */ 3677 */
3524 switch (ctxt->d & SrcMask) { 3678 rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
3525 case SrcNone:
3526 break;
3527 case SrcReg:
3528 decode_register_operand(ctxt, &ctxt->src, 0);
3529 break;
3530 case SrcMem16:
3531 memop.bytes = 2;
3532 goto srcmem_common;
3533 case SrcMem32:
3534 memop.bytes = 4;
3535 goto srcmem_common;
3536 case SrcMem:
3537 memop.bytes = (ctxt->d & ByteOp) ? 1 :
3538 ctxt->op_bytes;
3539 srcmem_common:
3540 ctxt->src = memop;
3541 memopp = &ctxt->src;
3542 break;
3543 case SrcImmU16:
3544 rc = decode_imm(ctxt, &ctxt->src, 2, false);
3545 break;
3546 case SrcImm:
3547 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
3548 break;
3549 case SrcImmU:
3550 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
3551 break;
3552 case SrcImmByte:
3553 rc = decode_imm(ctxt, &ctxt->src, 1, true);
3554 break;
3555 case SrcImmUByte:
3556 rc = decode_imm(ctxt, &ctxt->src, 1, false);
3557 break;
3558 case SrcAcc:
3559 ctxt->src.type = OP_REG;
3560 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3561 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3562 fetch_register_operand(&ctxt->src);
3563 break;
3564 case SrcOne:
3565 ctxt->src.bytes = 1;
3566 ctxt->src.val = 1;
3567 break;
3568 case SrcSI:
3569 ctxt->src.type = OP_MEM;
3570 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3571 ctxt->src.addr.mem.ea =
3572 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3573 ctxt->src.addr.mem.seg = seg_override(ctxt);
3574 ctxt->src.val = 0;
3575 break;
3576 case SrcImmFAddr:
3577 ctxt->src.type = OP_IMM;
3578 ctxt->src.addr.mem.ea = ctxt->_eip;
3579 ctxt->src.bytes = ctxt->op_bytes + 2;
3580 insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
3581 break;
3582 case SrcMemFAddr:
3583 memop.bytes = ctxt->op_bytes + 2;
3584 goto srcmem_common;
3585 break;
3586 case SrcDX:
3587 ctxt->src.type = OP_REG;
3588 ctxt->src.bytes = 2;
3589 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3590 fetch_register_operand(&ctxt->src);
3591 break;
3592 }
3593
3594 if (rc != X86EMUL_CONTINUE) 3679 if (rc != X86EMUL_CONTINUE)
3595 goto done; 3680 goto done;
3596 3681
@@ -3598,85 +3683,18 @@ done_prefixes:
3598 * Decode and fetch the second source operand: register, memory 3683 * Decode and fetch the second source operand: register, memory
3599 * or immediate. 3684 * or immediate.
3600 */ 3685 */
3601 switch (ctxt->d & Src2Mask) { 3686 rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
3602 case Src2None:
3603 break;
3604 case Src2CL:
3605 ctxt->src2.bytes = 1;
3606 ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
3607 break;
3608 case Src2ImmByte:
3609 rc = decode_imm(ctxt, &ctxt->src2, 1, true);
3610 break;
3611 case Src2One:
3612 ctxt->src2.bytes = 1;
3613 ctxt->src2.val = 1;
3614 break;
3615 case Src2Imm:
3616 rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
3617 break;
3618 }
3619
3620 if (rc != X86EMUL_CONTINUE) 3687 if (rc != X86EMUL_CONTINUE)
3621 goto done; 3688 goto done;
3622 3689
3623 /* Decode and fetch the destination operand: register or memory. */ 3690 /* Decode and fetch the destination operand: register or memory. */
3624 switch (ctxt->d & DstMask) { 3691 rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
3625 case DstReg:
3626 decode_register_operand(ctxt, &ctxt->dst,
3627 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3628 break;
3629 case DstImmUByte:
3630 ctxt->dst.type = OP_IMM;
3631 ctxt->dst.addr.mem.ea = ctxt->_eip;
3632 ctxt->dst.bytes = 1;
3633 ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
3634 break;
3635 case DstMem:
3636 case DstMem64:
3637 ctxt->dst = memop;
3638 memopp = &ctxt->dst;
3639 if ((ctxt->d & DstMask) == DstMem64)
3640 ctxt->dst.bytes = 8;
3641 else
3642 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3643 if (ctxt->d & BitOp)
3644 fetch_bit_operand(ctxt);
3645 ctxt->dst.orig_val = ctxt->dst.val;
3646 break;
3647 case DstAcc:
3648 ctxt->dst.type = OP_REG;
3649 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3650 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3651 fetch_register_operand(&ctxt->dst);
3652 ctxt->dst.orig_val = ctxt->dst.val;
3653 break;
3654 case DstDI:
3655 ctxt->dst.type = OP_MEM;
3656 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3657 ctxt->dst.addr.mem.ea =
3658 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3659 ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
3660 ctxt->dst.val = 0;
3661 break;
3662 case DstDX:
3663 ctxt->dst.type = OP_REG;
3664 ctxt->dst.bytes = 2;
3665 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3666 fetch_register_operand(&ctxt->dst);
3667 break;
3668 case ImplicitOps:
3669 /* Special instructions do their own operand decoding. */
3670 default:
3671 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3672 break;
3673 }
3674 3692
3675done: 3693done:
3676 if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) 3694 if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
3677 memopp->addr.mem.ea += ctxt->_eip; 3695 ctxt->memopp->addr.mem.ea += ctxt->_eip;
3678 3696
3679 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3697 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
3680} 3698}
3681 3699
3682static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3700static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -3825,32 +3843,11 @@ special_insn:
3825 goto twobyte_insn; 3843 goto twobyte_insn;
3826 3844
3827 switch (ctxt->b) { 3845 switch (ctxt->b) {
3828 case 0x06: /* push es */
3829 rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
3830 break;
3831 case 0x07: /* pop es */
3832 rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
3833 break;
3834 case 0x0e: /* push cs */
3835 rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
3836 break;
3837 case 0x16: /* push ss */
3838 rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
3839 break;
3840 case 0x17: /* pop ss */
3841 rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
3842 break;
3843 case 0x1e: /* push ds */
3844 rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
3845 break;
3846 case 0x1f: /* pop ds */
3847 rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
3848 break;
3849 case 0x40 ... 0x47: /* inc r16/r32 */ 3846 case 0x40 ... 0x47: /* inc r16/r32 */
3850 emulate_1op("inc", ctxt->dst, ctxt->eflags); 3847 emulate_1op(ctxt, "inc");
3851 break; 3848 break;
3852 case 0x48 ... 0x4f: /* dec r16/r32 */ 3849 case 0x48 ... 0x4f: /* dec r16/r32 */
3853 emulate_1op("dec", ctxt->dst, ctxt->eflags); 3850 emulate_1op(ctxt, "dec");
3854 break; 3851 break;
3855 case 0x63: /* movsxd */ 3852 case 0x63: /* movsxd */
3856 if (ctxt->mode != X86EMUL_MODE_PROT64) 3853 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -3891,12 +3888,6 @@ special_insn:
3891 case 0xc0 ... 0xc1: 3888 case 0xc0 ... 0xc1:
3892 rc = em_grp2(ctxt); 3889 rc = em_grp2(ctxt);
3893 break; 3890 break;
3894 case 0xc4: /* les */
3895 rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
3896 break;
3897 case 0xc5: /* lds */
3898 rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
3899 break;
3900 case 0xcc: /* int3 */ 3891 case 0xcc: /* int3 */
3901 rc = emulate_int(ctxt, 3); 3892 rc = emulate_int(ctxt, 3);
3902 break; 3893 break;
@@ -3953,9 +3944,6 @@ special_insn:
3953 /* complement carry flag from eflags reg */ 3944 /* complement carry flag from eflags reg */
3954 ctxt->eflags ^= EFLG_CF; 3945 ctxt->eflags ^= EFLG_CF;
3955 break; 3946 break;
3956 case 0xf6 ... 0xf7: /* Grp3 */
3957 rc = em_grp3(ctxt);
3958 break;
3959 case 0xf8: /* clc */ 3947 case 0xf8: /* clc */
3960 ctxt->eflags &= ~EFLG_CF; 3948 ctxt->eflags &= ~EFLG_CF;
3961 break; 3949 break;
@@ -4103,36 +4091,24 @@ twobyte_insn:
4103 case 0x90 ... 0x9f: /* setcc r/m8 */ 4091 case 0x90 ... 0x9f: /* setcc r/m8 */
4104 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4092 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4105 break; 4093 break;
4106 case 0xa0: /* push fs */
4107 rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
4108 break;
4109 case 0xa1: /* pop fs */
4110 rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
4111 break;
4112 case 0xa3: 4094 case 0xa3:
4113 bt: /* bt */ 4095 bt: /* bt */
4114 ctxt->dst.type = OP_NONE; 4096 ctxt->dst.type = OP_NONE;
4115 /* only subword offset */ 4097 /* only subword offset */
4116 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; 4098 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4117 emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); 4099 emulate_2op_SrcV_nobyte(ctxt, "bt");
4118 break; 4100 break;
4119 case 0xa4: /* shld imm8, r, r/m */ 4101 case 0xa4: /* shld imm8, r, r/m */
4120 case 0xa5: /* shld cl, r, r/m */ 4102 case 0xa5: /* shld cl, r, r/m */
4121 emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); 4103 emulate_2op_cl(ctxt, "shld");
4122 break;
4123 case 0xa8: /* push gs */
4124 rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
4125 break;
4126 case 0xa9: /* pop gs */
4127 rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
4128 break; 4104 break;
4129 case 0xab: 4105 case 0xab:
4130 bts: /* bts */ 4106 bts: /* bts */
4131 emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); 4107 emulate_2op_SrcV_nobyte(ctxt, "bts");
4132 break; 4108 break;
4133 case 0xac: /* shrd imm8, r, r/m */ 4109 case 0xac: /* shrd imm8, r, r/m */
4134 case 0xad: /* shrd cl, r, r/m */ 4110 case 0xad: /* shrd cl, r, r/m */
4135 emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); 4111 emulate_2op_cl(ctxt, "shrd");
4136 break; 4112 break;
4137 case 0xae: /* clflush */ 4113 case 0xae: /* clflush */
4138 break; 4114 break;
@@ -4143,7 +4119,7 @@ twobyte_insn:
4143 */ 4119 */
4144 ctxt->src.orig_val = ctxt->src.val; 4120 ctxt->src.orig_val = ctxt->src.val;
4145 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; 4121 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4146 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); 4122 emulate_2op_SrcV(ctxt, "cmp");
4147 if (ctxt->eflags & EFLG_ZF) { 4123 if (ctxt->eflags & EFLG_ZF) {
4148 /* Success: write back to memory. */ 4124 /* Success: write back to memory. */
4149 ctxt->dst.val = ctxt->src.orig_val; 4125 ctxt->dst.val = ctxt->src.orig_val;
@@ -4153,18 +4129,9 @@ twobyte_insn:
4153 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; 4129 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4154 } 4130 }
4155 break; 4131 break;
4156 case 0xb2: /* lss */
4157 rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
4158 break;
4159 case 0xb3: 4132 case 0xb3:
4160 btr: /* btr */ 4133 btr: /* btr */
4161 emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); 4134 emulate_2op_SrcV_nobyte(ctxt, "btr");
4162 break;
4163 case 0xb4: /* lfs */
4164 rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
4165 break;
4166 case 0xb5: /* lgs */
4167 rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
4168 break; 4135 break;
4169 case 0xb6 ... 0xb7: /* movzx */ 4136 case 0xb6 ... 0xb7: /* movzx */
4170 ctxt->dst.bytes = ctxt->op_bytes; 4137 ctxt->dst.bytes = ctxt->op_bytes;
@@ -4185,7 +4152,7 @@ twobyte_insn:
4185 break; 4152 break;
4186 case 0xbb: 4153 case 0xbb:
4187 btc: /* btc */ 4154 btc: /* btc */
4188 emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); 4155 emulate_2op_SrcV_nobyte(ctxt, "btc");
4189 break; 4156 break;
4190 case 0xbc: { /* bsf */ 4157 case 0xbc: { /* bsf */
4191 u8 zf; 4158 u8 zf;
@@ -4217,7 +4184,7 @@ twobyte_insn:
4217 (s16) ctxt->src.val; 4184 (s16) ctxt->src.val;
4218 break; 4185 break;
4219 case 0xc0 ... 0xc1: /* xadd */ 4186 case 0xc0 ... 0xc1: /* xadd */
4220 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); 4187 emulate_2op_SrcV(ctxt, "add");
4221 /* Write back the register source. */ 4188 /* Write back the register source. */
4222 ctxt->src.val = ctxt->dst.orig_val; 4189 ctxt->src.val = ctxt->dst.orig_val;
4223 write_register_operand(&ctxt->src); 4190 write_register_operand(&ctxt->src);
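The emulate.c hunks above replace the per-operand SrcMask/Src2Mask/DstMask switches with calls to a single decode_operand() helper, keyed by the operand kind packed into ctxt->d and extracted with (ctxt->d >> {Src,Src2,Dst}Shift) & OpMask. The helper's body is not part of the hunks shown here; the standalone C sketch below only illustrates that shift-and-mask dispatch shape (op_kind, the shift values and the printed strings are illustrative stand-ins, not the emulator's real definitions):

/* build: cc -o decode decode.c && ./decode */
#include <stdio.h>

enum op_kind { OpNone, OpReg, OpImmByte, OpOne, OpMask = 0xf };

#define SrcShift  0
#define Src2Shift 4
#define DstShift  8

/* one decoder handles every operand slot, selected by its packed kind */
static void decode_operand(unsigned int kind, const char **desc)
{
	switch (kind) {
	case OpReg:     *desc = "register";        break;
	case OpImmByte: *desc = "8-bit immediate"; break;
	case OpOne:     *desc = "constant 1";      break;
	case OpNone:
	default:        *desc = "none";            break;
	}
}

int main(void)
{
	/* one flags word describes all three operand slots at once */
	unsigned int d = (OpImmByte << SrcShift) |
			 (OpOne << Src2Shift) |
			 (OpReg << DstShift);
	const char *src, *src2, *dst;

	decode_operand((d >> SrcShift)  & OpMask, &src);
	decode_operand((d >> Src2Shift) & OpMask, &src2);
	decode_operand((d >> DstShift)  & OpMask, &dst);
	printf("src=%s src2=%s dst=%s\n", src, src2, dst);
	return 0;
}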
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad72385058..76e3f1cd0369 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
713 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 713 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
714 714
715 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 715 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
716 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); 716 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
717 KVM_PIT_MEM_LENGTH, &pit->dev);
717 if (ret < 0) 718 if (ret < 0)
718 goto fail; 719 goto fail;
719 720
720 if (flags & KVM_PIT_SPEAKER_DUMMY) { 721 if (flags & KVM_PIT_SPEAKER_DUMMY) {
721 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 722 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
722 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 723 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
723 &pit->speaker_dev); 724 KVM_SPEAKER_BASE_ADDRESS, 4,
725 &pit->speaker_dev);
724 if (ret < 0) 726 if (ret < 0)
725 goto fail_unregister; 727 goto fail_unregister;
726 } 728 }
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 19fe855e7953..cac4746d7ffb 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -34,6 +34,9 @@
34#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
35#include "trace.h" 35#include "trace.h"
36 36
37#define pr_pic_unimpl(fmt, ...) \
38 pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
39
37static void pic_irq_request(struct kvm *kvm, int level); 40static void pic_irq_request(struct kvm *kvm, int level);
38 41
39static void pic_lock(struct kvm_pic *s) 42static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
306 } 309 }
307 s->init_state = 1; 310 s->init_state = 1;
308 if (val & 0x02) 311 if (val & 0x02)
309 printk(KERN_ERR "single mode not supported"); 312 pr_pic_unimpl("single mode not supported");
310 if (val & 0x08) 313 if (val & 0x08)
311 printk(KERN_ERR 314 pr_pic_unimpl(
312 "level sensitive irq not supported"); 315 "level sensitive irq not supported");
313 } else if (val & 0x08) { 316 } else if (val & 0x08) {
314 if (val & 0x04) 317 if (val & 0x04)
315 s->poll = 1; 318 s->poll = 1;
@@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr)
459 } 462 }
460} 463}
461 464
462static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) 465static int picdev_write(struct kvm_pic *s,
463{
464 return container_of(dev, struct kvm_pic, dev);
465}
466
467static int picdev_write(struct kvm_io_device *this,
468 gpa_t addr, int len, const void *val) 466 gpa_t addr, int len, const void *val)
469{ 467{
470 struct kvm_pic *s = to_pic(this);
471 unsigned char data = *(unsigned char *)val; 468 unsigned char data = *(unsigned char *)val;
472 if (!picdev_in_range(addr)) 469 if (!picdev_in_range(addr))
473 return -EOPNOTSUPP; 470 return -EOPNOTSUPP;
474 471
475 if (len != 1) { 472 if (len != 1) {
476 if (printk_ratelimit()) 473 pr_pic_unimpl("non byte write\n");
477 printk(KERN_ERR "PIC: non byte write\n");
478 return 0; 474 return 0;
479 } 475 }
480 pic_lock(s); 476 pic_lock(s);
@@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this,
494 return 0; 490 return 0;
495} 491}
496 492
497static int picdev_read(struct kvm_io_device *this, 493static int picdev_read(struct kvm_pic *s,
498 gpa_t addr, int len, void *val) 494 gpa_t addr, int len, void *val)
499{ 495{
500 struct kvm_pic *s = to_pic(this);
501 unsigned char data = 0; 496 unsigned char data = 0;
502 if (!picdev_in_range(addr)) 497 if (!picdev_in_range(addr))
503 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
504 499
505 if (len != 1) { 500 if (len != 1) {
506 if (printk_ratelimit()) 501 pr_pic_unimpl("non byte read\n");
507 printk(KERN_ERR "PIC: non byte read\n");
508 return 0; 502 return 0;
509 } 503 }
510 pic_lock(s); 504 pic_lock(s);
@@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this,
525 return 0; 519 return 0;
526} 520}
527 521
522static int picdev_master_write(struct kvm_io_device *dev,
523 gpa_t addr, int len, const void *val)
524{
525 return picdev_write(container_of(dev, struct kvm_pic, dev_master),
526 addr, len, val);
527}
528
529static int picdev_master_read(struct kvm_io_device *dev,
530 gpa_t addr, int len, void *val)
531{
532 return picdev_read(container_of(dev, struct kvm_pic, dev_master),
533 addr, len, val);
534}
535
536static int picdev_slave_write(struct kvm_io_device *dev,
537 gpa_t addr, int len, const void *val)
538{
539 return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
540 addr, len, val);
541}
542
543static int picdev_slave_read(struct kvm_io_device *dev,
544 gpa_t addr, int len, void *val)
545{
546 return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
547 addr, len, val);
548}
549
550static int picdev_eclr_write(struct kvm_io_device *dev,
551 gpa_t addr, int len, const void *val)
552{
553 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
554 addr, len, val);
555}
556
557static int picdev_eclr_read(struct kvm_io_device *dev,
558 gpa_t addr, int len, void *val)
559{
560 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
561 addr, len, val);
562}
563
528/* 564/*
529 * callback when PIC0 irq status changed 565 * callback when PIC0 irq status changed
530 */ 566 */
@@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
537 s->output = level; 573 s->output = level;
538} 574}
539 575
540static const struct kvm_io_device_ops picdev_ops = { 576static const struct kvm_io_device_ops picdev_master_ops = {
541 .read = picdev_read, 577 .read = picdev_master_read,
542 .write = picdev_write, 578 .write = picdev_master_write,
579};
580
581static const struct kvm_io_device_ops picdev_slave_ops = {
582 .read = picdev_slave_read,
583 .write = picdev_slave_write,
584};
585
586static const struct kvm_io_device_ops picdev_eclr_ops = {
587 .read = picdev_eclr_read,
588 .write = picdev_eclr_write,
543}; 589};
544 590
545struct kvm_pic *kvm_create_pic(struct kvm *kvm) 591struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
560 /* 606 /*
561 * Initialize PIO device 607 * Initialize PIO device
562 */ 608 */
563 kvm_iodevice_init(&s->dev, &picdev_ops); 609 kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
610 kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
611 kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
564 mutex_lock(&kvm->slots_lock); 612 mutex_lock(&kvm->slots_lock);
565 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); 613 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
614 &s->dev_master);
615 if (ret < 0)
616 goto fail_unlock;
617
618 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
619 if (ret < 0)
620 goto fail_unreg_2;
621
622 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
623 if (ret < 0)
624 goto fail_unreg_1;
625
566 mutex_unlock(&kvm->slots_lock); 626 mutex_unlock(&kvm->slots_lock);
567 if (ret < 0) {
568 kfree(s);
569 return NULL;
570 }
571 627
572 return s; 628 return s;
629
630fail_unreg_1:
631 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
632
633fail_unreg_2:
634 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
635
636fail_unlock:
637 mutex_unlock(&kvm->slots_lock);
638
639 kfree(s);
640
641 return NULL;
573} 642}
574 643
575void kvm_destroy_pic(struct kvm *kvm) 644void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm)
577 struct kvm_pic *vpic = kvm->arch.vpic; 646 struct kvm_pic *vpic = kvm->arch.vpic;
578 647
579 if (vpic) { 648 if (vpic) {
580 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); 649 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
650 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
651 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
581 kvm->arch.vpic = NULL; 652 kvm->arch.vpic = NULL;
582 kfree(vpic); 653 kfree(vpic);
583 } 654 }
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d084bffb..2086f2bfba33 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -66,7 +66,9 @@ struct kvm_pic {
66 struct kvm *kvm; 66 struct kvm *kvm;
67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
68 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
69 struct kvm_io_device dev; 69 struct kvm_io_device dev_master;
70 struct kvm_io_device dev_slave;
71 struct kvm_io_device dev_eclr;
70 void (*ack_notifier)(void *opaque, int irq); 72 void (*ack_notifier)(void *opaque, int irq);
71 unsigned long irq_states[16]; 73 unsigned long irq_states[16];
72}; 74};
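The i8259 change above splits the single kvm_io_device into dev_master, dev_slave and dev_eclr, registered at ports 0x20, 0xa0 and 0x4d0, with thin read/write wrappers that recover the containing kvm_pic via container_of() on the embedded member instead of the old to_pic() helper. A minimal standalone illustration of that wrapper pattern follows (struct pic, struct io_dev and the handler names are made up for the sketch, not kernel API):

/* build: cc -o pic pic.c && ./pic */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_dev { const char *name; };

struct pic {
	int output;
	struct io_dev dev_master;
	struct io_dev dev_slave;
};

/* common handler takes the parent object directly */
static void pic_read(struct pic *s, const char *who)
{
	printf("%s read, output=%d\n", who, s->output);
}

/* per-device wrappers recover the parent from their embedded member */
static void master_read(struct io_dev *dev)
{
	pic_read(container_of(dev, struct pic, dev_master), dev->name);
}

static void slave_read(struct io_dev *dev)
{
	pic_read(container_of(dev, struct pic, dev_slave), dev->name);
}

int main(void)
{
	struct pic s = {
		.output = 1,
		.dev_master = { .name = "master" },
		.dev_slave  = { .name = "slave" },
	};

	master_read(&s.dev_master);
	slave_read(&s.dev_slave);
	return 0;
}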
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3377d53fcd36..544076c4f44b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
45 return vcpu->arch.walk_mmu->pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46} 46}
47 47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
53}
54
55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
56{ 49{
57 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; 50 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 64bc6ea78d90..497dbaa366d4 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -2,6 +2,8 @@
2struct kvm_timer { 2struct kvm_timer {
3 struct hrtimer timer; 3 struct hrtimer timer;
4 s64 period; /* unit: ns */ 4 s64 period; /* unit: ns */
5 u32 timer_mode_mask;
6 u64 tscdeadline;
5 atomic_t pending; /* accumulated triggered timers */ 7 atomic_t pending; /* accumulated triggered timers */
6 bool reinject; 8 bool reinject;
7 struct kvm_timer_ops *t_ops; 9 struct kvm_timer_ops *t_ops;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57dcbd4308fa..54abb40199d6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,9 @@
68#define VEC_POS(v) ((v) & (32 - 1)) 68#define VEC_POS(v) ((v) & (32 - 1))
69#define REG_POS(v) (((v) >> 5) << 4) 69#define REG_POS(v) (((v) >> 5) << 4)
70 70
71static unsigned int min_timer_period_us = 500;
72module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
73
71static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) 74static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
72{ 75{
73 return *((u32 *) (apic->regs + reg_off)); 76 return *((u32 *) (apic->regs + reg_off));
@@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
135 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; 138 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
136} 139}
137 140
141static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
142{
143 return ((apic_get_reg(apic, APIC_LVTT) &
144 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
145}
146
138static inline int apic_lvtt_period(struct kvm_lapic *apic) 147static inline int apic_lvtt_period(struct kvm_lapic *apic)
139{ 148{
140 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; 149 return ((apic_get_reg(apic, APIC_LVTT) &
150 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
151}
152
153static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
154{
155 return ((apic_get_reg(apic, APIC_LVTT) &
156 apic->lapic_timer.timer_mode_mask) ==
157 APIC_LVT_TIMER_TSCDEADLINE);
141} 158}
142 159
143static inline int apic_lvt_nmi_mode(u32 lvt_val) 160static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
166} 183}
167 184
168static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 185static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
169 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 186 LVT_MASK , /* partial LVTT mask, timer mode mask added at runtime */

170 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 187 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
171 LVT_MASK | APIC_MODE_MASK, /* LVTPC */ 188 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
172 LINT_MASK, LINT_MASK, /* LVT0-1 */ 189 LINT_MASK, LINT_MASK, /* LVT0-1 */
@@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
316 result = 1; 333 result = 1;
317 break; 334 break;
318 default: 335 default:
319 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", 336 apic_debug("Bad DFR vcpu %d: %08x\n",
320 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); 337 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
321 break; 338 break;
322 } 339 }
323 340
@@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
354 result = (target != source); 371 result = (target != source);
355 break; 372 break;
356 default: 373 default:
357 printk(KERN_WARNING "Bad dest shorthand value %x\n", 374 apic_debug("kvm: apic: Bad dest shorthand value %x\n",
358 short_hand); 375 short_hand);
359 break; 376 break;
360 } 377 }
361 378
@@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
401 break; 418 break;
402 419
403 case APIC_DM_REMRD: 420 case APIC_DM_REMRD:
404 printk(KERN_DEBUG "Ignoring delivery mode 3\n"); 421 apic_debug("Ignoring delivery mode 3\n");
405 break; 422 break;
406 423
407 case APIC_DM_SMI: 424 case APIC_DM_SMI:
408 printk(KERN_DEBUG "Ignoring guest SMI\n"); 425 apic_debug("Ignoring guest SMI\n");
409 break; 426 break;
410 427
411 case APIC_DM_NMI: 428 case APIC_DM_NMI:
@@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
565 val = kvm_apic_id(apic) << 24; 582 val = kvm_apic_id(apic) << 24;
566 break; 583 break;
567 case APIC_ARBPRI: 584 case APIC_ARBPRI:
568 printk(KERN_WARNING "Access APIC ARBPRI register " 585 apic_debug("Access APIC ARBPRI register which is for P6\n");
569 "which is for P6\n");
570 break; 586 break;
571 587
572 case APIC_TMCCT: /* Timer CCR */ 588 case APIC_TMCCT: /* Timer CCR */
589 if (apic_lvtt_tscdeadline(apic))
590 return 0;
591
573 val = apic_get_tmcct(apic); 592 val = apic_get_tmcct(apic);
574 break; 593 break;
575 594
@@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic)
664 683
665static void start_apic_timer(struct kvm_lapic *apic) 684static void start_apic_timer(struct kvm_lapic *apic)
666{ 685{
667 ktime_t now = apic->lapic_timer.timer.base->get_time(); 686 ktime_t now;
668
669 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
670 APIC_BUS_CYCLE_NS * apic->divide_count;
671 atomic_set(&apic->lapic_timer.pending, 0); 687 atomic_set(&apic->lapic_timer.pending, 0);
672 688
673 if (!apic->lapic_timer.period) 689 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
674 return; 690 /* lapic timer in oneshot or periodic mode */
675 /* 691 now = apic->lapic_timer.timer.base->get_time();
676 * Do not allow the guest to program periodic timers with small 692 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
677 * interval, since the hrtimers are not throttled by the host 693 * APIC_BUS_CYCLE_NS * apic->divide_count;
678 * scheduler. 694
679 */ 695 if (!apic->lapic_timer.period)
680 if (apic_lvtt_period(apic)) { 696 return;
681 if (apic->lapic_timer.period < NSEC_PER_MSEC/2) 697 /*
682 apic->lapic_timer.period = NSEC_PER_MSEC/2; 698 * Do not allow the guest to program periodic timers with small
683 } 699 * interval, since the hrtimers are not throttled by the host
700 * scheduler.
701 */
702 if (apic_lvtt_period(apic)) {
703 s64 min_period = min_timer_period_us * 1000LL;
704
705 if (apic->lapic_timer.period < min_period) {
706 pr_info_ratelimited(
707 "kvm: vcpu %i: requested %lld ns "
708 "lapic timer period limited to %lld ns\n",
709 apic->vcpu->vcpu_id,
710 apic->lapic_timer.period, min_period);
711 apic->lapic_timer.period = min_period;
712 }
713 }
684 714
685 hrtimer_start(&apic->lapic_timer.timer, 715 hrtimer_start(&apic->lapic_timer.timer,
686 ktime_add_ns(now, apic->lapic_timer.period), 716 ktime_add_ns(now, apic->lapic_timer.period),
687 HRTIMER_MODE_ABS); 717 HRTIMER_MODE_ABS);
688 718
689 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 719 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
690 PRIx64 ", " 720 PRIx64 ", "
691 "timer initial count 0x%x, period %lldns, " 721 "timer initial count 0x%x, period %lldns, "
692 "expire @ 0x%016" PRIx64 ".\n", __func__, 722 "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic)
695 apic->lapic_timer.period, 725 apic->lapic_timer.period,
696 ktime_to_ns(ktime_add_ns(now, 726 ktime_to_ns(ktime_add_ns(now,
697 apic->lapic_timer.period))); 727 apic->lapic_timer.period)));
728 } else if (apic_lvtt_tscdeadline(apic)) {
729 /* lapic timer in tsc deadline mode */
730 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
731 u64 ns = 0;
732 struct kvm_vcpu *vcpu = apic->vcpu;
733 unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
734 unsigned long flags;
735
736 if (unlikely(!tscdeadline || !this_tsc_khz))
737 return;
738
739 local_irq_save(flags);
740
741 now = apic->lapic_timer.timer.base->get_time();
742 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
743 if (likely(tscdeadline > guest_tsc)) {
744 ns = (tscdeadline - guest_tsc) * 1000000ULL;
745 do_div(ns, this_tsc_khz);
746 }
747 hrtimer_start(&apic->lapic_timer.timer,
748 ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
749
750 local_irq_restore(flags);
751 }
698} 752}
699 753
700static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 754static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
782 836
783 case APIC_LVT0: 837 case APIC_LVT0:
784 apic_manage_nmi_watchdog(apic, val); 838 apic_manage_nmi_watchdog(apic, val);
785 case APIC_LVTT:
786 case APIC_LVTTHMR: 839 case APIC_LVTTHMR:
787 case APIC_LVTPC: 840 case APIC_LVTPC:
788 case APIC_LVT1: 841 case APIC_LVT1:
@@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
796 849
797 break; 850 break;
798 851
852 case APIC_LVTT:
853 if ((apic_get_reg(apic, APIC_LVTT) &
854 apic->lapic_timer.timer_mode_mask) !=
855 (val & apic->lapic_timer.timer_mode_mask))
856 hrtimer_cancel(&apic->lapic_timer.timer);
857
858 if (!apic_sw_enabled(apic))
859 val |= APIC_LVT_MASKED;
860 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
861 apic_set_reg(apic, APIC_LVTT, val);
862 break;
863
799 case APIC_TMICT: 864 case APIC_TMICT:
865 if (apic_lvtt_tscdeadline(apic))
866 break;
867
800 hrtimer_cancel(&apic->lapic_timer.timer); 868 hrtimer_cancel(&apic->lapic_timer.timer);
801 apic_set_reg(apic, APIC_TMICT, val); 869 apic_set_reg(apic, APIC_TMICT, val);
802 start_apic_timer(apic); 870 start_apic_timer(apic);
@@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
804 872
805 case APIC_TDCR: 873 case APIC_TDCR:
806 if (val & 4) 874 if (val & 4)
807 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); 875 apic_debug("KVM_WRITE:TDCR %x\n", val);
808 apic_set_reg(apic, APIC_TDCR, val); 876 apic_set_reg(apic, APIC_TDCR, val);
809 update_divide_count(apic); 877 update_divide_count(apic);
810 break; 878 break;
811 879
812 case APIC_ESR: 880 case APIC_ESR:
813 if (apic_x2apic_mode(apic) && val != 0) { 881 if (apic_x2apic_mode(apic) && val != 0) {
814 printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); 882 apic_debug("KVM_WRITE:ESR not zero %x\n", val);
815 ret = 1; 883 ret = 1;
816 } 884 }
817 break; 885 break;
@@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this,
864 return 0; 932 return 0;
865} 933}
866 934
935void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
936{
937 struct kvm_lapic *apic = vcpu->arch.apic;
938
939 if (apic)
940 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
941}
942EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
943
867void kvm_free_lapic(struct kvm_vcpu *vcpu) 944void kvm_free_lapic(struct kvm_vcpu *vcpu)
868{ 945{
869 if (!vcpu->arch.apic) 946 if (!vcpu->arch.apic)
@@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
883 *---------------------------------------------------------------------- 960 *----------------------------------------------------------------------
884 */ 961 */
885 962
963u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
964{
965 struct kvm_lapic *apic = vcpu->arch.apic;
966 if (!apic)
967 return 0;
968
969 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
970 return 0;
971
972 return apic->lapic_timer.tscdeadline;
973}
974
975void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
976{
977 struct kvm_lapic *apic = vcpu->arch.apic;
978 if (!apic)
979 return;
980
981 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
982 return;
983
984 hrtimer_cancel(&apic->lapic_timer.timer);
985 apic->lapic_timer.tscdeadline = data;
986 start_apic_timer(apic);
987}
988
886void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) 989void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
887{ 990{
888 struct kvm_lapic *apic = vcpu->arch.apic; 991 struct kvm_lapic *apic = vcpu->arch.apic;
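For the TSC-deadline mode added in start_apic_timer() above, the distance between the guest's programmed deadline and its current TSC is converted to nanoseconds as ns = (tscdeadline - guest_tsc) * 1000000 / tsc_khz before arming the hrtimer. A small standalone check of that arithmetic with made-up numbers (2.5 GHz TSC, deadline 2.5 million cycles ahead, which should arm the timer 1 ms out):

/* build: cc -o deadline deadline.c && ./deadline */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t tsc_khz   = 2500000;                 /* 2.5 GHz, illustrative */
	uint64_t guest_tsc = 1000000000ULL;           /* current guest TSC */
	uint64_t deadline  = guest_tsc + 2500000ULL;  /* 2.5M cycles ahead */
	uint64_t ns = 0;

	/* only arm the timer if the deadline is still in the future */
	if (deadline > guest_tsc)
		ns = (deadline - guest_tsc) * 1000000ULL / tsc_khz;

	/* 2.5M cycles at 2.5 GHz is 1 ms, so this prints 1000000 ns */
	printf("arm hrtimer in %llu ns\n", (unsigned long long)ns);
	return 0;
}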
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 52c9e6b9e725..138e8cc6fea6 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
26void kvm_lapic_reset(struct kvm_vcpu *vcpu); 26void kvm_lapic_reset(struct kvm_vcpu *vcpu);
27u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 27u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
29void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
29void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 30void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
30u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
31void kvm_apic_set_version(struct kvm_vcpu *vcpu); 32void kvm_apic_set_version(struct kvm_vcpu *vcpu);
@@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
41bool kvm_apic_present(struct kvm_vcpu *vcpu); 42bool kvm_apic_present(struct kvm_vcpu *vcpu);
42int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
43 44
45u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
46void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
47
44void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 48void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
45void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 49void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
46void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 50void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8e8da7960dbe..f1b36cf3e3d0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2770 2770
2771 ASSERT(!VALID_PAGE(root)); 2771 ASSERT(!VALID_PAGE(root));
2772 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2772 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2773 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); 2773 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
2774 if (!is_present_gpte(pdptr)) { 2774 if (!is_present_gpte(pdptr)) {
2775 vcpu->arch.mmu.pae_root[i] = 0; 2775 vcpu->arch.mmu.pae_root[i] = 0;
2776 continue; 2776 continue;
@@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3318 context->direct_map = true; 3318 context->direct_map = true;
3319 context->set_cr3 = kvm_x86_ops->set_tdp_cr3; 3319 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
3320 context->get_cr3 = get_cr3; 3320 context->get_cr3 = get_cr3;
3321 context->get_pdptr = kvm_pdptr_read;
3321 context->inject_page_fault = kvm_inject_page_fault; 3322 context->inject_page_fault = kvm_inject_page_fault;
3322 context->nx = is_nx(vcpu); 3323 context->nx = is_nx(vcpu);
3323 3324
@@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3376 3377
3377 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3378 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3378 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3379 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3380 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
3379 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3381 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3380 3382
3381 return r; 3383 return r;
@@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3386 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 3388 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3387 3389
3388 g_context->get_cr3 = get_cr3; 3390 g_context->get_cr3 = get_cr3;
3391 g_context->get_pdptr = kvm_pdptr_read;
3389 g_context->inject_page_fault = kvm_inject_page_fault; 3392 g_context->inject_page_fault = kvm_inject_page_fault;
3390 3393
3391 /* 3394 /*
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2460a265be23..746ec259d024 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
121 121
122static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 122static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
123{ 123{
124 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
124 unsigned long *rmapp; 125 unsigned long *rmapp;
125 struct kvm_mmu_page *rev_sp; 126 struct kvm_mmu_page *rev_sp;
126 gfn_t gfn; 127 gfn_t gfn;
127 128
128
129 rev_sp = page_header(__pa(sptep)); 129 rev_sp = page_header(__pa(sptep));
130 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 130 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
131 131
132 if (!gfn_to_memslot(kvm, gfn)) { 132 if (!gfn_to_memslot(kvm, gfn)) {
133 if (!printk_ratelimit()) 133 if (!__ratelimit(&ratelimit_state))
134 return; 134 return;
135 audit_printk(kvm, "no memslot for gfn %llx\n", gfn); 135 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
136 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", 136 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
141 141
142 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 142 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
143 if (!*rmapp) { 143 if (!*rmapp) {
144 if (!printk_ratelimit()) 144 if (!__ratelimit(&ratelimit_state))
145 return; 145 return;
146 audit_printk(kvm, "no rmap for writable spte %llx\n", 146 audit_printk(kvm, "no rmap for writable spte %llx\n",
147 *sptep); 147 *sptep);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 507e2b844cfa..92994100638b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
147 gfn_t table_gfn; 147 gfn_t table_gfn;
148 unsigned index, pt_access, uninitialized_var(pte_access); 148 unsigned index, pt_access, uninitialized_var(pte_access);
149 gpa_t pte_gpa; 149 gpa_t pte_gpa;
150 bool eperm; 150 bool eperm, last_gpte;
151 int offset; 151 int offset;
152 const int write_fault = access & PFERR_WRITE_MASK; 152 const int write_fault = access & PFERR_WRITE_MASK;
153 const int user_fault = access & PFERR_USER_MASK; 153 const int user_fault = access & PFERR_USER_MASK;
@@ -163,7 +163,7 @@ retry_walk:
163 163
164#if PTTYPE == 64 164#if PTTYPE == 64
165 if (walker->level == PT32E_ROOT_LEVEL) { 165 if (walker->level == PT32E_ROOT_LEVEL) {
166 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); 166 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
167 trace_kvm_mmu_paging_element(pte, walker->level); 167 trace_kvm_mmu_paging_element(pte, walker->level);
168 if (!is_present_gpte(pte)) 168 if (!is_present_gpte(pte))
169 goto error; 169 goto error;
@@ -221,6 +221,17 @@ retry_walk:
221 eperm = true; 221 eperm = true;
222#endif 222#endif
223 223
224 last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
225 if (last_gpte) {
226 pte_access = pt_access &
227 FNAME(gpte_access)(vcpu, pte, true);
228 /* check if the kernel is fetching from user page */
229 if (unlikely(pte_access & PT_USER_MASK) &&
230 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
231 if (fetch_fault && !user_fault)
232 eperm = true;
233 }
234
224 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { 235 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
225 int ret; 236 int ret;
226 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 237 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
@@ -238,18 +249,12 @@ retry_walk:
238 249
239 walker->ptes[walker->level - 1] = pte; 250 walker->ptes[walker->level - 1] = pte;
240 251
241 if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { 252 if (last_gpte) {
242 int lvl = walker->level; 253 int lvl = walker->level;
243 gpa_t real_gpa; 254 gpa_t real_gpa;
244 gfn_t gfn; 255 gfn_t gfn;
245 u32 ac; 256 u32 ac;
246 257
247 /* check if the kernel is fetching from user page */
248 if (unlikely(pte_access & PT_USER_MASK) &&
249 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
250 if (fetch_fault && !user_fault)
251 eperm = true;
252
253 gfn = gpte_to_gfn_lvl(pte, lvl); 258 gfn = gpte_to_gfn_lvl(pte, lvl);
254 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; 259 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
255 260
@@ -295,7 +300,6 @@ retry_walk:
295 walker->ptes[walker->level - 1] = pte; 300 walker->ptes[walker->level - 1] = pte;
296 } 301 }
297 302
298 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
299 walker->pt_access = pt_access; 303 walker->pt_access = pt_access;
300 walker->pte_access = pte_access; 304 walker->pte_access = pte_access;
301 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 305 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 475d1c948501..e32243eac2f4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1084 if (npt_enabled) { 1084 if (npt_enabled) {
1085 /* Setup VMCB for Nested Paging */ 1085 /* Setup VMCB for Nested Paging */
1086 control->nested_ctl = 1; 1086 control->nested_ctl = 1;
1087 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
1088 clr_intercept(svm, INTERCEPT_INVLPG); 1087 clr_intercept(svm, INTERCEPT_INVLPG);
1089 clr_exception_intercept(svm, PF_VECTOR); 1088 clr_exception_intercept(svm, PF_VECTOR);
1090 clr_cr_intercept(svm, INTERCEPT_CR3_READ); 1089 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1844 return svm->nested.nested_cr3; 1843 return svm->nested.nested_cr3;
1845} 1844}
1846 1845
1846static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1847{
1848 struct vcpu_svm *svm = to_svm(vcpu);
1849 u64 cr3 = svm->nested.nested_cr3;
1850 u64 pdpte;
1851 int ret;
1852
1853 ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
1854 offset_in_page(cr3) + index * 8, 8);
1855 if (ret)
1856 return 0;
1857 return pdpte;
1858}
1859
1847static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, 1860static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1848 unsigned long root) 1861 unsigned long root)
1849{ 1862{
@@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1875 1888
1876 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 1889 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1877 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 1890 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1891 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
1878 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; 1892 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1879 vcpu->arch.mmu.shadow_root_level = get_npt_level(); 1893 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1880 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 1894 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
@@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2182 vmcb->control.exit_info_1, 2196 vmcb->control.exit_info_1,
2183 vmcb->control.exit_info_2, 2197 vmcb->control.exit_info_2,
2184 vmcb->control.exit_int_info, 2198 vmcb->control.exit_int_info,
2185 vmcb->control.exit_int_info_err); 2199 vmcb->control.exit_int_info_err,
2200 KVM_ISA_SVM);
2186 2201
2187 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); 2202 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2188 if (!nested_vmcb) 2203 if (!nested_vmcb)
@@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm)
2894 return 0; 2909 return 0;
2895} 2910}
2896 2911
2912u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
2913{
2914 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
2915 return vmcb->control.tsc_offset +
2916 svm_scale_tsc(vcpu, native_read_tsc());
2917}
2918
2897static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 2919static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2898{ 2920{
2899 struct vcpu_svm *svm = to_svm(vcpu); 2921 struct vcpu_svm *svm = to_svm(vcpu);
2900 2922
2901 switch (ecx) { 2923 switch (ecx) {
2902 case MSR_IA32_TSC: { 2924 case MSR_IA32_TSC: {
2903 struct vmcb *vmcb = get_host_vmcb(svm); 2925 *data = svm->vmcb->control.tsc_offset +
2904
2905 *data = vmcb->control.tsc_offset +
2906 svm_scale_tsc(vcpu, native_read_tsc()); 2926 svm_scale_tsc(vcpu, native_read_tsc());
2907 2927
2908 break; 2928 break;
@@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3314 struct kvm_run *kvm_run = vcpu->run; 3334 struct kvm_run *kvm_run = vcpu->run;
3315 u32 exit_code = svm->vmcb->control.exit_code; 3335 u32 exit_code = svm->vmcb->control.exit_code;
3316 3336
3317 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3318
3319 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 3337 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3320 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3338 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3321 if (npt_enabled) 3339 if (npt_enabled)
@@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3335 svm->vmcb->control.exit_info_1, 3353 svm->vmcb->control.exit_info_1,
3336 svm->vmcb->control.exit_info_2, 3354 svm->vmcb->control.exit_info_2,
3337 svm->vmcb->control.exit_int_info, 3355 svm->vmcb->control.exit_int_info,
3338 svm->vmcb->control.exit_int_info_err); 3356 svm->vmcb->control.exit_int_info_err,
3357 KVM_ISA_SVM);
3339 3358
3340 vmexit = nested_svm_exit_special(svm); 3359 vmexit = nested_svm_exit_special(svm);
3341 3360
@@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3768 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3787 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3769 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3788 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3770 3789
3790 trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
3791
3771 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3792 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3772 kvm_before_handle_nmi(&svm->vcpu); 3793 kvm_before_handle_nmi(&svm->vcpu);
3773 3794
@@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3897 } 3918 }
3898} 3919}
3899 3920
3900static const struct trace_print_flags svm_exit_reasons_str[] = {
3901 { SVM_EXIT_READ_CR0, "read_cr0" },
3902 { SVM_EXIT_READ_CR3, "read_cr3" },
3903 { SVM_EXIT_READ_CR4, "read_cr4" },
3904 { SVM_EXIT_READ_CR8, "read_cr8" },
3905 { SVM_EXIT_WRITE_CR0, "write_cr0" },
3906 { SVM_EXIT_WRITE_CR3, "write_cr3" },
3907 { SVM_EXIT_WRITE_CR4, "write_cr4" },
3908 { SVM_EXIT_WRITE_CR8, "write_cr8" },
3909 { SVM_EXIT_READ_DR0, "read_dr0" },
3910 { SVM_EXIT_READ_DR1, "read_dr1" },
3911 { SVM_EXIT_READ_DR2, "read_dr2" },
3912 { SVM_EXIT_READ_DR3, "read_dr3" },
3913 { SVM_EXIT_WRITE_DR0, "write_dr0" },
3914 { SVM_EXIT_WRITE_DR1, "write_dr1" },
3915 { SVM_EXIT_WRITE_DR2, "write_dr2" },
3916 { SVM_EXIT_WRITE_DR3, "write_dr3" },
3917 { SVM_EXIT_WRITE_DR5, "write_dr5" },
3918 { SVM_EXIT_WRITE_DR7, "write_dr7" },
3919 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
3920 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
3921 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
3922 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
3923 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
3924 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
3925 { SVM_EXIT_INTR, "interrupt" },
3926 { SVM_EXIT_NMI, "nmi" },
3927 { SVM_EXIT_SMI, "smi" },
3928 { SVM_EXIT_INIT, "init" },
3929 { SVM_EXIT_VINTR, "vintr" },
3930 { SVM_EXIT_CPUID, "cpuid" },
3931 { SVM_EXIT_INVD, "invd" },
3932 { SVM_EXIT_HLT, "hlt" },
3933 { SVM_EXIT_INVLPG, "invlpg" },
3934 { SVM_EXIT_INVLPGA, "invlpga" },
3935 { SVM_EXIT_IOIO, "io" },
3936 { SVM_EXIT_MSR, "msr" },
3937 { SVM_EXIT_TASK_SWITCH, "task_switch" },
3938 { SVM_EXIT_SHUTDOWN, "shutdown" },
3939 { SVM_EXIT_VMRUN, "vmrun" },
3940 { SVM_EXIT_VMMCALL, "hypercall" },
3941 { SVM_EXIT_VMLOAD, "vmload" },
3942 { SVM_EXIT_VMSAVE, "vmsave" },
3943 { SVM_EXIT_STGI, "stgi" },
3944 { SVM_EXIT_CLGI, "clgi" },
3945 { SVM_EXIT_SKINIT, "skinit" },
3946 { SVM_EXIT_WBINVD, "wbinvd" },
3947 { SVM_EXIT_MONITOR, "monitor" },
3948 { SVM_EXIT_MWAIT, "mwait" },
3949 { SVM_EXIT_XSETBV, "xsetbv" },
3950 { SVM_EXIT_NPF, "npf" },
3951 { -1, NULL }
3952};
3953
3954static int svm_get_lpage_level(void) 3921static int svm_get_lpage_level(void)
3955{ 3922{
3956 return PT_PDPE_LEVEL; 3923 return PT_PDPE_LEVEL;
@@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = {
4223 .get_mt_mask = svm_get_mt_mask, 4190 .get_mt_mask = svm_get_mt_mask,
4224 4191
4225 .get_exit_info = svm_get_exit_info, 4192 .get_exit_info = svm_get_exit_info,
4226 .exit_reasons_str = svm_exit_reasons_str,
4227 4193
4228 .get_lpage_level = svm_get_lpage_level, 4194 .get_lpage_level = svm_get_lpage_level,
4229 4195
@@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4239 .write_tsc_offset = svm_write_tsc_offset, 4205 .write_tsc_offset = svm_write_tsc_offset,
4240 .adjust_tsc_offset = svm_adjust_tsc_offset, 4206 .adjust_tsc_offset = svm_adjust_tsc_offset,
4241 .compute_tsc_offset = svm_compute_tsc_offset, 4207 .compute_tsc_offset = svm_compute_tsc_offset,
4208 .read_l1_tsc = svm_read_l1_tsc,
4242 4209
4243 .set_tdp_cr3 = set_tdp_cr3, 4210 .set_tdp_cr3 = set_tdp_cr3,
4244 4211
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 3ff898c104f7..911d2641f14c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -2,6 +2,8 @@
2#define _TRACE_KVM_H 2#define _TRACE_KVM_H
3 3
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h>
6#include <asm/svm.h>
5 7
6#undef TRACE_SYSTEM 8#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 9#define TRACE_SYSTEM kvm
@@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic,
181#define KVM_ISA_VMX 1 183#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2 184#define KVM_ISA_SVM 2
183 185
186#define VMX_EXIT_REASONS \
187 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
188 { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
189 { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
190 { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
191 { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
192 { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
193 { EXIT_REASON_CPUID, "CPUID" }, \
194 { EXIT_REASON_HLT, "HLT" }, \
195 { EXIT_REASON_INVLPG, "INVLPG" }, \
196 { EXIT_REASON_RDPMC, "RDPMC" }, \
197 { EXIT_REASON_RDTSC, "RDTSC" }, \
198 { EXIT_REASON_VMCALL, "VMCALL" }, \
199 { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
200 { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
201 { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
202 { EXIT_REASON_VMPTRST, "VMPTRST" }, \
203 { EXIT_REASON_VMREAD, "VMREAD" }, \
204 { EXIT_REASON_VMRESUME, "VMRESUME" }, \
205 { EXIT_REASON_VMWRITE, "VMWRITE" }, \
206 { EXIT_REASON_VMOFF, "VMOFF" }, \
207 { EXIT_REASON_VMON, "VMON" }, \
208 { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
209 { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
210 { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
211 { EXIT_REASON_MSR_READ, "MSR_READ" }, \
212 { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
213 { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
214 { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
215 { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
216 { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
217 { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
218 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
219 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
220 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
221 { EXIT_REASON_WBINVD, "WBINVD" }
222
223#define SVM_EXIT_REASONS \
224 { SVM_EXIT_READ_CR0, "read_cr0" }, \
225 { SVM_EXIT_READ_CR3, "read_cr3" }, \
226 { SVM_EXIT_READ_CR4, "read_cr4" }, \
227 { SVM_EXIT_READ_CR8, "read_cr8" }, \
228 { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
229 { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
230 { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
231 { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
232 { SVM_EXIT_READ_DR0, "read_dr0" }, \
233 { SVM_EXIT_READ_DR1, "read_dr1" }, \
234 { SVM_EXIT_READ_DR2, "read_dr2" }, \
235 { SVM_EXIT_READ_DR3, "read_dr3" }, \
236 { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
237 { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
238 { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
239 { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
240 { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
241 { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
242 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
243 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
244 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
245 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
246 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
247 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
248 { SVM_EXIT_INTR, "interrupt" }, \
249 { SVM_EXIT_NMI, "nmi" }, \
250 { SVM_EXIT_SMI, "smi" }, \
251 { SVM_EXIT_INIT, "init" }, \
252 { SVM_EXIT_VINTR, "vintr" }, \
253 { SVM_EXIT_CPUID, "cpuid" }, \
254 { SVM_EXIT_INVD, "invd" }, \
255 { SVM_EXIT_HLT, "hlt" }, \
256 { SVM_EXIT_INVLPG, "invlpg" }, \
257 { SVM_EXIT_INVLPGA, "invlpga" }, \
258 { SVM_EXIT_IOIO, "io" }, \
259 { SVM_EXIT_MSR, "msr" }, \
260 { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
261 { SVM_EXIT_SHUTDOWN, "shutdown" }, \
262 { SVM_EXIT_VMRUN, "vmrun" }, \
263 { SVM_EXIT_VMMCALL, "hypercall" }, \
264 { SVM_EXIT_VMLOAD, "vmload" }, \
265 { SVM_EXIT_VMSAVE, "vmsave" }, \
266 { SVM_EXIT_STGI, "stgi" }, \
267 { SVM_EXIT_CLGI, "clgi" }, \
268 { SVM_EXIT_SKINIT, "skinit" }, \
269 { SVM_EXIT_WBINVD, "wbinvd" }, \
270 { SVM_EXIT_MONITOR, "monitor" }, \
271 { SVM_EXIT_MWAIT, "mwait" }, \
272 { SVM_EXIT_XSETBV, "xsetbv" }, \
273 { SVM_EXIT_NPF, "npf" }
274
184/* 275/*
185 * Tracepoint for kvm guest exit: 276 * Tracepoint for kvm guest exit:
186 */ 277 */
@@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit,
205 ), 296 ),
206 297
207 TP_printk("reason %s rip 0x%lx info %llx %llx", 298 TP_printk("reason %s rip 0x%lx info %llx %llx",
208 ftrace_print_symbols_seq(p, __entry->exit_reason, 299 (__entry->isa == KVM_ISA_VMX) ?
209 kvm_x86_ops->exit_reasons_str), 300 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
301 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
210 __entry->guest_rip, __entry->info1, __entry->info2) 302 __entry->guest_rip, __entry->info1, __entry->info2)
211); 303);
212 304
@@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts,
486TRACE_EVENT(kvm_nested_vmexit, 578TRACE_EVENT(kvm_nested_vmexit,
487 TP_PROTO(__u64 rip, __u32 exit_code, 579 TP_PROTO(__u64 rip, __u32 exit_code,
488 __u64 exit_info1, __u64 exit_info2, 580 __u64 exit_info1, __u64 exit_info2,
489 __u32 exit_int_info, __u32 exit_int_info_err), 581 __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
490 TP_ARGS(rip, exit_code, exit_info1, exit_info2, 582 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
491 exit_int_info, exit_int_info_err), 583 exit_int_info, exit_int_info_err, isa),
492 584
493 TP_STRUCT__entry( 585 TP_STRUCT__entry(
494 __field( __u64, rip ) 586 __field( __u64, rip )
@@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit,
497 __field( __u64, exit_info2 ) 589 __field( __u64, exit_info2 )
498 __field( __u32, exit_int_info ) 590 __field( __u32, exit_int_info )
499 __field( __u32, exit_int_info_err ) 591 __field( __u32, exit_int_info_err )
592 __field( __u32, isa )
500 ), 593 ),
501 594
502 TP_fast_assign( 595 TP_fast_assign(
@@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit,
506 __entry->exit_info2 = exit_info2; 599 __entry->exit_info2 = exit_info2;
507 __entry->exit_int_info = exit_int_info; 600 __entry->exit_int_info = exit_int_info;
508 __entry->exit_int_info_err = exit_int_info_err; 601 __entry->exit_int_info_err = exit_int_info_err;
602 __entry->isa = isa;
509 ), 603 ),
510 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 604 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
511 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", 605 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
512 __entry->rip, 606 __entry->rip,
513 ftrace_print_symbols_seq(p, __entry->exit_code, 607 (__entry->isa == KVM_ISA_VMX) ?
514 kvm_x86_ops->exit_reasons_str), 608 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
609 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
515 __entry->exit_info1, __entry->exit_info2, 610 __entry->exit_info1, __entry->exit_info2,
516 __entry->exit_int_info, __entry->exit_int_info_err) 611 __entry->exit_int_info, __entry->exit_int_info_err)
517); 612);
@@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit,
522TRACE_EVENT(kvm_nested_vmexit_inject, 617TRACE_EVENT(kvm_nested_vmexit_inject,
523 TP_PROTO(__u32 exit_code, 618 TP_PROTO(__u32 exit_code,
524 __u64 exit_info1, __u64 exit_info2, 619 __u64 exit_info1, __u64 exit_info2,
525 __u32 exit_int_info, __u32 exit_int_info_err), 620 __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
526 TP_ARGS(exit_code, exit_info1, exit_info2, 621 TP_ARGS(exit_code, exit_info1, exit_info2,
527 exit_int_info, exit_int_info_err), 622 exit_int_info, exit_int_info_err, isa),
528 623
529 TP_STRUCT__entry( 624 TP_STRUCT__entry(
530 __field( __u32, exit_code ) 625 __field( __u32, exit_code )
@@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
532 __field( __u64, exit_info2 ) 627 __field( __u64, exit_info2 )
533 __field( __u32, exit_int_info ) 628 __field( __u32, exit_int_info )
534 __field( __u32, exit_int_info_err ) 629 __field( __u32, exit_int_info_err )
630 __field( __u32, isa )
535 ), 631 ),
536 632
537 TP_fast_assign( 633 TP_fast_assign(
@@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
540 __entry->exit_info2 = exit_info2; 636 __entry->exit_info2 = exit_info2;
541 __entry->exit_int_info = exit_int_info; 637 __entry->exit_int_info = exit_int_info;
542 __entry->exit_int_info_err = exit_int_info_err; 638 __entry->exit_int_info_err = exit_int_info_err;
639 __entry->isa = isa;
543 ), 640 ),
544 641
545 TP_printk("reason: %s ext_inf1: 0x%016llx " 642 TP_printk("reason: %s ext_inf1: 0x%016llx "
546 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", 643 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
547 ftrace_print_symbols_seq(p, __entry->exit_code, 644 (__entry->isa == KVM_ISA_VMX) ?
548 kvm_x86_ops->exit_reasons_str), 645 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
646 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
549 __entry->exit_info1, __entry->exit_info2, 647 __entry->exit_info1, __entry->exit_info2,
550 __entry->exit_int_info, __entry->exit_int_info_err) 648 __entry->exit_int_info, __entry->exit_int_info_err)
551); 649);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e65a158dee64..a0d6bd9ad442 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
71static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
72module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
73 73
74static int __read_mostly fasteoi = 1;
75module_param(fasteoi, bool, S_IRUGO);
76
74/* 77/*
75 * If nested=1, nested virtualization is supported, i.e., guests may use 78 * If nested=1, nested virtualization is supported, i.e., guests may use
76 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 79 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -1748,6 +1751,21 @@ static u64 guest_read_tsc(void)
1748} 1751}
1749 1752
1750/* 1753/*
1754 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1755 * counter, even if a nested guest (L2) is currently running.
1756 */
1757u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
1758{
1759 u64 host_tsc, tsc_offset;
1760
1761 rdtscll(host_tsc);
1762 tsc_offset = is_guest_mode(vcpu) ?
1763 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1764 vmcs_read64(TSC_OFFSET);
1765 return host_tsc + tsc_offset;
1766}
1767
1768/*
1751 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ 1769 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
1752 * ioctl. In this case the call-back should update internal vmx state to make 1770 * ioctl. In this case the call-back should update internal vmx state to make
1753 * the changes effective. 1771 * the changes effective.
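The new vmx_read_l1_tsc() gives L1's view of the TSC even while a nested guest runs: the raw host TSC plus the offset stored for L1 (vmcs01), deliberately leaving out the extra offset L1 applies to L2. A small worked example of that arithmetic, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative values only. */
        uint64_t host_tsc = 1000000;
        int64_t vmcs01_tsc_offset = -200000;    /* offset L0 applies for L1 */
        int64_t vmcs12_tsc_offset = -50000;     /* offset L1 applies for L2 */

        uint64_t l1_tsc = host_tsc + vmcs01_tsc_offset; /* what vmx_read_l1_tsc() reports */
        uint64_t l2_tsc = l1_tsc + vmcs12_tsc_offset;   /* what L2 observes while it runs */

        printf("L1 sees %llu, L2 sees %llu\n",
               (unsigned long long)l1_tsc, (unsigned long long)l2_tsc);
        return 0;
    }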
@@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1762 */ 1780 */
1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1781static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1764{ 1782{
1765 vmcs_write64(TSC_OFFSET, offset); 1783 if (is_guest_mode(vcpu)) {
1766 if (is_guest_mode(vcpu))
1767 /* 1784 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since 1785 * We're here if L1 chose not to trap WRMSR to TSC. According
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also 1786 * to the spec, this should set L1's TSC; The offset that L1
1770 * set the vmcs12 field here. 1787 * set for L2 remains unchanged, and still needs to be added
1788 * to the newly set TSC to get L2's TSC.
1771 */ 1789 */
1772 get_vmcs12(vcpu)->tsc_offset = offset - 1790 struct vmcs12 *vmcs12;
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset; 1791 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
1792 /* recalculate vmcs02.TSC_OFFSET: */
1793 vmcs12 = get_vmcs12(vcpu);
1794 vmcs_write64(TSC_OFFSET, offset +
1795 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
1796 vmcs12->tsc_offset : 0));
1797 } else {
1798 vmcs_write64(TSC_OFFSET, offset);
1799 }
1774} 1800}
1775 1801
1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1802static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
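When L1 writes the TSC MSR while L2 is running, the new path records the value as L1's offset (vmcs01_tsc_offset) and recomputes the offset actually loaded into vmcs02, adding L1's own offset for L2 only if L1 enabled CPU_BASED_USE_TSC_OFFSETING; the same rule is applied again in prepare_vmcs02() later in this patch. A sketch of just that recomputation, with hypothetical parameter names rather than the kernel's:

    #include <stdint.h>

    /*
     * Not the kernel's function - a sketch of how the vmcs02 TSC offset is
     * composed. l1_uses_tsc_offsetting mirrors the nested_cpu_has() check
     * on CPU_BASED_USE_TSC_OFFSETING.
     */
    static uint64_t vmcs02_tsc_offset(uint64_t l1_offset, uint64_t vmcs12_offset,
                                      int l1_uses_tsc_offsetting)
    {
        return l1_offset + (l1_uses_tsc_offsetting ? vmcs12_offset : 0);
    }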
@@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
2736 2762
2737 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 2763 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2738 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 2764 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
2739 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 2765 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2740 __func__); 2766 __func__);
2741 vmcs_write32(GUEST_TR_AR_BYTES, 2767 vmcs_write32(GUEST_TR_AR_BYTES,
2742 (guest_tr_ar & ~AR_TYPE_MASK) 2768 (guest_tr_ar & ~AR_TYPE_MASK)
2743 | AR_TYPE_BUSY_64_TSS); 2769 | AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4115 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4141 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4116 if (is_page_fault(intr_info)) { 4142 if (is_page_fault(intr_info)) {
4117 /* EPT won't cause page fault directly */ 4143 /* EPT won't cause page fault directly */
4118 if (enable_ept) 4144 BUG_ON(enable_ept);
4119 BUG();
4120 cr2 = vmcs_readl(EXIT_QUALIFICATION); 4145 cr2 = vmcs_readl(EXIT_QUALIFICATION);
4121 trace_kvm_page_fault(cr2, error_code); 4146 trace_kvm_page_fault(cr2, error_code);
4122 4147
@@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
4518 4543
4519static int handle_apic_access(struct kvm_vcpu *vcpu) 4544static int handle_apic_access(struct kvm_vcpu *vcpu)
4520{ 4545{
4546 if (likely(fasteoi)) {
4547 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4548 int access_type, offset;
4549
4550 access_type = exit_qualification & APIC_ACCESS_TYPE;
4551 offset = exit_qualification & APIC_ACCESS_OFFSET;
4552 /*
 4553 * A sane guest uses MOV to write EOI, and the written
 4554 * value is ignored, so short-circuit here and skip the
 4555 * heavy instruction emulation.
4556 */
4557 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
4558 (offset == APIC_EOI)) {
4559 kvm_lapic_set_eoi(vcpu);
4560 skip_emulated_instruction(vcpu);
4561 return 1;
4562 }
4563 }
4521 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4564 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4522} 4565}
4523 4566
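The exit qualification of an APIC-access exit encodes the access type and the offset into the APIC page, so a linear write to the EOI register (offset 0xB0) can be completed in the kernel and the instruction emulator skipped entirely; that is all the fasteoi path above does. A sketch of the test, with illustrative mask values standing in for APIC_ACCESS_TYPE/APIC_ACCESS_OFFSET:

    #include <stdint.h>

    #define ACCESS_OFFSET_MASK  0x0fff      /* bits 11:0 - offset into the APIC page */
    #define ACCESS_TYPE_MASK    0xf000      /* bits 15:12 - access type */
    #define TYPE_LINEAR_WRITE   (1 << 12)   /* linear access for a data write */
    #define APIC_EOI_OFFSET     0xb0

    static int is_fast_eoi(uint64_t exit_qualification)
    {
        return (exit_qualification & ACCESS_TYPE_MASK) == TYPE_LINEAR_WRITE &&
               (exit_qualification & ACCESS_OFFSET_MASK) == APIC_EOI_OFFSET;
    }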
@@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
5591 return 0; 5634 return 0;
5592 5635
5593 if (unlikely(vmx->fail)) { 5636 if (unlikely(vmx->fail)) {
5594 printk(KERN_INFO "%s failed vm entry %x\n", 5637 pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5595 __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); 5638 vmcs_read32(VM_INSTRUCTION_ERROR));
5596 return 1; 5639 return 1;
5597 } 5640 }
5598 5641
@@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5696 u32 exit_reason = vmx->exit_reason; 5739 u32 exit_reason = vmx->exit_reason;
5697 u32 vectoring_info = vmx->idt_vectoring_info; 5740 u32 vectoring_info = vmx->idt_vectoring_info;
5698 5741
5699 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
5700
5701 /* If guest state is invalid, start emulating */ 5742 /* If guest state is invalid, start emulating */
5702 if (vmx->emulation_required && emulate_invalid_guest_state) 5743 if (vmx->emulation_required && emulate_invalid_guest_state)
5703 return handle_invalid_guest_state(vcpu); 5744 return handle_invalid_guest_state(vcpu);
@@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6101 vmx->loaded_vmcs->launched = 1; 6142 vmx->loaded_vmcs->launched = 1;
6102 6143
6103 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 6144 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
6145 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
6104 6146
6105 vmx_complete_atomic_exit(vmx); 6147 vmx_complete_atomic_exit(vmx);
6106 vmx_recover_nmi_blocking(vmx); 6148 vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
6241 return ret; 6283 return ret;
6242} 6284}
6243 6285
6244#define _ER(x) { EXIT_REASON_##x, #x }
6245
6246static const struct trace_print_flags vmx_exit_reasons_str[] = {
6247 _ER(EXCEPTION_NMI),
6248 _ER(EXTERNAL_INTERRUPT),
6249 _ER(TRIPLE_FAULT),
6250 _ER(PENDING_INTERRUPT),
6251 _ER(NMI_WINDOW),
6252 _ER(TASK_SWITCH),
6253 _ER(CPUID),
6254 _ER(HLT),
6255 _ER(INVLPG),
6256 _ER(RDPMC),
6257 _ER(RDTSC),
6258 _ER(VMCALL),
6259 _ER(VMCLEAR),
6260 _ER(VMLAUNCH),
6261 _ER(VMPTRLD),
6262 _ER(VMPTRST),
6263 _ER(VMREAD),
6264 _ER(VMRESUME),
6265 _ER(VMWRITE),
6266 _ER(VMOFF),
6267 _ER(VMON),
6268 _ER(CR_ACCESS),
6269 _ER(DR_ACCESS),
6270 _ER(IO_INSTRUCTION),
6271 _ER(MSR_READ),
6272 _ER(MSR_WRITE),
6273 _ER(MWAIT_INSTRUCTION),
6274 _ER(MONITOR_INSTRUCTION),
6275 _ER(PAUSE_INSTRUCTION),
6276 _ER(MCE_DURING_VMENTRY),
6277 _ER(TPR_BELOW_THRESHOLD),
6278 _ER(APIC_ACCESS),
6279 _ER(EPT_VIOLATION),
6280 _ER(EPT_MISCONFIG),
6281 _ER(WBINVD),
6282 { -1, NULL }
6283};
6284
6285#undef _ER
6286
6287static int vmx_get_lpage_level(void) 6286static int vmx_get_lpage_level(void)
6288{ 6287{
6289 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 6288 if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
6514 6513
6515 set_cr4_guest_host_mask(vmx); 6514 set_cr4_guest_host_mask(vmx);
6516 6515
6517 vmcs_write64(TSC_OFFSET, 6516 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
6518 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 6517 vmcs_write64(TSC_OFFSET,
6518 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
6519 else
6520 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6519 6521
6520 if (enable_vpid) { 6522 if (enable_vpid) {
6521 /* 6523 /*
@@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
6610 if (vmcs12->vm_entry_msr_load_count > 0 || 6612 if (vmcs12->vm_entry_msr_load_count > 0 ||
6611 vmcs12->vm_exit_msr_load_count > 0 || 6613 vmcs12->vm_exit_msr_load_count > 0 ||
6612 vmcs12->vm_exit_msr_store_count > 0) { 6614 vmcs12->vm_exit_msr_store_count > 0) {
6613 if (printk_ratelimit()) 6615 pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
6614 printk(KERN_WARNING 6616 __func__);
6615 "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
6616 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 6617 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
6617 return 1; 6618 return 1;
6618 } 6619 }
@@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
6922 6923
6923 load_vmcs12_host_state(vcpu, vmcs12); 6924 load_vmcs12_host_state(vcpu, vmcs12);
6924 6925
6925 /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ 6926 /* Update TSC_OFFSET if TSC was changed while L2 ran */
6926 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 6927 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
6927 6928
6928 /* This is needed for same reason as it was needed in prepare_vmcs02 */ 6929 /* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
7039 .get_mt_mask = vmx_get_mt_mask, 7040 .get_mt_mask = vmx_get_mt_mask,
7040 7041
7041 .get_exit_info = vmx_get_exit_info, 7042 .get_exit_info = vmx_get_exit_info,
7042 .exit_reasons_str = vmx_exit_reasons_str,
7043 7043
7044 .get_lpage_level = vmx_get_lpage_level, 7044 .get_lpage_level = vmx_get_lpage_level,
7045 7045
@@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7055 .write_tsc_offset = vmx_write_tsc_offset, 7055 .write_tsc_offset = vmx_write_tsc_offset,
7056 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7056 .adjust_tsc_offset = vmx_adjust_tsc_offset,
7057 .compute_tsc_offset = vmx_compute_tsc_offset, 7057 .compute_tsc_offset = vmx_compute_tsc_offset,
7058 .read_l1_tsc = vmx_read_l1_tsc,
7058 7059
7059 .set_tdp_cr3 = vmx_set_cr3, 7060 .set_tdp_cr3 = vmx_set_cr3,
7060 7061
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea45fa4..cf269096eadf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
83static void update_cr8_intercept(struct kvm_vcpu *vcpu); 83static void update_cr8_intercept(struct kvm_vcpu *vcpu);
84static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 84static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
85 struct kvm_cpuid_entry2 __user *entries); 85 struct kvm_cpuid_entry2 __user *entries);
86static void process_nmi(struct kvm_vcpu *vcpu);
86 87
87struct kvm_x86_ops *kvm_x86_ops; 88struct kvm_x86_ops *kvm_x86_ops;
88EXPORT_SYMBOL_GPL(kvm_x86_ops); 89EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
359 360
360void kvm_inject_nmi(struct kvm_vcpu *vcpu) 361void kvm_inject_nmi(struct kvm_vcpu *vcpu)
361{ 362{
362 kvm_make_request(KVM_REQ_EVENT, vcpu); 363 atomic_inc(&vcpu->arch.nmi_queued);
363 vcpu->arch.nmi_pending = 1; 364 kvm_make_request(KVM_REQ_NMI, vcpu);
364} 365}
365EXPORT_SYMBOL_GPL(kvm_inject_nmi); 366EXPORT_SYMBOL_GPL(kvm_inject_nmi);
366 367
@@ -599,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
599static void update_cpuid(struct kvm_vcpu *vcpu) 600static void update_cpuid(struct kvm_vcpu *vcpu)
600{ 601{
601 struct kvm_cpuid_entry2 *best; 602 struct kvm_cpuid_entry2 *best;
603 struct kvm_lapic *apic = vcpu->arch.apic;
604 u32 timer_mode_mask;
602 605
603 best = kvm_find_cpuid_entry(vcpu, 1, 0); 606 best = kvm_find_cpuid_entry(vcpu, 1, 0);
604 if (!best) 607 if (!best)
@@ -610,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
610 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) 613 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
611 best->ecx |= bit(X86_FEATURE_OSXSAVE); 614 best->ecx |= bit(X86_FEATURE_OSXSAVE);
612 } 615 }
616
617 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
618 best->function == 0x1) {
619 best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
620 timer_mode_mask = 3 << 17;
621 } else
622 timer_mode_mask = 1 << 17;
623
624 if (apic)
625 apic->lapic_timer.timer_mode_mask = timer_mode_mask;
613} 626}
614 627
615int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 628int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -825,6 +838,7 @@ static u32 msrs_to_save[] = {
825static unsigned num_msrs_to_save; 838static unsigned num_msrs_to_save;
826 839
827static u32 emulated_msrs[] = { 840static u32 emulated_msrs[] = {
841 MSR_IA32_TSCDEADLINE,
828 MSR_IA32_MISC_ENABLE, 842 MSR_IA32_MISC_ENABLE,
829 MSR_IA32_MCG_STATUS, 843 MSR_IA32_MCG_STATUS,
830 MSR_IA32_MCG_CTL, 844 MSR_IA32_MCG_CTL,
@@ -1000,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void)
1000 return ret; 1014 return ret;
1001} 1015}
1002 1016
1003static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) 1017u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
1004{ 1018{
1005 if (vcpu->arch.virtual_tsc_khz) 1019 if (vcpu->arch.virtual_tsc_khz)
1006 return vcpu->arch.virtual_tsc_khz; 1020 return vcpu->arch.virtual_tsc_khz;
@@ -1098,7 +1112,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1098 1112
1099 /* Keep irq disabled to prevent changes to the clock */ 1113 /* Keep irq disabled to prevent changes to the clock */
1100 local_irq_save(flags); 1114 local_irq_save(flags);
1101 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1115 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1102 kernel_ns = get_kernel_ns(); 1116 kernel_ns = get_kernel_ns();
1103 this_tsc_khz = vcpu_tsc_khz(v); 1117 this_tsc_khz = vcpu_tsc_khz(v);
1104 if (unlikely(this_tsc_khz == 0)) { 1118 if (unlikely(this_tsc_khz == 0)) {
@@ -1564,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1564 break; 1578 break;
1565 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1579 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1566 return kvm_x2apic_msr_write(vcpu, msr, data); 1580 return kvm_x2apic_msr_write(vcpu, msr, data);
1581 case MSR_IA32_TSCDEADLINE:
1582 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1583 break;
1567 case MSR_IA32_MISC_ENABLE: 1584 case MSR_IA32_MISC_ENABLE:
1568 vcpu->arch.ia32_misc_enable_msr = data; 1585 vcpu->arch.ia32_misc_enable_msr = data;
1569 break; 1586 break;
@@ -1825,6 +1842,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1825 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); 1842 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1826 case HV_X64_MSR_TPR: 1843 case HV_X64_MSR_TPR:
1827 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); 1844 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1845 case HV_X64_MSR_APIC_ASSIST_PAGE:
1846 data = vcpu->arch.hv_vapic;
1847 break;
1828 default: 1848 default:
1829 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1849 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1830 return 1; 1850 return 1;
@@ -1839,7 +1859,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1839 1859
1840 switch (msr) { 1860 switch (msr) {
1841 case MSR_IA32_PLATFORM_ID: 1861 case MSR_IA32_PLATFORM_ID:
1842 case MSR_IA32_UCODE_REV:
1843 case MSR_IA32_EBL_CR_POWERON: 1862 case MSR_IA32_EBL_CR_POWERON:
1844 case MSR_IA32_DEBUGCTLMSR: 1863 case MSR_IA32_DEBUGCTLMSR:
1845 case MSR_IA32_LASTBRANCHFROMIP: 1864 case MSR_IA32_LASTBRANCHFROMIP:
@@ -1860,6 +1879,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1860 case MSR_FAM10H_MMIO_CONF_BASE: 1879 case MSR_FAM10H_MMIO_CONF_BASE:
1861 data = 0; 1880 data = 0;
1862 break; 1881 break;
1882 case MSR_IA32_UCODE_REV:
1883 data = 0x100000000ULL;
1884 break;
1863 case MSR_MTRRcap: 1885 case MSR_MTRRcap:
1864 data = 0x500 | KVM_NR_VAR_MTRR; 1886 data = 0x500 | KVM_NR_VAR_MTRR;
1865 break; 1887 break;
@@ -1888,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1888 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: 1910 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1889 return kvm_x2apic_msr_read(vcpu, msr, pdata); 1911 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1890 break; 1912 break;
1913 case MSR_IA32_TSCDEADLINE:
1914 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1915 break;
1891 case MSR_IA32_MISC_ENABLE: 1916 case MSR_IA32_MISC_ENABLE:
1892 data = vcpu->arch.ia32_misc_enable_msr; 1917 data = vcpu->arch.ia32_misc_enable_msr;
1893 break; 1918 break;
@@ -2086,6 +2111,9 @@ int kvm_dev_ioctl_check_extension(long ext)
2086 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 2111 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2087 break; 2112 break;
2088 case KVM_CAP_NR_VCPUS: 2113 case KVM_CAP_NR_VCPUS:
2114 r = KVM_SOFT_MAX_VCPUS;
2115 break;
2116 case KVM_CAP_MAX_VCPUS:
2089 r = KVM_MAX_VCPUS; 2117 r = KVM_MAX_VCPUS;
2090 break; 2118 break;
2091 case KVM_CAP_NR_MEMSLOTS: 2119 case KVM_CAP_NR_MEMSLOTS:
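KVM_CAP_NR_VCPUS now reports the recommended (soft) limit and the new KVM_CAP_MAX_VCPUS reports the hard limit, so userspace should query both and fall back when the second capability is unknown. A hedged userspace sketch using KVM_CHECK_EXTENSION; it assumes headers that already define KVM_CAP_MAX_VCPUS:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDWR);
        if (kvm < 0)
            return 1;

        int soft = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
        /* Older kernels do not know KVM_CAP_MAX_VCPUS and return 0 here. */
        int hard = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);

        printf("recommended vcpus: %d, maximum vcpus: %d\n",
               soft, hard ? hard : soft);
        return 0;
    }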
@@ -2210,7 +2238,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2210 s64 tsc_delta; 2238 s64 tsc_delta;
2211 u64 tsc; 2239 u64 tsc;
2212 2240
2213 kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); 2241 tsc = kvm_x86_ops->read_l1_tsc(vcpu);
2214 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : 2242 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2215 tsc - vcpu->arch.last_guest_tsc; 2243 tsc - vcpu->arch.last_guest_tsc;
2216 2244
@@ -2234,7 +2262,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2234{ 2262{
2235 kvm_x86_ops->vcpu_put(vcpu); 2263 kvm_x86_ops->vcpu_put(vcpu);
2236 kvm_put_guest_fpu(vcpu); 2264 kvm_put_guest_fpu(vcpu);
2237 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 2265 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
2238} 2266}
2239 2267
2240static int is_efer_nx(void) 2268static int is_efer_nx(void)
@@ -2819,6 +2847,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2819static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2847static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2820 struct kvm_vcpu_events *events) 2848 struct kvm_vcpu_events *events)
2821{ 2849{
2850 process_nmi(vcpu);
2822 events->exception.injected = 2851 events->exception.injected =
2823 vcpu->arch.exception.pending && 2852 vcpu->arch.exception.pending &&
2824 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2853 !kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2865,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2836 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); 2865 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2837 2866
2838 events->nmi.injected = vcpu->arch.nmi_injected; 2867 events->nmi.injected = vcpu->arch.nmi_injected;
2839 events->nmi.pending = vcpu->arch.nmi_pending; 2868 events->nmi.pending = vcpu->arch.nmi_pending != 0;
2840 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2869 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2841 events->nmi.pad = 0; 2870 events->nmi.pad = 0;
2842 2871
@@ -2856,6 +2885,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2856 | KVM_VCPUEVENT_VALID_SHADOW)) 2885 | KVM_VCPUEVENT_VALID_SHADOW))
2857 return -EINVAL; 2886 return -EINVAL;
2858 2887
2888 process_nmi(vcpu);
2859 vcpu->arch.exception.pending = events->exception.injected; 2889 vcpu->arch.exception.pending = events->exception.injected;
2860 vcpu->arch.exception.nr = events->exception.nr; 2890 vcpu->arch.exception.nr = events->exception.nr;
2861 vcpu->arch.exception.has_error_code = events->exception.has_error_code; 2891 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -3556,7 +3586,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
3556 if (r) { 3586 if (r) {
3557 mutex_lock(&kvm->slots_lock); 3587 mutex_lock(&kvm->slots_lock);
3558 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3588 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3559 &vpic->dev); 3589 &vpic->dev_master);
3590 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3591 &vpic->dev_slave);
3592 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3593 &vpic->dev_eclr);
3560 mutex_unlock(&kvm->slots_lock); 3594 mutex_unlock(&kvm->slots_lock);
3561 kfree(vpic); 3595 kfree(vpic);
3562 goto create_irqchip_unlock; 3596 goto create_irqchip_unlock;
@@ -4045,84 +4079,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4045 return 0; 4079 return 0;
4046} 4080}
4047 4081
4048static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, 4082int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4049 unsigned long addr, 4083 const void *val, int bytes)
4050 void *val,
4051 unsigned int bytes,
4052 struct x86_exception *exception)
4053{ 4084{
4054 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4085 int ret;
4055 gpa_t gpa;
4056 int handled, ret;
4057 4086
4087 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4088 if (ret < 0)
4089 return 0;
4090 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
4091 return 1;
4092}
4093
4094struct read_write_emulator_ops {
4095 int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4096 int bytes);
4097 int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4098 void *val, int bytes);
4099 int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4100 int bytes, void *val);
4101 int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4102 void *val, int bytes);
4103 bool write;
4104};
4105
4106static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4107{
4058 if (vcpu->mmio_read_completed) { 4108 if (vcpu->mmio_read_completed) {
4059 memcpy(val, vcpu->mmio_data, bytes); 4109 memcpy(val, vcpu->mmio_data, bytes);
4060 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, 4110 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4061 vcpu->mmio_phys_addr, *(u64 *)val); 4111 vcpu->mmio_phys_addr, *(u64 *)val);
4062 vcpu->mmio_read_completed = 0; 4112 vcpu->mmio_read_completed = 0;
4063 return X86EMUL_CONTINUE; 4113 return 1;
4064 } 4114 }
4065 4115
4066 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); 4116 return 0;
4067 4117}
4068 if (ret < 0)
4069 return X86EMUL_PROPAGATE_FAULT;
4070
4071 if (ret)
4072 goto mmio;
4073
4074 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
4075 == X86EMUL_CONTINUE)
4076 return X86EMUL_CONTINUE;
4077 4118
4078mmio: 4119static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4079 /* 4120 void *val, int bytes)
4080 * Is this MMIO handled locally? 4121{
4081 */ 4122 return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4082 handled = vcpu_mmio_read(vcpu, gpa, bytes, val); 4123}
4083 4124
4084 if (handled == bytes) 4125static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4085 return X86EMUL_CONTINUE; 4126 void *val, int bytes)
4127{
4128 return emulator_write_phys(vcpu, gpa, val, bytes);
4129}
4086 4130
4087 gpa += handled; 4131static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4088 bytes -= handled; 4132{
4089 val += handled; 4133 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4134 return vcpu_mmio_write(vcpu, gpa, bytes, val);
4135}
4090 4136
4137static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4138 void *val, int bytes)
4139{
4091 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 4140 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4092
4093 vcpu->mmio_needed = 1;
4094 vcpu->run->exit_reason = KVM_EXIT_MMIO;
4095 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
4096 vcpu->mmio_size = bytes;
4097 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
4098 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
4099 vcpu->mmio_index = 0;
4100
4101 return X86EMUL_IO_NEEDED; 4141 return X86EMUL_IO_NEEDED;
4102} 4142}
4103 4143
4104int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 4144static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4105 const void *val, int bytes) 4145 void *val, int bytes)
4106{ 4146{
4107 int ret; 4147 memcpy(vcpu->mmio_data, val, bytes);
4108 4148 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
4109 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 4149 return X86EMUL_CONTINUE;
4110 if (ret < 0)
4111 return 0;
4112 kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
4113 return 1;
4114} 4150}
4115 4151
4116static int emulator_write_emulated_onepage(unsigned long addr, 4152static struct read_write_emulator_ops read_emultor = {
4117 const void *val, 4153 .read_write_prepare = read_prepare,
4118 unsigned int bytes, 4154 .read_write_emulate = read_emulate,
4119 struct x86_exception *exception, 4155 .read_write_mmio = vcpu_mmio_read,
4120 struct kvm_vcpu *vcpu) 4156 .read_write_exit_mmio = read_exit_mmio,
4157};
4158
4159static struct read_write_emulator_ops write_emultor = {
4160 .read_write_emulate = write_emulate,
4161 .read_write_mmio = write_mmio,
4162 .read_write_exit_mmio = write_exit_mmio,
4163 .write = true,
4164};
4165
4166static int emulator_read_write_onepage(unsigned long addr, void *val,
4167 unsigned int bytes,
4168 struct x86_exception *exception,
4169 struct kvm_vcpu *vcpu,
4170 struct read_write_emulator_ops *ops)
4121{ 4171{
4122 gpa_t gpa; 4172 gpa_t gpa;
4123 int handled, ret; 4173 int handled, ret;
4174 bool write = ops->write;
4124 4175
4125 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); 4176 if (ops->read_write_prepare &&
4177 ops->read_write_prepare(vcpu, val, bytes))
4178 return X86EMUL_CONTINUE;
4179
4180 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4126 4181
4127 if (ret < 0) 4182 if (ret < 0)
4128 return X86EMUL_PROPAGATE_FAULT; 4183 return X86EMUL_PROPAGATE_FAULT;
@@ -4131,15 +4186,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
4131 if (ret) 4186 if (ret)
4132 goto mmio; 4187 goto mmio;
4133 4188
4134 if (emulator_write_phys(vcpu, gpa, val, bytes)) 4189 if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4135 return X86EMUL_CONTINUE; 4190 return X86EMUL_CONTINUE;
4136 4191
4137mmio: 4192mmio:
4138 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4139 /* 4193 /*
4140 * Is this MMIO handled locally? 4194 * Is this MMIO handled locally?
4141 */ 4195 */
4142 handled = vcpu_mmio_write(vcpu, gpa, bytes, val); 4196 handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4143 if (handled == bytes) 4197 if (handled == bytes)
4144 return X86EMUL_CONTINUE; 4198 return X86EMUL_CONTINUE;
4145 4199
@@ -4148,23 +4202,20 @@ mmio:
4148 val += handled; 4202 val += handled;
4149 4203
4150 vcpu->mmio_needed = 1; 4204 vcpu->mmio_needed = 1;
4151 memcpy(vcpu->mmio_data, val, bytes);
4152 vcpu->run->exit_reason = KVM_EXIT_MMIO; 4205 vcpu->run->exit_reason = KVM_EXIT_MMIO;
4153 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 4206 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
4154 vcpu->mmio_size = bytes; 4207 vcpu->mmio_size = bytes;
4155 vcpu->run->mmio.len = min(vcpu->mmio_size, 8); 4208 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
4156 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 4209 vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
4157 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
4158 vcpu->mmio_index = 0; 4210 vcpu->mmio_index = 0;
4159 4211
4160 return X86EMUL_CONTINUE; 4212 return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4161} 4213}
4162 4214
4163int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, 4215int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4164 unsigned long addr, 4216 void *val, unsigned int bytes,
4165 const void *val, 4217 struct x86_exception *exception,
4166 unsigned int bytes, 4218 struct read_write_emulator_ops *ops)
4167 struct x86_exception *exception)
4168{ 4219{
4169 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 4220 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4170 4221
@@ -4173,16 +4224,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4173 int rc, now; 4224 int rc, now;
4174 4225
4175 now = -addr & ~PAGE_MASK; 4226 now = -addr & ~PAGE_MASK;
4176 rc = emulator_write_emulated_onepage(addr, val, now, exception, 4227 rc = emulator_read_write_onepage(addr, val, now, exception,
4177 vcpu); 4228 vcpu, ops);
4229
4178 if (rc != X86EMUL_CONTINUE) 4230 if (rc != X86EMUL_CONTINUE)
4179 return rc; 4231 return rc;
4180 addr += now; 4232 addr += now;
4181 val += now; 4233 val += now;
4182 bytes -= now; 4234 bytes -= now;
4183 } 4235 }
4184 return emulator_write_emulated_onepage(addr, val, bytes, exception, 4236
4185 vcpu); 4237 return emulator_read_write_onepage(addr, val, bytes, exception,
4238 vcpu, ops);
4239}
4240
4241static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4242 unsigned long addr,
4243 void *val,
4244 unsigned int bytes,
4245 struct x86_exception *exception)
4246{
4247 return emulator_read_write(ctxt, addr, val, bytes,
4248 exception, &read_emultor);
4249}
4250
4251int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4252 unsigned long addr,
4253 const void *val,
4254 unsigned int bytes,
4255 struct x86_exception *exception)
4256{
4257 return emulator_read_write(ctxt, addr, (void *)val, bytes,
4258 exception, &write_emultor);
4186} 4259}
4187 4260
4188#define CMPXCHG_TYPE(t, ptr, old, new) \ 4261#define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -4712,7 +4785,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4712 kvm_set_rflags(vcpu, ctxt->eflags); 4785 kvm_set_rflags(vcpu, ctxt->eflags);
4713 4786
4714 if (irq == NMI_VECTOR) 4787 if (irq == NMI_VECTOR)
4715 vcpu->arch.nmi_pending = false; 4788 vcpu->arch.nmi_pending = 0;
4716 else 4789 else
4717 vcpu->arch.interrupt.pending = false; 4790 vcpu->arch.interrupt.pending = false;
4718 4791
@@ -4788,7 +4861,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4788 4861
4789 trace_kvm_emulate_insn_start(vcpu); 4862 trace_kvm_emulate_insn_start(vcpu);
4790 ++vcpu->stat.insn_emulation; 4863 ++vcpu->stat.insn_emulation;
4791 if (r) { 4864 if (r != EMULATION_OK) {
4792 if (emulation_type & EMULTYPE_TRAP_UD) 4865 if (emulation_type & EMULTYPE_TRAP_UD)
4793 return EMULATE_FAIL; 4866 return EMULATE_FAIL;
4794 if (reexecute_instruction(vcpu, cr2)) 4867 if (reexecute_instruction(vcpu, cr2))
@@ -5521,7 +5594,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5521 /* try to inject new event if pending */ 5594 /* try to inject new event if pending */
5522 if (vcpu->arch.nmi_pending) { 5595 if (vcpu->arch.nmi_pending) {
5523 if (kvm_x86_ops->nmi_allowed(vcpu)) { 5596 if (kvm_x86_ops->nmi_allowed(vcpu)) {
5524 vcpu->arch.nmi_pending = false; 5597 --vcpu->arch.nmi_pending;
5525 vcpu->arch.nmi_injected = true; 5598 vcpu->arch.nmi_injected = true;
5526 kvm_x86_ops->set_nmi(vcpu); 5599 kvm_x86_ops->set_nmi(vcpu);
5527 } 5600 }
@@ -5553,10 +5626,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5553 } 5626 }
5554} 5627}
5555 5628
5629static void process_nmi(struct kvm_vcpu *vcpu)
5630{
5631 unsigned limit = 2;
5632
5633 /*
5634 * x86 is limited to one NMI running, and one NMI pending after it.
5635 * If an NMI is already in progress, limit further NMIs to just one.
5636 * Otherwise, allow two (and we'll inject the first one immediately).
5637 */
5638 if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
5639 limit = 1;
5640
5641 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
5642 vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
5643 kvm_make_request(KVM_REQ_EVENT, vcpu);
5644}
5645
5556static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5646static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5557{ 5647{
5558 int r; 5648 int r;
5559 bool nmi_pending;
5560 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5649 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5561 vcpu->run->request_interrupt_window; 5650 vcpu->run->request_interrupt_window;
5562 5651
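NMI delivery is now two-stage: kvm_inject_nmi() just bumps the atomic nmi_queued counter and raises KVM_REQ_NMI, and process_nmi() later drains the counter into nmi_pending, clamped to two outstanding NMIs, or one if an NMI is already being handled, matching the architectural "one running, one pending" rule. A compressed sketch of the drain step with the vcpu fields stubbed out:

    #include <stdatomic.h>

    struct vcpu {
        atomic_uint nmi_queued;     /* producers: kvm_inject_nmi() */
        unsigned int nmi_pending;   /* consumed by the injection path */
        int nmi_in_progress;        /* NMI masked or already injected */
    };

    static void drain_nmis(struct vcpu *v)
    {
        unsigned int limit = v->nmi_in_progress ? 1 : 2;

        /* Take everything queued so far; later injectors re-raise the request. */
        v->nmi_pending += atomic_exchange(&v->nmi_queued, 0);
        if (v->nmi_pending > limit)
            v->nmi_pending = limit;
    }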
@@ -5596,6 +5685,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5596 } 5685 }
5597 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) 5686 if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5598 record_steal_time(vcpu); 5687 record_steal_time(vcpu);
5688 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5689 process_nmi(vcpu);
5599 5690
5600 } 5691 }
5601 5692
@@ -5603,19 +5694,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5603 if (unlikely(r)) 5694 if (unlikely(r))
5604 goto out; 5695 goto out;
5605 5696
5606 /*
5607 * An NMI can be injected between local nmi_pending read and
5608 * vcpu->arch.nmi_pending read inside inject_pending_event().
5609 * But in that case, KVM_REQ_EVENT will be set, which makes
5610 * the race described above benign.
5611 */
5612 nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5613
5614 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5697 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5615 inject_pending_event(vcpu); 5698 inject_pending_event(vcpu);
5616 5699
5617 /* enable NMI/IRQ window open exits if needed */ 5700 /* enable NMI/IRQ window open exits if needed */
5618 if (nmi_pending) 5701 if (vcpu->arch.nmi_pending)
5619 kvm_x86_ops->enable_nmi_window(vcpu); 5702 kvm_x86_ops->enable_nmi_window(vcpu);
5620 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5703 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5621 kvm_x86_ops->enable_irq_window(vcpu); 5704 kvm_x86_ops->enable_irq_window(vcpu);
@@ -5678,7 +5761,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5678 if (hw_breakpoint_active()) 5761 if (hw_breakpoint_active())
5679 hw_breakpoint_restore(); 5762 hw_breakpoint_restore();
5680 5763
5681 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); 5764 vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
5682 5765
5683 vcpu->mode = OUTSIDE_GUEST_MODE; 5766 vcpu->mode = OUTSIDE_GUEST_MODE;
5684 smp_wmb(); 5767 smp_wmb();
@@ -6323,7 +6406,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6323 6406
6324int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 6407int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6325{ 6408{
6326 vcpu->arch.nmi_pending = false; 6409 atomic_set(&vcpu->arch.nmi_queued, 0);
6410 vcpu->arch.nmi_pending = 0;
6327 vcpu->arch.nmi_injected = false; 6411 vcpu->arch.nmi_injected = false;
6328 6412
6329 vcpu->arch.switch_db_regs = 0; 6413 vcpu->arch.switch_db_regs = 0;
@@ -6598,7 +6682,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6598 !vcpu->arch.apf.halted) 6682 !vcpu->arch.apf.halted)
6599 || !list_empty_careful(&vcpu->async_pf.done) 6683 || !list_empty_careful(&vcpu->async_pf.done)
6600 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6684 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6601 || vcpu->arch.nmi_pending || 6685 || atomic_read(&vcpu->arch.nmi_queued) ||
6602 (kvm_arch_interrupt_allowed(vcpu) && 6686 (kvm_arch_interrupt_allowed(vcpu) &&
6603 kvm_cpu_has_interrupt(vcpu)); 6687 kvm_cpu_has_interrupt(vcpu));
6604} 6688}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index aace6b8691a2..f47fcd30273d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -371,6 +371,7 @@ struct kvm_s390_psw {
371#define KVM_S390_INT_VIRTIO 0xffff2603u 371#define KVM_S390_INT_VIRTIO 0xffff2603u
372#define KVM_S390_INT_SERVICE 0xffff2401u 372#define KVM_S390_INT_SERVICE 0xffff2401u
373#define KVM_S390_INT_EMERGENCY 0xffff1201u 373#define KVM_S390_INT_EMERGENCY 0xffff1201u
374#define KVM_S390_INT_EXTERNAL_CALL 0xffff1202u
374 375
375struct kvm_s390_interrupt { 376struct kvm_s390_interrupt {
376 __u32 type; 377 __u32 type;
@@ -463,7 +464,7 @@ struct kvm_ppc_pvinfo {
463#define KVM_CAP_VAPIC 6 464#define KVM_CAP_VAPIC 6
464#define KVM_CAP_EXT_CPUID 7 465#define KVM_CAP_EXT_CPUID 7
465#define KVM_CAP_CLOCKSOURCE 8 466#define KVM_CAP_CLOCKSOURCE 8
466#define KVM_CAP_NR_VCPUS 9 /* returns max vcpus per vm */ 467#define KVM_CAP_NR_VCPUS 9 /* returns recommended max vcpus per vm */
467#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */ 468#define KVM_CAP_NR_MEMSLOTS 10 /* returns max memory slots per vm */
468#define KVM_CAP_PIT 11 469#define KVM_CAP_PIT 11
469#define KVM_CAP_NOP_IO_DELAY 12 470#define KVM_CAP_NOP_IO_DELAY 12
@@ -553,6 +554,9 @@ struct kvm_ppc_pvinfo {
553#define KVM_CAP_SPAPR_TCE 63 554#define KVM_CAP_SPAPR_TCE 63
554#define KVM_CAP_PPC_SMT 64 555#define KVM_CAP_PPC_SMT 64
555#define KVM_CAP_PPC_RMA 65 556#define KVM_CAP_PPC_RMA 65
557#define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */
558#define KVM_CAP_PPC_HIOR 67
559#define KVM_CAP_PPC_PAPR 68
556#define KVM_CAP_S390_GMAP 71 560#define KVM_CAP_S390_GMAP 71
557 561
558#ifdef KVM_CAP_IRQ_ROUTING 562#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eabb21a30c34..d52623199978 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -18,6 +18,7 @@
18#include <linux/msi.h> 18#include <linux/msi.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/rcupdate.h> 20#include <linux/rcupdate.h>
21#include <linux/ratelimit.h>
21#include <asm/signal.h> 22#include <asm/signal.h>
22 23
23#include <linux/kvm.h> 24#include <linux/kvm.h>
@@ -48,6 +49,7 @@
48#define KVM_REQ_EVENT 11 49#define KVM_REQ_EVENT 11
49#define KVM_REQ_APF_HALT 12 50#define KVM_REQ_APF_HALT 12
50#define KVM_REQ_STEAL_UPDATE 13 51#define KVM_REQ_STEAL_UPDATE 13
52#define KVM_REQ_NMI 14
51 53
52#define KVM_USERSPACE_IRQ_SOURCE_ID 0 54#define KVM_USERSPACE_IRQ_SOURCE_ID 0
53 55
@@ -55,16 +57,16 @@ struct kvm;
55struct kvm_vcpu; 57struct kvm_vcpu;
56extern struct kmem_cache *kvm_vcpu_cache; 58extern struct kmem_cache *kvm_vcpu_cache;
57 59
58/* 60struct kvm_io_range {
59 * It would be nice to use something smarter than a linear search, TBD... 61 gpa_t addr;
60 * Thankfully we dont expect many devices to register (famous last words :), 62 int len;
61 * so until then it will suffice. At least its abstracted so we can change 63 struct kvm_io_device *dev;
62 * in one place. 64};
63 */ 65
64struct kvm_io_bus { 66struct kvm_io_bus {
65 int dev_count; 67 int dev_count;
66#define NR_IOBUS_DEVS 200 68#define NR_IOBUS_DEVS 300
67 struct kvm_io_device *devs[NR_IOBUS_DEVS]; 69 struct kvm_io_range range[NR_IOBUS_DEVS];
68}; 70};
69 71
70enum kvm_bus { 72enum kvm_bus {
@@ -77,8 +79,8 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
77 int len, const void *val); 79 int len, const void *val);
78int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, 80int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
79 void *val); 81 void *val);
80int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, 82int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
81 struct kvm_io_device *dev); 83 int len, struct kvm_io_device *dev);
82int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 84int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
83 struct kvm_io_device *dev); 85 struct kvm_io_device *dev);
84 86
@@ -256,8 +258,9 @@ struct kvm {
256 struct kvm_arch arch; 258 struct kvm_arch arch;
257 atomic_t users_count; 259 atomic_t users_count;
258#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 260#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
259 struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
260 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 261 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
262 spinlock_t ring_lock;
263 struct list_head coalesced_zones;
261#endif 264#endif
262 265
263 struct mutex irq_lock; 266 struct mutex irq_lock;
@@ -281,11 +284,8 @@ struct kvm {
281 284
282/* The guest did something we don't support. */ 285/* The guest did something we don't support. */
283#define pr_unimpl(vcpu, fmt, ...) \ 286#define pr_unimpl(vcpu, fmt, ...) \
284 do { \ 287 pr_err_ratelimited("kvm: %i: cpu%i " fmt, \
285 if (printk_ratelimit()) \ 288 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__)
286 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
287 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
288 } while (0)
289 289
290#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) 290#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
291#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) 291#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index eaf3a50f9769..3ad0925d23a9 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -58,8 +58,6 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
58static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) 58static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
59{ 59{
60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 60 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
61 u32 vector;
62 int index;
63 61
64 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { 62 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
65 spin_lock(&assigned_dev->intx_lock); 63 spin_lock(&assigned_dev->intx_lock);
@@ -68,31 +66,35 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
68 spin_unlock(&assigned_dev->intx_lock); 66 spin_unlock(&assigned_dev->intx_lock);
69 } 67 }
70 68
71 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 69 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
72 index = find_index_from_host_irq(assigned_dev, irq); 70 assigned_dev->guest_irq, 1);
73 if (index >= 0) { 71
74 vector = assigned_dev-> 72 return IRQ_HANDLED;
75 guest_msix_entries[index].vector; 73}
76 kvm_set_irq(assigned_dev->kvm, 74
77 assigned_dev->irq_source_id, vector, 1); 75#ifdef __KVM_HAVE_MSIX
78 } 76static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
79 } else 77{
78 struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
79 int index = find_index_from_host_irq(assigned_dev, irq);
80 u32 vector;
81
82 if (index >= 0) {
83 vector = assigned_dev->guest_msix_entries[index].vector;
80 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 84 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
81 assigned_dev->guest_irq, 1); 85 vector, 1);
86 }
82 87
83 return IRQ_HANDLED; 88 return IRQ_HANDLED;
84} 89}
90#endif
85 91
86/* Ack the irq line for an assigned device */ 92/* Ack the irq line for an assigned device */
87static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 93static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
88{ 94{
89 struct kvm_assigned_dev_kernel *dev; 95 struct kvm_assigned_dev_kernel *dev =
90 96 container_of(kian, struct kvm_assigned_dev_kernel,
91 if (kian->gsi == -1) 97 ack_notifier);
92 return;
93
94 dev = container_of(kian, struct kvm_assigned_dev_kernel,
95 ack_notifier);
96 98
97 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 99 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
98 100
@@ -110,8 +112,9 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
110static void deassign_guest_irq(struct kvm *kvm, 112static void deassign_guest_irq(struct kvm *kvm,
111 struct kvm_assigned_dev_kernel *assigned_dev) 113 struct kvm_assigned_dev_kernel *assigned_dev)
112{ 114{
113 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 115 if (assigned_dev->ack_notifier.gsi != -1)
114 assigned_dev->ack_notifier.gsi = -1; 116 kvm_unregister_irq_ack_notifier(kvm,
117 &assigned_dev->ack_notifier);
115 118
116 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 119 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
117 assigned_dev->guest_irq, 0); 120 assigned_dev->guest_irq, 0);
@@ -143,7 +146,7 @@ static void deassign_host_irq(struct kvm *kvm,
143 146
144 for (i = 0; i < assigned_dev->entries_nr; i++) 147 for (i = 0; i < assigned_dev->entries_nr; i++)
145 free_irq(assigned_dev->host_msix_entries[i].vector, 148 free_irq(assigned_dev->host_msix_entries[i].vector,
146 (void *)assigned_dev); 149 assigned_dev);
147 150
148 assigned_dev->entries_nr = 0; 151 assigned_dev->entries_nr = 0;
149 kfree(assigned_dev->host_msix_entries); 152 kfree(assigned_dev->host_msix_entries);
@@ -153,7 +156,7 @@ static void deassign_host_irq(struct kvm *kvm,
153 /* Deal with MSI and INTx */ 156 /* Deal with MSI and INTx */
154 disable_irq(assigned_dev->host_irq); 157 disable_irq(assigned_dev->host_irq);
155 158
156 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 159 free_irq(assigned_dev->host_irq, assigned_dev);
157 160
158 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 161 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
159 pci_disable_msi(assigned_dev->dev); 162 pci_disable_msi(assigned_dev->dev);
@@ -239,7 +242,7 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
239 * are going to be long delays in accepting, acking, etc. 242 * are going to be long delays in accepting, acking, etc.
240 */ 243 */
241 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, 244 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
242 IRQF_ONESHOT, dev->irq_name, (void *)dev)) 245 IRQF_ONESHOT, dev->irq_name, dev))
243 return -EIO; 246 return -EIO;
244 return 0; 247 return 0;
245} 248}
@@ -258,7 +261,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
258 261
259 dev->host_irq = dev->dev->irq; 262 dev->host_irq = dev->dev->irq;
260 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, 263 if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
261 0, dev->irq_name, (void *)dev)) { 264 0, dev->irq_name, dev)) {
262 pci_disable_msi(dev->dev); 265 pci_disable_msi(dev->dev);
263 return -EIO; 266 return -EIO;
264 } 267 }
@@ -284,8 +287,8 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
284 287
285 for (i = 0; i < dev->entries_nr; i++) { 288 for (i = 0; i < dev->entries_nr; i++) {
286 r = request_threaded_irq(dev->host_msix_entries[i].vector, 289 r = request_threaded_irq(dev->host_msix_entries[i].vector,
287 NULL, kvm_assigned_dev_thread, 290 NULL, kvm_assigned_dev_thread_msix,
288 0, dev->irq_name, (void *)dev); 291 0, dev->irq_name, dev);
289 if (r) 292 if (r)
290 goto err; 293 goto err;
291 } 294 }
@@ -293,7 +296,7 @@ static int assigned_device_enable_host_msix(struct kvm *kvm,
293 return 0; 296 return 0;
294err: 297err:
295 for (i -= 1; i >= 0; i--) 298 for (i -= 1; i >= 0; i--)
296 free_irq(dev->host_msix_entries[i].vector, (void *)dev); 299 free_irq(dev->host_msix_entries[i].vector, dev);
297 pci_disable_msix(dev->dev); 300 pci_disable_msix(dev->dev);
298 return r; 301 return r;
299} 302}
@@ -406,7 +409,8 @@ static int assign_guest_irq(struct kvm *kvm,
406 409
407 if (!r) { 410 if (!r) {
408 dev->irq_requested_type |= guest_irq_type; 411 dev->irq_requested_type |= guest_irq_type;
409 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 412 if (dev->ack_notifier.gsi != -1)
413 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
410 } else 414 } else
411 kvm_free_irq_source_id(kvm, dev->irq_source_id); 415 kvm_free_irq_source_id(kvm, dev->irq_source_id);
412 416
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index fc8487564d1f..a6ec206f36ba 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -24,10 +24,19 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, 24static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
25 gpa_t addr, int len) 25 gpa_t addr, int len)
26{ 26{
 27 struct kvm_coalesced_mmio_zone *zone; 27 /* is it in a batchable area?
28 * (addr,len) is fully included in
29 * (zone->addr, zone->size)
30 */
31
32 return (dev->zone.addr <= addr &&
33 addr + len <= dev->zone.addr + dev->zone.size);
34}
35
36static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
37{
28 struct kvm_coalesced_mmio_ring *ring; 38 struct kvm_coalesced_mmio_ring *ring;
29 unsigned avail; 39 unsigned avail;
30 int i;
31 40
32 /* Are we able to batch it ? */ 41 /* Are we able to batch it ? */
33 42
@@ -37,25 +46,12 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
37 */ 46 */
38 ring = dev->kvm->coalesced_mmio_ring; 47 ring = dev->kvm->coalesced_mmio_ring;
39 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX; 48 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
40 if (avail < KVM_MAX_VCPUS) { 49 if (avail == 0) {
41 /* full */ 50 /* full */
42 return 0; 51 return 0;
43 } 52 }
44 53
45 /* is it in a batchable area ? */ 54 return 1;
46
47 for (i = 0; i < dev->nb_zones; i++) {
48 zone = &dev->zone[i];
49
50 /* (addr,len) is fully included in
51 * (zone->addr, zone->size)
52 */
53
54 if (zone->addr <= addr &&
55 addr + len <= zone->addr + zone->size)
56 return 1;
57 }
58 return 0;
59} 55}
60 56
61static int coalesced_mmio_write(struct kvm_io_device *this, 57static int coalesced_mmio_write(struct kvm_io_device *this,
@@ -63,10 +59,16 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
63{ 59{
64 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 60 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
65 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 61 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
62
66 if (!coalesced_mmio_in_range(dev, addr, len)) 63 if (!coalesced_mmio_in_range(dev, addr, len))
67 return -EOPNOTSUPP; 64 return -EOPNOTSUPP;
68 65
69 spin_lock(&dev->lock); 66 spin_lock(&dev->kvm->ring_lock);
67
68 if (!coalesced_mmio_has_room(dev)) {
69 spin_unlock(&dev->kvm->ring_lock);
70 return -EOPNOTSUPP;
71 }
70 72
71 /* copy data in first free entry of the ring */ 73 /* copy data in first free entry of the ring */
72 74
@@ -75,7 +77,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
75 memcpy(ring->coalesced_mmio[ring->last].data, val, len); 77 memcpy(ring->coalesced_mmio[ring->last].data, val, len);
76 smp_wmb(); 78 smp_wmb();
77 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; 79 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
78 spin_unlock(&dev->lock); 80 spin_unlock(&dev->kvm->ring_lock);
79 return 0; 81 return 0;
80} 82}
81 83
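Each registered coalescing zone is now its own kvm_io_device, and all of them append to the single per-VM ring under kvm->ring_lock; the room check keeps one slot free so that first == last still unambiguously means "empty". The ring discipline in isolation, on a fixed-size buffer (lengths are assumed to fit the 8-byte data field, as in the KVM ring):

    #include <string.h>

    #define RING_MAX    64  /* stands in for KVM_COALESCED_MMIO_MAX */

    struct mmio_entry { unsigned long addr; int len; char data[8]; };

    struct mmio_ring {
        unsigned int first, last;           /* consumer / producer indices */
        struct mmio_entry slot[RING_MAX];
    };

    /* Returns 0 when the ring is full; one slot is always left free. */
    static int ring_append(struct mmio_ring *r, unsigned long addr,
                           const void *val, int len)
    {
        unsigned int avail = (r->first - r->last - 1) % RING_MAX;

        if (avail == 0)
            return 0;

        r->slot[r->last].addr = addr;
        r->slot[r->last].len = len;
        memcpy(r->slot[r->last].data, val, len);
        r->last = (r->last + 1) % RING_MAX;
        return 1;
    }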
@@ -83,6 +85,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
83{ 85{
84 struct kvm_coalesced_mmio_dev *dev = to_mmio(this); 86 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
85 87
88 list_del(&dev->list);
89
86 kfree(dev); 90 kfree(dev);
87} 91}
88 92
@@ -93,7 +97,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
93 97
94int kvm_coalesced_mmio_init(struct kvm *kvm) 98int kvm_coalesced_mmio_init(struct kvm *kvm)
95{ 99{
96 struct kvm_coalesced_mmio_dev *dev;
97 struct page *page; 100 struct page *page;
98 int ret; 101 int ret;
99 102
@@ -101,31 +104,18 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
101 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 104 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
102 if (!page) 105 if (!page)
103 goto out_err; 106 goto out_err;
104 kvm->coalesced_mmio_ring = page_address(page);
105
106 ret = -ENOMEM;
107 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
108 if (!dev)
109 goto out_free_page;
110 spin_lock_init(&dev->lock);
111 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
112 dev->kvm = kvm;
113 kvm->coalesced_mmio_dev = dev;
114 107
115 mutex_lock(&kvm->slots_lock); 108 ret = 0;
116 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev); 109 kvm->coalesced_mmio_ring = page_address(page);
117 mutex_unlock(&kvm->slots_lock);
118 if (ret < 0)
119 goto out_free_dev;
120 110
121 return ret; 111 /*
112 * We're using this spinlock to sync access to the coalesced ring.
 113 * The list doesn't need its own lock since device registration and
114 * unregistration should only happen when kvm->slots_lock is held.
115 */
116 spin_lock_init(&kvm->ring_lock);
117 INIT_LIST_HEAD(&kvm->coalesced_zones);
122 118
123out_free_dev:
124 kvm->coalesced_mmio_dev = NULL;
125 kfree(dev);
126out_free_page:
127 kvm->coalesced_mmio_ring = NULL;
128 __free_page(page);
129out_err: 119out_err:
130 return ret; 120 return ret;
131} 121}
@@ -139,51 +129,50 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
139int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 129int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
140 struct kvm_coalesced_mmio_zone *zone) 130 struct kvm_coalesced_mmio_zone *zone)
141{ 131{
142 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev; 132 int ret;
133 struct kvm_coalesced_mmio_dev *dev;
143 134
144 if (dev == NULL) 135 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
145 return -ENXIO; 136 if (!dev)
137 return -ENOMEM;
138
139 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
140 dev->kvm = kvm;
141 dev->zone = *zone;
146 142
147 mutex_lock(&kvm->slots_lock); 143 mutex_lock(&kvm->slots_lock);
148 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { 144 ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr,
149 mutex_unlock(&kvm->slots_lock); 145 zone->size, &dev->dev);
150 return -ENOBUFS; 146 if (ret < 0)
151 } 147 goto out_free_dev;
148 list_add_tail(&dev->list, &kvm->coalesced_zones);
149 mutex_unlock(&kvm->slots_lock);
152 150
153 dev->zone[dev->nb_zones] = *zone; 151 return ret;
154 dev->nb_zones++;
155 152
153out_free_dev:
156 mutex_unlock(&kvm->slots_lock); 154 mutex_unlock(&kvm->slots_lock);
155
156 kfree(dev);
157
158 if (dev == NULL)
159 return -ENXIO;
160
157 return 0; 161 return 0;
158} 162}
159 163
160int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, 164int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
161 struct kvm_coalesced_mmio_zone *zone) 165 struct kvm_coalesced_mmio_zone *zone)
162{ 166{
163 int i; 167 struct kvm_coalesced_mmio_dev *dev, *tmp;
164 struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
165 struct kvm_coalesced_mmio_zone *z;
166
167 if (dev == NULL)
168 return -ENXIO;
169 168
170 mutex_lock(&kvm->slots_lock); 169 mutex_lock(&kvm->slots_lock);
171 170
172 i = dev->nb_zones; 171 list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
173 while (i) { 172 if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
174 z = &dev->zone[i - 1]; 173 kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
175 174 kvm_iodevice_destructor(&dev->dev);
176 /* unregister all zones
177 * included in (zone->addr, zone->size)
178 */
179
180 if (zone->addr <= z->addr &&
181 z->addr + z->size <= zone->addr + zone->size) {
182 dev->nb_zones--;
183 *z = dev->zone[dev->nb_zones];
184 } 175 }
185 i--;
186 }
187 176
188 mutex_unlock(&kvm->slots_lock); 177 mutex_unlock(&kvm->slots_lock);
189 178
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 8a5959e3535f..b280c20444d1 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,14 +12,13 @@
12 12
13#ifdef CONFIG_KVM_MMIO 13#ifdef CONFIG_KVM_MMIO
14 14
15#define KVM_COALESCED_MMIO_ZONE_MAX 100 15#include <linux/list.h>
16 16
17struct kvm_coalesced_mmio_dev { 17struct kvm_coalesced_mmio_dev {
18 struct list_head list;
18 struct kvm_io_device dev; 19 struct kvm_io_device dev;
19 struct kvm *kvm; 20 struct kvm *kvm;
20 spinlock_t lock; 21 struct kvm_coalesced_mmio_zone zone;
21 int nb_zones;
22 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
23}; 22};
24 23
25int kvm_coalesced_mmio_init(struct kvm *kvm); 24int kvm_coalesced_mmio_init(struct kvm *kvm);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 73358d256fa2..f59c1e8de7a2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -586,7 +586,8 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
586 586
587 kvm_iodevice_init(&p->dev, &ioeventfd_ops); 587 kvm_iodevice_init(&p->dev, &ioeventfd_ops);
588 588
589 ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); 589 ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
590 &p->dev);
590 if (ret < 0) 591 if (ret < 0)
591 goto unlock_fail; 592 goto unlock_fail;
592 593
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 8df1ca104a7f..3eed61eb4867 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -394,7 +394,8 @@ int kvm_ioapic_init(struct kvm *kvm)
         kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
         ioapic->kvm = kvm;
         mutex_lock(&kvm->slots_lock);
-        ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+        ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+                                      IOAPIC_MEM_LENGTH, &ioapic->dev);
         mutex_unlock(&kvm->slots_lock);
         if (ret < 0) {
                 kvm->arch.vioapic = NULL;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aefdda390f5e..d9cfb782cb81 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,8 @@
 #include <linux/srcu.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -2391,24 +2393,92 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
         int i;
 
         for (i = 0; i < bus->dev_count; i++) {
-                struct kvm_io_device *pos = bus->devs[i];
+                struct kvm_io_device *pos = bus->range[i].dev;
 
                 kvm_iodevice_destructor(pos);
         }
         kfree(bus);
 }
 
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+        const struct kvm_io_range *r1 = p1;
+        const struct kvm_io_range *r2 = p2;
+
+        if (r1->addr < r2->addr)
+                return -1;
+        if (r1->addr + r1->len > r2->addr + r2->len)
+                return 1;
+        return 0;
+}
+
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+                          gpa_t addr, int len)
+{
+        if (bus->dev_count == NR_IOBUS_DEVS)
+                return -ENOSPC;
+
+        bus->range[bus->dev_count++] = (struct kvm_io_range) {
+                .addr = addr,
+                .len = len,
+                .dev = dev,
+        };
+
+        sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+                kvm_io_bus_sort_cmp, NULL);
+
+        return 0;
+}
+
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+                             gpa_t addr, int len)
+{
+        struct kvm_io_range *range, key;
+        int off;
+
+        key = (struct kvm_io_range) {
+                .addr = addr,
+                .len = len,
+        };
+
+        range = bsearch(&key, bus->range, bus->dev_count,
+                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+        if (range == NULL)
+                return -ENOENT;
+
+        off = range - bus->range;
+
+        while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+                off--;
+
+        return off;
+}
+
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                      int len, const void *val)
 {
-        int i;
+        int idx;
         struct kvm_io_bus *bus;
+        struct kvm_io_range range;
+
+        range = (struct kvm_io_range) {
+                .addr = addr,
+                .len = len,
+        };
 
         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-        for (i = 0; i < bus->dev_count; i++)
-                if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+        idx = kvm_io_bus_get_first_dev(bus, addr, len);
+        if (idx < 0)
+                return -EOPNOTSUPP;
+
+        while (idx < bus->dev_count &&
+                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+                if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
                         return 0;
+                idx++;
+        }
+
         return -EOPNOTSUPP;
 }
 
@@ -2416,19 +2486,33 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                     int len, void *val)
 {
-        int i;
+        int idx;
         struct kvm_io_bus *bus;
+        struct kvm_io_range range;
+
+        range = (struct kvm_io_range) {
+                .addr = addr,
+                .len = len,
+        };
 
         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-        for (i = 0; i < bus->dev_count; i++)
-                if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+        idx = kvm_io_bus_get_first_dev(bus, addr, len);
+        if (idx < 0)
+                return -EOPNOTSUPP;
+
+        while (idx < bus->dev_count &&
+                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+                if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
                         return 0;
+                idx++;
+        }
+
         return -EOPNOTSUPP;
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                            struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                            int len, struct kvm_io_device *dev)
 {
         struct kvm_io_bus *new_bus, *bus;
 
@@ -2440,7 +2524,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
         if (!new_bus)
                 return -ENOMEM;
         memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-        new_bus->devs[new_bus->dev_count++] = dev;
+        kvm_io_bus_insert_dev(new_bus, dev, addr, len);
         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
         synchronize_srcu_expedited(&kvm->srcu);
         kfree(bus);
@@ -2464,9 +2548,13 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
         r = -ENOENT;
         for (i = 0; i < new_bus->dev_count; i++)
-                if (new_bus->devs[i] == dev) {
+                if (new_bus->range[i].dev == dev) {
                         r = 0;
-                        new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
+                        new_bus->dev_count--;
+                        new_bus->range[i] = new_bus->range[new_bus->dev_count];
+                        sort(new_bus->range, new_bus->dev_count,
+                                sizeof(struct kvm_io_range),
+                                kvm_io_bus_sort_cmp, NULL);
                         break;
                 }
 
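The kvm_main.c changes above replace the linear scan of bus->devs[] with a bus->range[] array that kvm_io_bus_insert_dev() keeps sorted and that lookups probe with bsearch(); kvm_io_bus_get_first_dev() then walks backwards so the first of several matching entries is tried first. Below is a self-contained userspace sketch of that lookup scheme, with made-up device names and libc qsort()/bsearch() standing in for the kernel helpers; it is an illustration of the idea, not code from this patch set.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct io_range {
        uint64_t addr;
        int len;
        const char *name;       /* stands in for the kvm_io_device pointer */
};

/* Same ordering rule as kvm_io_bus_sort_cmp: a lookup key compares
 * "equal" to any registered range that fully contains it. */
static int range_cmp(const void *p1, const void *p2)
{
        const struct io_range *r1 = p1, *r2 = p2;

        if (r1->addr < r2->addr)
                return -1;
        if (r1->addr + r1->len > r2->addr + r2->len)
                return 1;
        return 0;
}

static int first_match(struct io_range *tbl, int n, uint64_t addr, int len)
{
        struct io_range key = { .addr = addr, .len = len };
        struct io_range *hit = bsearch(&key, tbl, n, sizeof(key), range_cmp);
        int off;

        if (!hit)
                return -1;
        off = hit - tbl;
        /* bsearch may land on any matching entry; walk back to the first. */
        while (off > 0 && range_cmp(&key, &tbl[off - 1]) == 0)
                off--;
        return off;
}

int main(void)
{
        struct io_range tbl[] = {
                { 0xfee00000, 0x1000, "lapic"  },
                { 0xfec00000, 0x100,  "ioapic" },
                { 0xd0000000, 0x4000, "mmio-A" },
        };
        int n = sizeof(tbl) / sizeof(tbl[0]);

        qsort(tbl, n, sizeof(tbl[0]), range_cmp);       /* sort() in the kernel */

        int idx = first_match(tbl, n, 0xfec00010, 4);
        printf("%s\n", idx >= 0 ? tbl[idx].name : "no device");
        return 0;
}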