author    Linus Torvalds <torvalds@linux-foundation.org>  2011-10-30 18:36:45 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-10-30 18:36:45 -0400
commit    1bc87b00556e8f7ba30a1010471951c5b8f71114 (patch)
tree      e73c2d187e2dff0df97ed82e32b45e362b923117 /arch
parent    acff987d94cbdb4049f3706bed1f1792f8ef6837 (diff)
parent    f1c1da2bde712812a3e0f9a7a7ebe7a916a4b5f4 (diff)
Merge branch 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates/3.2' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (75 commits)
  KVM: SVM: Keep intercepting task switching with NPT enabled
  KVM: s390: implement sigp external call
  KVM: s390: fix register setting
  KVM: s390: fix return value of kvm_arch_init_vm
  KVM: s390: check cpu_id prior to using it
  KVM: emulate lapic tsc deadline timer for guest
  x86: TSC deadline definitions
  KVM: Fix simultaneous NMIs
  KVM: x86 emulator: convert push %sreg/pop %sreg to direct decode
  KVM: x86 emulator: switch lds/les/lss/lfs/lgs to direct decode
  KVM: x86 emulator: streamline decode of segment registers
  KVM: x86 emulator: simplify OpMem64 decode
  KVM: x86 emulator: switch src decode to decode_operand()
  KVM: x86 emulator: qualify OpReg inhibit_byte_regs hack
  KVM: x86 emulator: switch OpImmUByte decode to decode_imm()
  KVM: x86 emulator: free up some flag bits near src, dst
  KVM: x86 emulator: switch src2 to generic decode_operand()
  KVM: x86 emulator: expand decode flags to 64 bits
  KVM: x86 emulator: split dst decode to a generic decode_operand()
  KVM: x86 emulator: move memop, memopp into emulation context
  ...
Diffstat (limited to 'arch')
-rw-r--r--  arch/powerpc/include/asm/kvm.h | 13
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h | 40
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_asm.h | 2
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 30
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 1
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 13
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 10
-rw-r--r--  arch/powerpc/kvm/44x.c | 2
-rw-r--r--  arch/powerpc/kvm/Makefile | 4
-rw-r--r--  arch/powerpc/kvm/book3s_32_sr.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu.c | 8
-rw-r--r--  arch/powerpc/kvm/book3s_64_slb.S | 2
-rw-r--r--  arch/powerpc/kvm/book3s_emulate.c | 29
-rw-r--r--  arch/powerpc/kvm/book3s_exports.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 343
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 33
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 300
-rw-r--r--  arch/powerpc/kvm/book3s_interrupts.S | 129
-rw-r--r--  arch/powerpc/kvm/book3s_pr.c | 58
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c | 158
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 54
-rw-r--r--  arch/powerpc/kvm/book3s_segment.S | 117
-rw-r--r--  arch/powerpc/kvm/booke.c | 10
-rw-r--r--  arch/powerpc/kvm/e500.c | 2
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 55
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 7
-rw-r--r--  arch/s390/kvm/interrupt.c | 30
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 20
-rw-r--r--  arch/s390/kvm/sigp.c | 45
-rw-r--r--  arch/x86/include/asm/apicdef.h | 2
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 4
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 14
-rw-r--r--  arch/x86/include/asm/msr-index.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 12
-rw-r--r--  arch/x86/kvm/emulate.c | 867
-rw-r--r--  arch/x86/kvm/i8254.c | 6
-rw-r--r--  arch/x86/kvm/i8259.c | 123
-rw-r--r--  arch/x86/kvm/irq.h | 4
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 7
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 2
-rw-r--r--  arch/x86/kvm/lapic.c | 167
-rw-r--r--  arch/x86/kvm/lapic.h | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 5
-rw-r--r--  arch/x86/kvm/mmu_audit.c | 6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 24
-rw-r--r--  arch/x86/kvm/svm.c | 93
-rw-r--r--  arch/x86/kvm/trace.h | 118
-rw-r--r--  arch/x86/kvm/vmx.c | 131
-rw-r--r--  arch/x86/kvm/x86.c | 274
50 files changed, 2155 insertions(+), 1232 deletions(-)
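To reproduce the listing above locally, here is a minimal sketch using only the hashes from the commit header, assuming a clone that already contains this merge; output may differ slightly from the web view.

# List the commits brought in by this merge (first parent .. second parent);
# this should show roughly the 75 commits summarized above.
git log --oneline --no-merges acff987d94cbdb4049f3706bed1f1792f8ef6837..f1c1da2bde712812a3e0f9a7a7ebe7a916a4b5f4

# Regenerate the diffstat against the first parent, limited to arch/ as on this page.
git diff --stat acff987d94cbdb4049f3706bed1f1792f8ef6837 1bc87b00556e8f7ba30a1010471951c5b8f71114 -- arch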
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a4f6c85431f8..08fe69edcd10 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -149,6 +149,12 @@ struct kvm_regs {
149#define KVM_SREGS_E_UPDATE_DBSR (1 << 3) 149#define KVM_SREGS_E_UPDATE_DBSR (1 << 3)
150 150
151/* 151/*
152 * Book3S special bits to indicate contents in the struct by maintaining
153 * backwards compatibility with older structs. If adding a new field,
154 * please make sure to add a flag for that new field */
155#define KVM_SREGS_S_HIOR (1 << 0)
156
157/*
152 * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a 158 * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a
153 * previous KVM_GET_REGS. 159 * previous KVM_GET_REGS.
154 * 160 *
@@ -173,6 +179,8 @@ struct kvm_sregs {
173 __u64 ibat[8]; 179 __u64 ibat[8];
174 __u64 dbat[8]; 180 __u64 dbat[8];
175 } ppc32; 181 } ppc32;
182 __u64 flags; /* KVM_SREGS_S_ */
183 __u64 hior;
176 } s; 184 } s;
177 struct { 185 struct {
178 union { 186 union {
@@ -276,6 +284,11 @@ struct kvm_guest_debug_arch {
276#define KVM_INTERRUPT_UNSET -2U 284#define KVM_INTERRUPT_UNSET -2U
277#define KVM_INTERRUPT_SET_LEVEL -3U 285#define KVM_INTERRUPT_SET_LEVEL -3U
278 286
287#define KVM_CPU_440 1
288#define KVM_CPU_E500V2 2
289#define KVM_CPU_3S_32 3
290#define KVM_CPU_3S_64 4
291
279/* for KVM_CAP_SPAPR_TCE */ 292/* for KVM_CAP_SPAPR_TCE */
280struct kvm_create_spapr_tce { 293struct kvm_create_spapr_tce {
281 __u64 liobn; 294 __u64 liobn;
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 98da010252a3..a384ffdf33de 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s {
90#endif 90#endif
91 int context_id[SID_CONTEXTS]; 91 int context_id[SID_CONTEXTS];
92 92
93 bool hior_sregs; /* HIOR is set by SREGS, not PVR */
94
93 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 95 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
94 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; 96 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
95 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 97 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
@@ -139,15 +141,14 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
139extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 141extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
140extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); 142extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
141 143
142extern void kvmppc_handler_lowmem_trampoline(void); 144extern void kvmppc_entry_trampoline(void);
143extern void kvmppc_handler_trampoline_enter(void);
144extern void kvmppc_rmcall(ulong srr0, ulong srr1);
145extern void kvmppc_hv_entry_trampoline(void); 145extern void kvmppc_hv_entry_trampoline(void);
146extern void kvmppc_load_up_fpu(void); 146extern void kvmppc_load_up_fpu(void);
147extern void kvmppc_load_up_altivec(void); 147extern void kvmppc_load_up_altivec(void);
148extern void kvmppc_load_up_vsx(void); 148extern void kvmppc_load_up_vsx(void);
149extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst); 149extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
150extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst); 150extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
151extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
151 152
152static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) 153static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
153{ 154{
@@ -382,6 +383,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
382} 383}
383#endif 384#endif
384 385
386static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
387 unsigned long pte_index)
388{
389 unsigned long rb, va_low;
390
391 rb = (v & ~0x7fUL) << 16; /* AVA field */
392 va_low = pte_index >> 3;
393 if (v & HPTE_V_SECONDARY)
394 va_low = ~va_low;
395 /* xor vsid from AVA */
396 if (!(v & HPTE_V_1TB_SEG))
397 va_low ^= v >> 12;
398 else
399 va_low ^= v >> 24;
400 va_low &= 0x7ff;
401 if (v & HPTE_V_LARGE) {
402 rb |= 1; /* L field */
403 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
404 (r & 0xff000)) {
405 /* non-16MB large page, must be 64k */
406 /* (masks depend on page size) */
407 rb |= 0x1000; /* page encoding in LP field */
408 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
409 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
410 }
411 } else {
412 /* 4kB page */
413 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
414 }
415 rb |= (v >> 54) & 0x300; /* B field */
416 return rb;
417}
418
385/* Magic register values loaded into r3 and r4 before the 'sc' assembly 419/* Magic register values loaded into r3 and r4 before the 'sc' assembly
386 * instruction for the OSI hypercalls */ 420 * instruction for the OSI hypercalls */
387#define OSI_SC_MAGIC_R3 0x113724FA 421#define OSI_SC_MAGIC_R3 0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index ef7b3688c3b6..1f2f5b6156bd 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -75,6 +75,8 @@ struct kvmppc_host_state {
75 ulong scratch0; 75 ulong scratch0;
76 ulong scratch1; 76 ulong scratch1;
77 u8 in_guest; 77 u8 in_guest;
78 u8 restore_hid5;
79 u8 napping;
78 80
79#ifdef CONFIG_KVM_BOOK3S_64_HV 81#ifdef CONFIG_KVM_BOOK3S_64_HV
80 struct kvm_vcpu *kvm_vcpu; 82 struct kvm_vcpu *kvm_vcpu;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index cc22b282d755..bf8af5d5d5dc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -198,21 +198,29 @@ struct kvm_arch {
198 */ 198 */
199struct kvmppc_vcore { 199struct kvmppc_vcore {
200 int n_runnable; 200 int n_runnable;
201 int n_blocked; 201 int n_busy;
202 int num_threads; 202 int num_threads;
203 int entry_exit_count; 203 int entry_exit_count;
204 int n_woken; 204 int n_woken;
205 int nap_count; 205 int nap_count;
206 int napping_threads;
206 u16 pcpu; 207 u16 pcpu;
207 u8 vcore_running; 208 u8 vcore_state;
208 u8 in_guest; 209 u8 in_guest;
209 struct list_head runnable_threads; 210 struct list_head runnable_threads;
210 spinlock_t lock; 211 spinlock_t lock;
212 wait_queue_head_t wq;
211}; 213};
212 214
213#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) 215#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff)
214#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) 216#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8)
215 217
218/* Values for vcore_state */
219#define VCORE_INACTIVE 0
220#define VCORE_RUNNING 1
221#define VCORE_EXITING 2
222#define VCORE_SLEEPING 3
223
216struct kvmppc_pte { 224struct kvmppc_pte {
217 ulong eaddr; 225 ulong eaddr;
218 u64 vpage; 226 u64 vpage;
@@ -258,14 +266,6 @@ struct kvm_vcpu_arch {
258 ulong host_stack; 266 ulong host_stack;
259 u32 host_pid; 267 u32 host_pid;
260#ifdef CONFIG_PPC_BOOK3S 268#ifdef CONFIG_PPC_BOOK3S
261 ulong host_msr;
262 ulong host_r2;
263 void *host_retip;
264 ulong trampoline_lowmem;
265 ulong trampoline_enter;
266 ulong highmem_handler;
267 ulong rmcall;
268 ulong host_paca_phys;
269 struct kvmppc_slb slb[64]; 269 struct kvmppc_slb slb[64];
270 int slb_max; /* 1 + index of last valid entry in slb[] */ 270 int slb_max; /* 1 + index of last valid entry in slb[] */
271 int slb_nr; /* total number of entries in SLB */ 271 int slb_nr; /* total number of entries in SLB */
@@ -389,6 +389,9 @@ struct kvm_vcpu_arch {
389 u8 dcr_is_write; 389 u8 dcr_is_write;
390 u8 osi_needed; 390 u8 osi_needed;
391 u8 osi_enabled; 391 u8 osi_enabled;
392 u8 papr_enabled;
393 u8 sane;
394 u8 cpu_type;
392 u8 hcall_needed; 395 u8 hcall_needed;
393 396
394 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ 397 u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -408,11 +411,13 @@ struct kvm_vcpu_arch {
408 struct dtl *dtl; 411 struct dtl *dtl;
409 struct dtl *dtl_end; 412 struct dtl *dtl_end;
410 413
414 wait_queue_head_t *wqp;
411 struct kvmppc_vcore *vcore; 415 struct kvmppc_vcore *vcore;
412 int ret; 416 int ret;
413 int trap; 417 int trap;
414 int state; 418 int state;
415 int ptid; 419 int ptid;
420 bool timer_running;
416 wait_queue_head_t cpu_run; 421 wait_queue_head_t cpu_run;
417 422
418 struct kvm_vcpu_arch_shared *shared; 423 struct kvm_vcpu_arch_shared *shared;
@@ -428,8 +433,9 @@ struct kvm_vcpu_arch {
428#endif 433#endif
429}; 434};
430 435
431#define KVMPPC_VCPU_BUSY_IN_HOST 0 436/* Values for vcpu->arch.state */
432#define KVMPPC_VCPU_BLOCKED 1 437#define KVMPPC_VCPU_STOPPED 0
438#define KVMPPC_VCPU_BUSY_IN_HOST 1
433#define KVMPPC_VCPU_RUNNABLE 2 439#define KVMPPC_VCPU_RUNNABLE 2
434 440
435#endif /* __POWERPC_KVM_HOST_H__ */ 441#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index d121f49d62b8..46efd1a265c9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); 66extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); 67extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 68extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
69extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
69 70
70/* Core-specific hooks */ 71/* Core-specific hooks */
71 72
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5f078bc2063e..69f7ffe7f674 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -44,6 +44,7 @@
44#include <asm/compat.h> 44#include <asm/compat.h>
45#include <asm/mmu.h> 45#include <asm/mmu.h>
46#include <asm/hvcall.h> 46#include <asm/hvcall.h>
47#include <asm/xics.h>
47#endif 48#endif
48#ifdef CONFIG_PPC_ISERIES 49#ifdef CONFIG_PPC_ISERIES
49#include <asm/iseries/alpaca.h> 50#include <asm/iseries/alpaca.h>
@@ -449,8 +450,6 @@ int main(void)
449#ifdef CONFIG_PPC_BOOK3S 450#ifdef CONFIG_PPC_BOOK3S
450 DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); 451 DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
451 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); 452 DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
452 DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
453 DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
454 DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); 453 DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
455 DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); 454 DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
456 DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); 455 DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
@@ -458,14 +457,12 @@ int main(void)
458 DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); 457 DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
459 DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); 458 DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
460 DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); 459 DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
461 DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
462 DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
463 DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
464 DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
465 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); 460 DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
466 DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); 461 DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
467 DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); 462 DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
468 DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); 463 DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
464 DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
465 DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
469 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); 466 DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
470 DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); 467 DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
471 DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); 468 DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
@@ -481,6 +478,7 @@ int main(void)
481 DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); 478 DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
482 DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); 479 DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
483 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); 480 DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
481 DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
484 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - 482 DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
485 offsetof(struct kvmppc_vcpu_book3s, vcpu)); 483 offsetof(struct kvmppc_vcpu_book3s, vcpu));
486 DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); 484 DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
@@ -537,6 +535,8 @@ int main(void)
537 HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); 535 HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
538 HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); 536 HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
539 HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); 537 HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
538 HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
539 HSTATE_FIELD(HSTATE_NAPPING, napping);
540 540
541#ifdef CONFIG_KVM_BOOK3S_64_HV 541#ifdef CONFIG_KVM_BOOK3S_64_HV
542 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); 542 HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
@@ -549,6 +549,7 @@ int main(void)
549 HSTATE_FIELD(HSTATE_DSCR, host_dscr); 549 HSTATE_FIELD(HSTATE_DSCR, host_dscr);
550 HSTATE_FIELD(HSTATE_DABR, dabr); 550 HSTATE_FIELD(HSTATE_DABR, dabr);
551 HSTATE_FIELD(HSTATE_DECEXP, dec_expires); 551 HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
552 DEFINE(IPI_PRIORITY, IPI_PRIORITY);
552#endif /* CONFIG_KVM_BOOK3S_64_HV */ 553#endif /* CONFIG_KVM_BOOK3S_64_HV */
553 554
554#else /* CONFIG_PPC_BOOK3S */ 555#else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 41b02c792aa3..29ddd8b1c274 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -427,16 +427,6 @@ slb_miss_user_pseries:
427 b . /* prevent spec. execution */ 427 b . /* prevent spec. execution */
428#endif /* __DISABLED__ */ 428#endif /* __DISABLED__ */
429 429
430/* KVM's trampoline code needs to be close to the interrupt handlers */
431
432#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
433#ifdef CONFIG_KVM_BOOK3S_PR
434#include "../kvm/book3s_rmhandlers.S"
435#else
436#include "../kvm/book3s_hv_rmhandlers.S"
437#endif
438#endif
439
440 .align 7 430 .align 7
441 .globl __end_interrupts 431 .globl __end_interrupts
442__end_interrupts: 432__end_interrupts:
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index da3a1225c0ac..ca1f88b3dc59 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
78 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) 78 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
79 vcpu_44x->shadow_refs[i].gtlb_index = -1; 79 vcpu_44x->shadow_refs[i].gtlb_index = -1;
80 80
81 vcpu->arch.cpu_type = KVM_CPU_440;
82
81 return 0; 83 return 0;
82} 84}
83 85
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 08428e2c188d..3688aeecc4b2 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -43,18 +43,22 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
43 fpu.o \ 43 fpu.o \
44 book3s_paired_singles.o \ 44 book3s_paired_singles.o \
45 book3s_pr.o \ 45 book3s_pr.o \
46 book3s_pr_papr.o \
46 book3s_emulate.o \ 47 book3s_emulate.o \
47 book3s_interrupts.o \ 48 book3s_interrupts.o \
48 book3s_mmu_hpte.o \ 49 book3s_mmu_hpte.o \
49 book3s_64_mmu_host.o \ 50 book3s_64_mmu_host.o \
50 book3s_64_mmu.o \ 51 book3s_64_mmu.o \
51 book3s_32_mmu.o 52 book3s_32_mmu.o
53kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
54 book3s_rmhandlers.o
52 55
53kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 56kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
54 book3s_hv.o \ 57 book3s_hv.o \
55 book3s_hv_interrupts.o \ 58 book3s_hv_interrupts.o \
56 book3s_64_mmu_hv.o 59 book3s_64_mmu_hv.o
57kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ 60kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
61 book3s_hv_rmhandlers.o \
58 book3s_hv_rm_mmu.o \ 62 book3s_hv_rm_mmu.o \
59 book3s_64_vio_hv.o \ 63 book3s_64_vio_hv.o \
60 book3s_hv_builtin.o 64 book3s_hv_builtin.o
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index 3608471ad2d8..7e06a6fc8d07 100644
--- a/arch/powerpc/kvm/book3s_32_sr.S
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -31,7 +31,7 @@
31 * R1 = host R1 31 * R1 = host R1
32 * R2 = host R2 32 * R2 = host R2
33 * R3 = shadow vcpu 33 * R3 = shadow vcpu
34 * all other volatile GPRS = free 34 * all other volatile GPRS = free except R4, R6
35 * SVCPU[CR] = guest CR 35 * SVCPU[CR] = guest CR
36 * SVCPU[XER] = guest XER 36 * SVCPU[XER] = guest XER
37 * SVCPU[CTR] = guest CTR 37 * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index c6d3e194b6b4..b871721c0050 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -128,7 +128,13 @@ static hva_t kvmppc_mmu_book3s_64_get_pteg(
128 dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", 128 dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
129 page, vcpu_book3s->sdr1, pteg, slbe->vsid); 129 page, vcpu_book3s->sdr1, pteg, slbe->vsid);
130 130
131 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); 131 /* When running a PAPR guest, SDR1 contains a HVA address instead
132 of a GPA */
133 if (vcpu_book3s->vcpu.arch.papr_enabled)
134 r = pteg;
135 else
136 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
137
132 if (kvm_is_error_hva(r)) 138 if (kvm_is_error_hva(r))
133 return r; 139 return r;
134 return r | (pteg & ~PAGE_MASK); 140 return r | (pteg & ~PAGE_MASK);
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 04e7d3bbfe8b..f2e6e48ea463 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -53,7 +53,7 @@ slb_exit_skip_ ## num:
53 * R1 = host R1 53 * R1 = host R1
54 * R2 = host R2 54 * R2 = host R2
55 * R3 = shadow vcpu 55 * R3 = shadow vcpu
56 * all other volatile GPRS = free 56 * all other volatile GPRS = free except R4, R6
57 * SVCPU[CR] = guest CR 57 * SVCPU[CR] = guest CR
58 * SVCPU[XER] = guest XER 58 * SVCPU[XER] = guest XER
59 * SVCPU[CTR] = guest CTR 59 * SVCPU[CTR] = guest CTR
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 466846557089..0c9dc62532d0 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -63,6 +63,25 @@
63 * function pointers, so let's just disable the define. */ 63 * function pointers, so let's just disable the define. */
64#undef mfsrin 64#undef mfsrin
65 65
66enum priv_level {
67 PRIV_PROBLEM = 0,
68 PRIV_SUPER = 1,
69 PRIV_HYPER = 2,
70};
71
72static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
73{
74 /* PAPR VMs only access supervisor SPRs */
75 if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
76 return false;
77
78 /* Limit user space to its own small SPR set */
79 if ((vcpu->arch.shared->msr & MSR_PR) && level > PRIV_PROBLEM)
80 return false;
81
82 return true;
83}
84
66int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 85int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
67 unsigned int inst, int *advance) 86 unsigned int inst, int *advance)
68{ 87{
@@ -296,6 +315,8 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
296 315
297 switch (sprn) { 316 switch (sprn) {
298 case SPRN_SDR1: 317 case SPRN_SDR1:
318 if (!spr_allowed(vcpu, PRIV_HYPER))
319 goto unprivileged;
299 to_book3s(vcpu)->sdr1 = spr_val; 320 to_book3s(vcpu)->sdr1 = spr_val;
300 break; 321 break;
301 case SPRN_DSISR: 322 case SPRN_DSISR:
@@ -390,6 +411,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
390 case SPRN_PMC4_GEKKO: 411 case SPRN_PMC4_GEKKO:
391 case SPRN_WPAR_GEKKO: 412 case SPRN_WPAR_GEKKO:
392 break; 413 break;
414unprivileged:
393 default: 415 default:
394 printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); 416 printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
395#ifndef DEBUG_SPR 417#ifndef DEBUG_SPR
@@ -421,6 +443,8 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
421 break; 443 break;
422 } 444 }
423 case SPRN_SDR1: 445 case SPRN_SDR1:
446 if (!spr_allowed(vcpu, PRIV_HYPER))
447 goto unprivileged;
424 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); 448 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
425 break; 449 break;
426 case SPRN_DSISR: 450 case SPRN_DSISR:
@@ -449,6 +473,10 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
449 case SPRN_HID5: 473 case SPRN_HID5:
450 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]); 474 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hid[5]);
451 break; 475 break;
476 case SPRN_CFAR:
477 case SPRN_PURR:
478 kvmppc_set_gpr(vcpu, rt, 0);
479 break;
452 case SPRN_GQR0: 480 case SPRN_GQR0:
453 case SPRN_GQR1: 481 case SPRN_GQR1:
454 case SPRN_GQR2: 482 case SPRN_GQR2:
@@ -476,6 +504,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
476 kvmppc_set_gpr(vcpu, rt, 0); 504 kvmppc_set_gpr(vcpu, rt, 0);
477 break; 505 break;
478 default: 506 default:
507unprivileged:
479 printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); 508 printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
480#ifndef DEBUG_SPR 509#ifndef DEBUG_SPR
481 emulated = EMULATE_FAIL; 510 emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 88c8f26add02..f7f63a00ab1f 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -23,9 +23,7 @@
23#ifdef CONFIG_KVM_BOOK3S_64_HV 23#ifdef CONFIG_KVM_BOOK3S_64_HV
24EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); 24EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
25#else 25#else
26EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter); 26EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
27EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
28EXPORT_SYMBOL_GPL(kvmppc_rmcall);
29EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); 27EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
30#ifdef CONFIG_ALTIVEC 28#ifdef CONFIG_ALTIVEC
31EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); 29EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cc0d7f1b19ab..4644c7986d80 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -62,6 +62,8 @@
62/* #define EXIT_DEBUG_SIMPLE */ 62/* #define EXIT_DEBUG_SIMPLE */
63/* #define EXIT_DEBUG_INT */ 63/* #define EXIT_DEBUG_INT */
64 64
65static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
66
65void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 67void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
66{ 68{
67 local_paca->kvm_hstate.kvm_vcpu = vcpu; 69 local_paca->kvm_hstate.kvm_vcpu = vcpu;
@@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
72{ 74{
73} 75}
74 76
75static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
76static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
77
78void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
79{
80 u64 now;
81 unsigned long dec_nsec;
82
83 now = get_tb();
84 if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
85 kvmppc_core_queue_dec(vcpu);
86 if (vcpu->arch.pending_exceptions)
87 return;
88 if (vcpu->arch.dec_expires != ~(u64)0) {
89 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
90 tb_ticks_per_sec;
91 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
92 HRTIMER_MODE_REL);
93 }
94
95 kvmppc_vcpu_blocked(vcpu);
96
97 kvm_vcpu_block(vcpu);
98 vcpu->stat.halt_wakeup++;
99
100 if (vcpu->arch.dec_expires != ~(u64)0)
101 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
102
103 kvmppc_vcpu_unblocked(vcpu);
104}
105
106void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) 77void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
107{ 78{
108 vcpu->arch.shregs.msr = msr; 79 vcpu->arch.shregs.msr = msr;
80 kvmppc_end_cede(vcpu);
109} 81}
110 82
111void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) 83void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
257 229
258 switch (req) { 230 switch (req) {
259 case H_CEDE: 231 case H_CEDE:
260 vcpu->arch.shregs.msr |= MSR_EE;
261 vcpu->arch.ceded = 1;
262 smp_mb();
263 if (!vcpu->arch.prodded)
264 kvmppc_vcpu_block(vcpu);
265 else
266 vcpu->arch.prodded = 0;
267 smp_mb();
268 vcpu->arch.ceded = 0;
269 break; 232 break;
270 case H_PROD: 233 case H_PROD:
271 target = kvmppc_get_gpr(vcpu, 4); 234 target = kvmppc_get_gpr(vcpu, 4);
@@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
388 break; 351 break;
389 } 352 }
390 353
391
392 if (!(r & RESUME_HOST)) {
393 /* To avoid clobbering exit_reason, only check for signals if
394 * we aren't already exiting to userspace for some other
395 * reason. */
396 if (signal_pending(tsk)) {
397 vcpu->stat.signal_exits++;
398 run->exit_reason = KVM_EXIT_INTR;
399 r = -EINTR;
400 } else {
401 kvmppc_core_deliver_interrupts(vcpu);
402 }
403 }
404
405 return r; 354 return r;
406} 355}
407 356
@@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
479 kvmppc_mmu_book3s_hv_init(vcpu); 428 kvmppc_mmu_book3s_hv_init(vcpu);
480 429
481 /* 430 /*
482 * Some vcpus may start out in stopped state. If we initialize 431 * We consider the vcpu stopped until we see the first run ioctl for it.
483 * them to busy-in-host state they will stop other vcpus in the
484 * vcore from running. Instead we initialize them to blocked
485 * state, effectively considering them to be stopped until we
486 * see the first run ioctl for them.
487 */ 432 */
488 vcpu->arch.state = KVMPPC_VCPU_BLOCKED; 433 vcpu->arch.state = KVMPPC_VCPU_STOPPED;
489 434
490 init_waitqueue_head(&vcpu->arch.cpu_run); 435 init_waitqueue_head(&vcpu->arch.cpu_run);
491 436
@@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
496 if (vcore) { 441 if (vcore) {
497 INIT_LIST_HEAD(&vcore->runnable_threads); 442 INIT_LIST_HEAD(&vcore->runnable_threads);
498 spin_lock_init(&vcore->lock); 443 spin_lock_init(&vcore->lock);
444 init_waitqueue_head(&vcore->wq);
499 } 445 }
500 kvm->arch.vcores[core] = vcore; 446 kvm->arch.vcores[core] = vcore;
501 } 447 }
@@ -506,10 +452,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
506 452
507 spin_lock(&vcore->lock); 453 spin_lock(&vcore->lock);
508 ++vcore->num_threads; 454 ++vcore->num_threads;
509 ++vcore->n_blocked;
510 spin_unlock(&vcore->lock); 455 spin_unlock(&vcore->lock);
511 vcpu->arch.vcore = vcore; 456 vcpu->arch.vcore = vcore;
512 457
458 vcpu->arch.cpu_type = KVM_CPU_3S_64;
459 kvmppc_sanity_check(vcpu);
460
513 return vcpu; 461 return vcpu;
514 462
515free_vcpu: 463free_vcpu:
@@ -524,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
524 kfree(vcpu); 472 kfree(vcpu);
525} 473}
526 474
527static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) 475static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
528{ 476{
529 struct kvmppc_vcore *vc = vcpu->arch.vcore; 477 unsigned long dec_nsec, now;
530 478
531 spin_lock(&vc->lock); 479 now = get_tb();
532 vcpu->arch.state = KVMPPC_VCPU_BLOCKED; 480 if (now > vcpu->arch.dec_expires) {
533 ++vc->n_blocked; 481 /* decrementer has already gone negative */
534 if (vc->n_runnable > 0 && 482 kvmppc_core_queue_dec(vcpu);
535 vc->n_runnable + vc->n_blocked == vc->num_threads) { 483 kvmppc_core_deliver_interrupts(vcpu);
536 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, 484 return;
537 arch.run_list);
538 wake_up(&vcpu->arch.cpu_run);
539 } 485 }
540 spin_unlock(&vc->lock); 486 dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
487 / tb_ticks_per_sec;
488 hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
489 HRTIMER_MODE_REL);
490 vcpu->arch.timer_running = 1;
541} 491}
542 492
543static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) 493static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
544{ 494{
545 struct kvmppc_vcore *vc = vcpu->arch.vcore; 495 vcpu->arch.ceded = 0;
546 496 if (vcpu->arch.timer_running) {
547 spin_lock(&vc->lock); 497 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
548 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 498 vcpu->arch.timer_running = 0;
549 --vc->n_blocked; 499 }
550 spin_unlock(&vc->lock);
551} 500}
552 501
553extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); 502extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -562,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
562 return; 511 return;
563 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 512 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
564 --vc->n_runnable; 513 --vc->n_runnable;
514 ++vc->n_busy;
565 /* decrement the physical thread id of each following vcpu */ 515 /* decrement the physical thread id of each following vcpu */
566 v = vcpu; 516 v = vcpu;
567 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) 517 list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
@@ -575,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
575 struct paca_struct *tpaca; 525 struct paca_struct *tpaca;
576 struct kvmppc_vcore *vc = vcpu->arch.vcore; 526 struct kvmppc_vcore *vc = vcpu->arch.vcore;
577 527
528 if (vcpu->arch.timer_running) {
529 hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
530 vcpu->arch.timer_running = 0;
531 }
578 cpu = vc->pcpu + vcpu->arch.ptid; 532 cpu = vc->pcpu + vcpu->arch.ptid;
579 tpaca = &paca[cpu]; 533 tpaca = &paca[cpu];
580 tpaca->kvm_hstate.kvm_vcpu = vcpu; 534 tpaca->kvm_hstate.kvm_vcpu = vcpu;
581 tpaca->kvm_hstate.kvm_vcore = vc; 535 tpaca->kvm_hstate.kvm_vcore = vc;
536 tpaca->kvm_hstate.napping = 0;
537 vcpu->cpu = vc->pcpu;
582 smp_wmb(); 538 smp_wmb();
583#ifdef CONFIG_PPC_ICP_NATIVE 539#ifdef CONFIG_PPC_ICP_NATIVE
584 if (vcpu->arch.ptid) { 540 if (vcpu->arch.ptid) {
585 tpaca->cpu_start = 0x80; 541 tpaca->cpu_start = 0x80;
586 tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
587 wmb(); 542 wmb();
588 xics_wake_cpu(cpu); 543 xics_wake_cpu(cpu);
589 ++vc->n_woken; 544 ++vc->n_woken;
@@ -631,9 +586,10 @@ static int on_primary_thread(void)
631 */ 586 */
632static int kvmppc_run_core(struct kvmppc_vcore *vc) 587static int kvmppc_run_core(struct kvmppc_vcore *vc)
633{ 588{
634 struct kvm_vcpu *vcpu, *vnext; 589 struct kvm_vcpu *vcpu, *vcpu0, *vnext;
635 long ret; 590 long ret;
636 u64 now; 591 u64 now;
592 int ptid;
637 593
638 /* don't start if any threads have a signal pending */ 594 /* don't start if any threads have a signal pending */
639 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 595 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -652,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
652 goto out; 608 goto out;
653 } 609 }
654 610
611 /*
612 * Assign physical thread IDs, first to non-ceded vcpus
613 * and then to ceded ones.
614 */
615 ptid = 0;
616 vcpu0 = NULL;
617 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
618 if (!vcpu->arch.ceded) {
619 if (!ptid)
620 vcpu0 = vcpu;
621 vcpu->arch.ptid = ptid++;
622 }
623 }
624 if (!vcpu0)
625 return 0; /* nothing to run */
626 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
627 if (vcpu->arch.ceded)
628 vcpu->arch.ptid = ptid++;
629
655 vc->n_woken = 0; 630 vc->n_woken = 0;
656 vc->nap_count = 0; 631 vc->nap_count = 0;
657 vc->entry_exit_count = 0; 632 vc->entry_exit_count = 0;
658 vc->vcore_running = 1; 633 vc->vcore_state = VCORE_RUNNING;
659 vc->in_guest = 0; 634 vc->in_guest = 0;
660 vc->pcpu = smp_processor_id(); 635 vc->pcpu = smp_processor_id();
636 vc->napping_threads = 0;
661 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 637 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
662 kvmppc_start_thread(vcpu); 638 kvmppc_start_thread(vcpu);
663 vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
664 arch.run_list);
665 639
640 preempt_disable();
666 spin_unlock(&vc->lock); 641 spin_unlock(&vc->lock);
667 642
668 preempt_disable();
669 kvm_guest_enter(); 643 kvm_guest_enter();
670 __kvmppc_vcore_entry(NULL, vcpu); 644 __kvmppc_vcore_entry(NULL, vcpu0);
671 645
672 /* wait for secondary threads to finish writing their state to memory */
673 spin_lock(&vc->lock); 646 spin_lock(&vc->lock);
647 /* disable sending of IPIs on virtual external irqs */
648 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
649 vcpu->cpu = -1;
650 /* wait for secondary threads to finish writing their state to memory */
674 if (vc->nap_count < vc->n_woken) 651 if (vc->nap_count < vc->n_woken)
675 kvmppc_wait_for_nap(vc); 652 kvmppc_wait_for_nap(vc);
676 /* prevent other vcpu threads from doing kvmppc_start_thread() now */ 653 /* prevent other vcpu threads from doing kvmppc_start_thread() now */
677 vc->vcore_running = 2; 654 vc->vcore_state = VCORE_EXITING;
678 spin_unlock(&vc->lock); 655 spin_unlock(&vc->lock);
679 656
680 /* make sure updates to secondary vcpu structs are visible now */ 657 /* make sure updates to secondary vcpu structs are visible now */
@@ -690,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
690 if (now < vcpu->arch.dec_expires && 667 if (now < vcpu->arch.dec_expires &&
691 kvmppc_core_pending_dec(vcpu)) 668 kvmppc_core_pending_dec(vcpu))
692 kvmppc_core_dequeue_dec(vcpu); 669 kvmppc_core_dequeue_dec(vcpu);
693 if (!vcpu->arch.trap) { 670
694 if (signal_pending(vcpu->arch.run_task)) { 671 ret = RESUME_GUEST;
695 vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; 672 if (vcpu->arch.trap)
696 vcpu->arch.ret = -EINTR; 673 ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
697 } 674 vcpu->arch.run_task);
698 continue; /* didn't get to run */ 675
699 }
700 ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
701 vcpu->arch.run_task);
702 vcpu->arch.ret = ret; 676 vcpu->arch.ret = ret;
703 vcpu->arch.trap = 0; 677 vcpu->arch.trap = 0;
678
679 if (vcpu->arch.ceded) {
680 if (ret != RESUME_GUEST)
681 kvmppc_end_cede(vcpu);
682 else
683 kvmppc_set_timer(vcpu);
684 }
704 } 685 }
705 686
706 spin_lock(&vc->lock); 687 spin_lock(&vc->lock);
707 out: 688 out:
708 vc->vcore_running = 0; 689 vc->vcore_state = VCORE_INACTIVE;
709 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 690 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
710 arch.run_list) { 691 arch.run_list) {
711 if (vcpu->arch.ret != RESUME_GUEST) { 692 if (vcpu->arch.ret != RESUME_GUEST) {
@@ -717,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
717 return 1; 698 return 1;
718} 699}
719 700
720static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 701/*
702 * Wait for some other vcpu thread to execute us, and
703 * wake us up when we need to handle something in the host.
704 */
705static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
721{ 706{
722 int ptid;
723 int wait_state;
724 struct kvmppc_vcore *vc;
725 DEFINE_WAIT(wait); 707 DEFINE_WAIT(wait);
726 708
727 /* No need to go into the guest when all we do is going out */ 709 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
728 if (signal_pending(current)) { 710 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
729 kvm_run->exit_reason = KVM_EXIT_INTR; 711 schedule();
730 return -EINTR; 712 finish_wait(&vcpu->arch.cpu_run, &wait);
713}
714
715/*
716 * All the vcpus in this vcore are idle, so wait for a decrementer
717 * or external interrupt to one of the vcpus. vc->lock is held.
718 */
719static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
720{
721 DEFINE_WAIT(wait);
722 struct kvm_vcpu *v;
723 int all_idle = 1;
724
725 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
726 vc->vcore_state = VCORE_SLEEPING;
727 spin_unlock(&vc->lock);
728 list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
729 if (!v->arch.ceded || v->arch.pending_exceptions) {
730 all_idle = 0;
731 break;
732 }
731 } 733 }
734 if (all_idle)
735 schedule();
736 finish_wait(&vc->wq, &wait);
737 spin_lock(&vc->lock);
738 vc->vcore_state = VCORE_INACTIVE;
739}
732 740
733 /* On PPC970, check that we have an RMA region */ 741static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
734 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) 742{
735 return -EPERM; 743 int n_ceded;
744 int prev_state;
745 struct kvmppc_vcore *vc;
746 struct kvm_vcpu *v, *vn;
736 747
737 kvm_run->exit_reason = 0; 748 kvm_run->exit_reason = 0;
738 vcpu->arch.ret = RESUME_GUEST; 749 vcpu->arch.ret = RESUME_GUEST;
739 vcpu->arch.trap = 0; 750 vcpu->arch.trap = 0;
740 751
741 flush_fp_to_thread(current);
742 flush_altivec_to_thread(current);
743 flush_vsx_to_thread(current);
744
745 /* 752 /*
746 * Synchronize with other threads in this virtual core 753 * Synchronize with other threads in this virtual core
747 */ 754 */
748 vc = vcpu->arch.vcore; 755 vc = vcpu->arch.vcore;
749 spin_lock(&vc->lock); 756 spin_lock(&vc->lock);
750 /* This happens the first time this is called for a vcpu */ 757 vcpu->arch.ceded = 0;
751 if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
752 --vc->n_blocked;
753 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
754 ptid = vc->n_runnable;
755 vcpu->arch.run_task = current; 758 vcpu->arch.run_task = current;
756 vcpu->arch.kvm_run = kvm_run; 759 vcpu->arch.kvm_run = kvm_run;
757 vcpu->arch.ptid = ptid; 760 prev_state = vcpu->arch.state;
761 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
758 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); 762 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
759 ++vc->n_runnable; 763 ++vc->n_runnable;
760 764
761 wait_state = TASK_INTERRUPTIBLE; 765 /*
762 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 766 * This happens the first time this is called for a vcpu.
763 if (signal_pending(current)) { 767 * If the vcore is already running, we may be able to start
764 if (!vc->vcore_running) { 768 * this thread straight away and have it join in.
765 kvm_run->exit_reason = KVM_EXIT_INTR; 769 */
766 vcpu->arch.ret = -EINTR; 770 if (prev_state == KVMPPC_VCPU_STOPPED) {
767 break; 771 if (vc->vcore_state == VCORE_RUNNING &&
768 } 772 VCORE_EXIT_COUNT(vc) == 0) {
769 /* have to wait for vcore to stop executing guest */ 773 vcpu->arch.ptid = vc->n_runnable - 1;
770 wait_state = TASK_UNINTERRUPTIBLE; 774 kvmppc_start_thread(vcpu);
771 smp_send_reschedule(vc->pcpu);
772 } 775 }
773 776
774 if (!vc->vcore_running && 777 } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
775 vc->n_runnable + vc->n_blocked == vc->num_threads) { 778 --vc->n_busy;
776 /* we can run now */
777 if (kvmppc_run_core(vc))
778 continue;
779 }
780 779
781 if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) 780 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
782 kvmppc_start_thread(vcpu); 781 !signal_pending(current)) {
782 if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
783 spin_unlock(&vc->lock);
784 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
785 spin_lock(&vc->lock);
786 continue;
787 }
788 n_ceded = 0;
789 list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
790 n_ceded += v->arch.ceded;
791 if (n_ceded == vc->n_runnable)
792 kvmppc_vcore_blocked(vc);
793 else
794 kvmppc_run_core(vc);
795
796 list_for_each_entry_safe(v, vn, &vc->runnable_threads,
797 arch.run_list) {
798 kvmppc_core_deliver_interrupts(v);
799 if (signal_pending(v->arch.run_task)) {
800 kvmppc_remove_runnable(vc, v);
801 v->stat.signal_exits++;
802 v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
803 v->arch.ret = -EINTR;
804 wake_up(&v->arch.cpu_run);
805 }
806 }
807 }
783 808
784 /* wait for other threads to come in, or wait for vcore */ 809 if (signal_pending(current)) {
785 prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); 810 if (vc->vcore_state == VCORE_RUNNING ||
786 spin_unlock(&vc->lock); 811 vc->vcore_state == VCORE_EXITING) {
787 schedule(); 812 spin_unlock(&vc->lock);
788 finish_wait(&vcpu->arch.cpu_run, &wait); 813 kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
789 spin_lock(&vc->lock); 814 spin_lock(&vc->lock);
815 }
816 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
817 kvmppc_remove_runnable(vc, vcpu);
818 vcpu->stat.signal_exits++;
819 kvm_run->exit_reason = KVM_EXIT_INTR;
820 vcpu->arch.ret = -EINTR;
821 }
790 } 822 }
791 823
792 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
793 kvmppc_remove_runnable(vc, vcpu);
794 spin_unlock(&vc->lock); 824 spin_unlock(&vc->lock);
795
796 return vcpu->arch.ret; 825 return vcpu->arch.ret;
797} 826}
798 827
@@ -800,6 +829,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
800{ 829{
801 int r; 830 int r;
802 831
832 if (!vcpu->arch.sane) {
833 run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
834 return -EINVAL;
835 }
836
837 /* No need to go into the guest when all we'll do is come back out */
838 if (signal_pending(current)) {
839 run->exit_reason = KVM_EXIT_INTR;
840 return -EINTR;
841 }
842
843 /* On PPC970, check that we have an RMA region */
844 if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
845 return -EPERM;
846
847 flush_fp_to_thread(current);
848 flush_altivec_to_thread(current);
849 flush_vsx_to_thread(current);
850 vcpu->arch.wqp = &vcpu->arch.vcore->wq;
851
803 do { 852 do {
804 r = kvmppc_run_vcpu(run, vcpu); 853 r = kvmppc_run_vcpu(run, vcpu);
805 854
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fcfe6b055558..bacb0cfa3602 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -110,39 +110,6 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
110 return H_SUCCESS; 110 return H_SUCCESS;
111} 111}
112 112
113static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
114 unsigned long pte_index)
115{
116 unsigned long rb, va_low;
117
118 rb = (v & ~0x7fUL) << 16; /* AVA field */
119 va_low = pte_index >> 3;
120 if (v & HPTE_V_SECONDARY)
121 va_low = ~va_low;
122 /* xor vsid from AVA */
123 if (!(v & HPTE_V_1TB_SEG))
124 va_low ^= v >> 12;
125 else
126 va_low ^= v >> 24;
127 va_low &= 0x7ff;
128 if (v & HPTE_V_LARGE) {
129 rb |= 1; /* L field */
130 if (cpu_has_feature(CPU_FTR_ARCH_206) &&
131 (r & 0xff000)) {
132 /* non-16MB large page, must be 64k */
133 /* (masks depend on page size) */
134 rb |= 0x1000; /* page encoding in LP field */
135 rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
136 rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */
137 }
138 } else {
139 /* 4kB page */
140 rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */
141 }
142 rb |= (v >> 54) & 0x300; /* B field */
143 return rb;
144}
145
146#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 113#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
147 114
148static inline int try_lock_tlbie(unsigned int *lock) 115static inline int try_lock_tlbie(unsigned int *lock)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index de2950135e6e..f422231d9235 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -20,7 +20,10 @@
20#include <asm/ppc_asm.h> 20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
22#include <asm/reg.h> 22#include <asm/reg.h>
23#include <asm/mmu.h>
23#include <asm/page.h> 24#include <asm/page.h>
25#include <asm/ptrace.h>
26#include <asm/hvcall.h>
24#include <asm/asm-offsets.h> 27#include <asm/asm-offsets.h>
25#include <asm/exception-64s.h> 28#include <asm/exception-64s.h>
26 29
@@ -49,7 +52,7 @@ kvmppc_skip_Hinterrupt:
49 b . 52 b .
50 53
51/* 54/*
52 * Call kvmppc_handler_trampoline_enter in real mode. 55 * Call kvmppc_hv_entry in real mode.
53 * Must be called with interrupts hard-disabled. 56 * Must be called with interrupts hard-disabled.
54 * 57 *
55 * Input Registers: 58 * Input Registers:
@@ -89,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
89kvm_start_guest: 92kvm_start_guest:
90 ld r1,PACAEMERGSP(r13) 93 ld r1,PACAEMERGSP(r13)
91 subi r1,r1,STACK_FRAME_OVERHEAD 94 subi r1,r1,STACK_FRAME_OVERHEAD
95 ld r2,PACATOC(r13)
96
97 /* were we napping due to cede? */
98 lbz r0,HSTATE_NAPPING(r13)
99 cmpwi r0,0
100 bne kvm_end_cede
92 101
93 /* get vcpu pointer */ 102 /* get vcpu pointer */
94 ld r4, HSTATE_KVM_VCPU(r13) 103 ld r4, HSTATE_KVM_VCPU(r13)
@@ -276,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
276 cmpwi r0,0 285 cmpwi r0,0
277 beq 20b 286 beq 20b
278 287
279 /* Set LPCR. Set the MER bit if there is a pending external irq. */ 288 /* Set LPCR and RMOR. */
28010: ld r8,KVM_LPCR(r9) 28910: ld r8,KVM_LPCR(r9)
281 ld r0,VCPU_PENDING_EXC(r4) 290 mtspr SPRN_LPCR,r8
282 li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
283 oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
284 and. r0,r0,r7
285 beq 11f
286 ori r8,r8,LPCR_MER
28711: mtspr SPRN_LPCR,r8
288 ld r8,KVM_RMOR(r9) 291 ld r8,KVM_RMOR(r9)
289 mtspr SPRN_RMOR,r8 292 mtspr SPRN_RMOR,r8
290 isync 293 isync
@@ -448,19 +451,50 @@ toc_tlbie_lock:
448 mtctr r6 451 mtctr r6
449 mtxer r7 452 mtxer r7
450 453
451 /* Move SRR0 and SRR1 into the respective regs */ 454kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
452 ld r6, VCPU_SRR0(r4) 455 ld r6, VCPU_SRR0(r4)
453 ld r7, VCPU_SRR1(r4) 456 ld r7, VCPU_SRR1(r4)
454 mtspr SPRN_SRR0, r6
455 mtspr SPRN_SRR1, r7
456
457 ld r10, VCPU_PC(r4) 457 ld r10, VCPU_PC(r4)
458 ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */
458 459
459 ld r11, VCPU_MSR(r4) /* r10 = vcpu->arch.msr & ~MSR_HV */
460 rldicl r11, r11, 63 - MSR_HV_LG, 1 460 rldicl r11, r11, 63 - MSR_HV_LG, 1
461 rotldi r11, r11, 1 + MSR_HV_LG 461 rotldi r11, r11, 1 + MSR_HV_LG
462 ori r11, r11, MSR_ME 462 ori r11, r11, MSR_ME
463 463
464 /* Check if we can deliver an external or decrementer interrupt now */
465 ld r0,VCPU_PENDING_EXC(r4)
466 li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
467 oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
468 and r0,r0,r8
469 cmpdi cr1,r0,0
470 andi. r0,r11,MSR_EE
471 beq cr1,11f
472BEGIN_FTR_SECTION
473 mfspr r8,SPRN_LPCR
474 ori r8,r8,LPCR_MER
475 mtspr SPRN_LPCR,r8
476 isync
477END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
478 beq 5f
479 li r0,BOOK3S_INTERRUPT_EXTERNAL
48012: mr r6,r10
481 mr r10,r0
482 mr r7,r11
483 li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
484 rotldi r11,r11,63
485 b 5f
48611: beq 5f
487 mfspr r0,SPRN_DEC
488 cmpwi r0,0
489 li r0,BOOK3S_INTERRUPT_DECREMENTER
490 blt 12b
491
492 /* Move SRR0 and SRR1 into the respective regs */
4935: mtspr SPRN_SRR0, r6
494 mtspr SPRN_SRR1, r7
495 li r0,0
496 stb r0,VCPU_CEDED(r4) /* cancel cede */
497
464fast_guest_return: 498fast_guest_return:
465 mtspr SPRN_HSRR0,r10 499 mtspr SPRN_HSRR0,r10
466 mtspr SPRN_HSRR1,r11 500 mtspr SPRN_HSRR1,r11
@@ -574,21 +608,20 @@ kvmppc_interrupt:
574 /* See if this is something we can handle in real mode */ 608 /* See if this is something we can handle in real mode */
575 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL 609 cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
576 beq hcall_try_real_mode 610 beq hcall_try_real_mode
577hcall_real_cont:
578 611
579 /* Check for mediated interrupts (could be done earlier really ...) */ 612 /* Check for mediated interrupts (could be done earlier really ...) */
580BEGIN_FTR_SECTION 613BEGIN_FTR_SECTION
581 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL 614 cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL
582 bne+ 1f 615 bne+ 1f
583 ld r5,VCPU_KVM(r9)
584 ld r5,KVM_LPCR(r5)
585 andi. r0,r11,MSR_EE 616 andi. r0,r11,MSR_EE
586 beq 1f 617 beq 1f
618 mfspr r5,SPRN_LPCR
587 andi. r0,r5,LPCR_MER 619 andi. r0,r5,LPCR_MER
588 bne bounce_ext_interrupt 620 bne bounce_ext_interrupt
5891: 6211:
590END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 622END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
591 623
624hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
592 /* Save DEC */ 625 /* Save DEC */
593 mfspr r5,SPRN_DEC 626 mfspr r5,SPRN_DEC
594 mftb r6 627 mftb r6
@@ -682,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
682 slbia 715 slbia
683 ptesync 716 ptesync
684 717
685hdec_soon: 718hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */
686BEGIN_FTR_SECTION 719BEGIN_FTR_SECTION
687 b 32f 720 b 32f
688END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 721END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
@@ -700,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
700 addi r0,r3,0x100 733 addi r0,r3,0x100
701 stwcx. r0,0,r6 734 stwcx. r0,0,r6
702 bne 41b 735 bne 41b
736 lwsync
703 737
704 /* 738 /*
705 * At this point we have an interrupt that we have to pass 739 * At this point we have an interrupt that we have to pass
@@ -713,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
713 * interrupt, since the other threads will already be on their 747 * interrupt, since the other threads will already be on their
714 * way here in that case. 748 * way here in that case.
715 */ 749 */
750 cmpwi r3,0x100 /* Are we the first here? */
751 bge 43f
752 cmpwi r3,1 /* Are any other threads in the guest? */
753 ble 43f
716 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 754 cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
717 beq 40f 755 beq 40f
718 cmpwi r3,0x100 /* Are we the first here? */
719 bge 40f
720 cmpwi r3,1
721 ble 40f
722 li r0,0 756 li r0,0
723 mtspr SPRN_HDEC,r0 757 mtspr SPRN_HDEC,r0
72440: 75840:
759 /*
760 * Send an IPI to any napping threads, since an HDEC interrupt
761 * doesn't wake CPUs up from nap.
762 */
763 lwz r3,VCORE_NAPPING_THREADS(r5)
764 lwz r4,VCPU_PTID(r9)
765 li r0,1
766 sldi r0,r0,r4
767 andc. r3,r3,r0 /* no sense IPI'ing ourselves */
768 beq 43f
769 mulli r4,r4,PACA_SIZE /* get paca for thread 0 */
770 subf r6,r4,r13
77142: andi. r0,r3,1
772 beq 44f
773 ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
774 li r0,IPI_PRIORITY
775 li r7,XICS_QIRR
776 stbcix r0,r7,r8 /* trigger the IPI */
77744: srdi. r3,r3,1
778 addi r6,r6,PACA_SIZE
779 bne 42b
725 780
726 /* Secondary threads wait for primary to do partition switch */ 781 /* Secondary threads wait for primary to do partition switch */
727 ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ 78243: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */
728 ld r5,HSTATE_KVM_VCORE(r13) 783 ld r5,HSTATE_KVM_VCORE(r13)
729 lwz r3,VCPU_PTID(r9) 784 lwz r3,VCPU_PTID(r9)
730 cmpwi r3,0 785 cmpwi r3,0
@@ -1077,7 +1132,6 @@ hcall_try_real_mode:
1077hcall_real_fallback: 1132hcall_real_fallback:
1078 li r12,BOOK3S_INTERRUPT_SYSCALL 1133 li r12,BOOK3S_INTERRUPT_SYSCALL
1079 ld r9, HSTATE_KVM_VCPU(r13) 1134 ld r9, HSTATE_KVM_VCPU(r13)
1080 ld r11, VCPU_MSR(r9)
1081 1135
1082 b hcall_real_cont 1136 b hcall_real_cont
1083 1137
@@ -1139,7 +1193,7 @@ hcall_real_table:
1139 .long 0 /* 0xd4 */ 1193 .long 0 /* 0xd4 */
1140 .long 0 /* 0xd8 */ 1194 .long 0 /* 0xd8 */
1141 .long 0 /* 0xdc */ 1195 .long 0 /* 0xdc */
1142 .long 0 /* 0xe0 */ 1196 .long .kvmppc_h_cede - hcall_real_table
1143 .long 0 /* 0xe4 */ 1197 .long 0 /* 0xe4 */
1144 .long 0 /* 0xe8 */ 1198 .long 0 /* 0xe8 */
1145 .long 0 /* 0xec */ 1199 .long 0 /* 0xec */
@@ -1168,7 +1222,8 @@ bounce_ext_interrupt:
1168 mtspr SPRN_SRR0,r10 1222 mtspr SPRN_SRR0,r10
1169 mtspr SPRN_SRR1,r11 1223 mtspr SPRN_SRR1,r11
1170 li r10,BOOK3S_INTERRUPT_EXTERNAL 1224 li r10,BOOK3S_INTERRUPT_EXTERNAL
1171 LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); 1225 li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
1226 rotldi r11,r11,63
1172 b fast_guest_return 1227 b fast_guest_return
1173 1228
1174_GLOBAL(kvmppc_h_set_dabr) 1229_GLOBAL(kvmppc_h_set_dabr)
@@ -1177,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr)
1177 li r3,0 1232 li r3,0
1178 blr 1233 blr
1179 1234
1235_GLOBAL(kvmppc_h_cede)
1236 ori r11,r11,MSR_EE
1237 std r11,VCPU_MSR(r3)
1238 li r0,1
1239 stb r0,VCPU_CEDED(r3)
1240 sync /* order setting ceded vs. testing prodded */
1241 lbz r5,VCPU_PRODDED(r3)
1242 cmpwi r5,0
1243 bne 1f
1244 li r0,0 /* set trap to 0 to say hcall is handled */
1245 stw r0,VCPU_TRAP(r3)
1246 li r0,H_SUCCESS
1247 std r0,VCPU_GPR(r3)(r3)
1248BEGIN_FTR_SECTION
1249 b 2f /* just send it up to host on 970 */
1250END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
1251
1252 /*
1253 * Set our bit in the bitmask of napping threads unless all the
1254 * other threads are already napping, in which case we send this
1255 * up to the host.
1256 */
1257 ld r5,HSTATE_KVM_VCORE(r13)
1258 lwz r6,VCPU_PTID(r3)
1259 lwz r8,VCORE_ENTRY_EXIT(r5)
1260 clrldi r8,r8,56
1261 li r0,1
1262 sld r0,r0,r6
1263 addi r6,r5,VCORE_NAPPING_THREADS
126431: lwarx r4,0,r6
1265 or r4,r4,r0
1266 popcntw r7,r4
1267 cmpw r7,r8
1268 bge 2f
1269 stwcx. r4,0,r6
1270 bne 31b
1271 li r0,1
1272 stb r0,HSTATE_NAPPING(r13)
1273 /* order napping_threads update vs testing entry_exit_count */
1274 lwsync
1275 mr r4,r3
1276 lwz r7,VCORE_ENTRY_EXIT(r5)
1277 cmpwi r7,0x100
1278 bge 33f /* another thread already exiting */
1279
1280/*
1281 * Although not specifically required by the architecture, POWER7
1282 * preserves the following registers in nap mode, even if an SMT mode
1283 * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
1284 * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
1285 */
1286 /* Save non-volatile GPRs */
1287 std r14, VCPU_GPR(r14)(r3)
1288 std r15, VCPU_GPR(r15)(r3)
1289 std r16, VCPU_GPR(r16)(r3)
1290 std r17, VCPU_GPR(r17)(r3)
1291 std r18, VCPU_GPR(r18)(r3)
1292 std r19, VCPU_GPR(r19)(r3)
1293 std r20, VCPU_GPR(r20)(r3)
1294 std r21, VCPU_GPR(r21)(r3)
1295 std r22, VCPU_GPR(r22)(r3)
1296 std r23, VCPU_GPR(r23)(r3)
1297 std r24, VCPU_GPR(r24)(r3)
1298 std r25, VCPU_GPR(r25)(r3)
1299 std r26, VCPU_GPR(r26)(r3)
1300 std r27, VCPU_GPR(r27)(r3)
1301 std r28, VCPU_GPR(r28)(r3)
1302 std r29, VCPU_GPR(r29)(r3)
1303 std r30, VCPU_GPR(r30)(r3)
1304 std r31, VCPU_GPR(r31)(r3)
1305
1306 /* save FP state */
1307 bl .kvmppc_save_fp
1308
1309 /*
1310 * Take a nap until a decrementer or external interrupt occurs,
1311 * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
1312 */
1313 li r0,0x80
1314 stb r0,PACAPROCSTART(r13)
1315 mfspr r5,SPRN_LPCR
1316 ori r5,r5,LPCR_PECE0 | LPCR_PECE1
1317 mtspr SPRN_LPCR,r5
1318 isync
1319 li r0, 0
1320 std r0, HSTATE_SCRATCH0(r13)
1321 ptesync
1322 ld r0, HSTATE_SCRATCH0(r13)
13231: cmpd r0, r0
1324 bne 1b
1325 nap
1326 b .
1327
1328kvm_end_cede:
1329 /* Woken by external or decrementer interrupt */
1330 ld r1, HSTATE_HOST_R1(r13)
1331 ld r2, PACATOC(r13)
1332
1333 /* If we're a secondary thread and we got here by an IPI, ack it */
1334 ld r4,HSTATE_KVM_VCPU(r13)
1335 lwz r3,VCPU_PTID(r4)
1336 cmpwi r3,0
1337 beq 27f
1338 mfspr r3,SPRN_SRR1
1339 rlwinm r3,r3,44-31,0x7 /* extract wake reason field */
1340 cmpwi r3,4 /* was it an external interrupt? */
1341 bne 27f
1342 ld r5, HSTATE_XICS_PHYS(r13)
1343 li r0,0xff
1344 li r6,XICS_QIRR
1345 li r7,XICS_XIRR
1346 lwzcix r8,r5,r7 /* ack the interrupt */
1347 sync
1348 stbcix r0,r5,r6 /* clear it */
1349 stwcix r8,r5,r7 /* EOI it */
135027:
1351 /* load up FP state */
1352 bl kvmppc_load_fp
1353
1354 /* Load NV GPRS */
1355 ld r14, VCPU_GPR(r14)(r4)
1356 ld r15, VCPU_GPR(r15)(r4)
1357 ld r16, VCPU_GPR(r16)(r4)
1358 ld r17, VCPU_GPR(r17)(r4)
1359 ld r18, VCPU_GPR(r18)(r4)
1360 ld r19, VCPU_GPR(r19)(r4)
1361 ld r20, VCPU_GPR(r20)(r4)
1362 ld r21, VCPU_GPR(r21)(r4)
1363 ld r22, VCPU_GPR(r22)(r4)
1364 ld r23, VCPU_GPR(r23)(r4)
1365 ld r24, VCPU_GPR(r24)(r4)
1366 ld r25, VCPU_GPR(r25)(r4)
1367 ld r26, VCPU_GPR(r26)(r4)
1368 ld r27, VCPU_GPR(r27)(r4)
1369 ld r28, VCPU_GPR(r28)(r4)
1370 ld r29, VCPU_GPR(r29)(r4)
1371 ld r30, VCPU_GPR(r30)(r4)
1372 ld r31, VCPU_GPR(r31)(r4)
1373
1374 /* clear our bit in vcore->napping_threads */
137533: ld r5,HSTATE_KVM_VCORE(r13)
1376 lwz r3,VCPU_PTID(r4)
1377 li r0,1
1378 sld r0,r0,r3
1379 addi r6,r5,VCORE_NAPPING_THREADS
138032: lwarx r7,0,r6
1381 andc r7,r7,r0
1382 stwcx. r7,0,r6
1383 bne 32b
1384 li r0,0
1385 stb r0,HSTATE_NAPPING(r13)
1386
1387 /* see if any other thread is already exiting */
1388 lwz r0,VCORE_ENTRY_EXIT(r5)
1389 cmpwi r0,0x100
1390 blt kvmppc_cede_reentry /* if not go back to guest */
1391
1392 /* some threads are exiting, so go to the guest exit path */
1393 b hcall_real_fallback
1394
1395 /* cede when already previously prodded case */
13961: li r0,0
1397 stb r0,VCPU_PRODDED(r3)
1398 sync /* order testing prodded vs. clearing ceded */
1399 stb r0,VCPU_CEDED(r3)
1400 li r3,H_SUCCESS
1401 blr
1402
1403 /* we've ceded but we want to give control to the host */
14042: li r3,H_TOO_HARD
1405 blr
1406
1180secondary_too_late: 1407secondary_too_late:
1181 ld r5,HSTATE_KVM_VCORE(r13) 1408 ld r5,HSTATE_KVM_VCORE(r13)
1182 HMT_LOW 1409 HMT_LOW
@@ -1194,14 +1421,20 @@ secondary_too_late:
1194 slbmte r6,r5 1421 slbmte r6,r5
11951: addi r11,r11,16 14221: addi r11,r11,16
1196 .endr 1423 .endr
1197 b 50f
1198 1424
1199secondary_nap: 1425secondary_nap:
1200 /* Clear any pending IPI */ 1426 /* Clear any pending IPI - assume we're a secondary thread */
120150: ld r5, HSTATE_XICS_PHYS(r13) 1427 ld r5, HSTATE_XICS_PHYS(r13)
1428 li r7, XICS_XIRR
1429 lwzcix r3, r5, r7 /* ack any pending interrupt */
1430 rlwinm. r0, r3, 0, 0xffffff /* any pending? */
1431 beq 37f
1432 sync
1202 li r0, 0xff 1433 li r0, 0xff
1203 li r6, XICS_QIRR 1434 li r6, XICS_QIRR
1204 stbcix r0, r5, r6 1435 stbcix r0, r5, r6 /* clear the IPI */
1436 stwcix r3, r5, r7 /* EOI it */
143737: sync
1205 1438
1206 /* increment the nap count and then go to nap mode */ 1439 /* increment the nap count and then go to nap mode */
1207 ld r4, HSTATE_KVM_VCORE(r13) 1440 ld r4, HSTATE_KVM_VCORE(r13)
@@ -1211,13 +1444,12 @@ secondary_nap:
1211 addi r3, r3, 1 1444 addi r3, r3, 1
1212 stwcx. r3, 0, r4 1445 stwcx. r3, 0, r4
1213 bne 51b 1446 bne 51b
1214 isync
1215 1447
1448 li r3, LPCR_PECE0
1216 mfspr r4, SPRN_LPCR 1449 mfspr r4, SPRN_LPCR
1217 li r0, LPCR_PECE 1450 rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
1218 andc r4, r4, r0
1219 ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */
1220 mtspr SPRN_LPCR, r4 1451 mtspr SPRN_LPCR, r4
1452 isync
1221 li r0, 0 1453 li r0, 0
1222 std r0, HSTATE_SCRATCH0(r13) 1454 std r0, HSTATE_SCRATCH0(r13)
1223 ptesync 1455 ptesync
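The H_CEDE path above tracks which SMT threads have ceded in vcore->napping_threads: each thread atomically sets its own bit, and if the resulting popcount reaches the entry count kept in the low byte of VCORE_ENTRY_EXIT (i.e. every thread in the guest would be napping), the cede is passed up to the host instead. A rough user-space C restatement of that bookkeeping, with assumed names and GCC atomics standing in for the lwarx/stwcx. loop:

#include <stdbool.h>
#include <stdint.h>

struct vcore {
	uint32_t napping_threads;	/* bitmap, one bit per SMT thread */
	uint32_t entry_exit;		/* low byte: threads currently in the guest */
};

/* Returns true if this thread may nap, false if it is the last runnable
 * thread and the cede has to be handed to the host instead. */
static bool note_thread_napping(struct vcore *vc, unsigned int ptid)
{
	uint32_t old, new, entered = vc->entry_exit & 0xff;

	old = __atomic_load_n(&vc->napping_threads, __ATOMIC_RELAXED);
	do {
		new = old | (1u << ptid);
		if (__builtin_popcount(new) >= (int)entered)
			return false;	/* everyone else is napping/exiting */
	} while (!__atomic_compare_exchange_n(&vc->napping_threads, &old, new,
					      false, __ATOMIC_RELEASE,
					      __ATOMIC_RELAXED));
	return true;
}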
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index c54b0e30cf3f..0a8515a5c042 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -29,27 +29,11 @@
29#define ULONG_SIZE 8 29#define ULONG_SIZE 8
30#define FUNC(name) GLUE(.,name) 30#define FUNC(name) GLUE(.,name)
31 31
32#define GET_SHADOW_VCPU_R13
33
34#define DISABLE_INTERRUPTS \
35 mfmsr r0; \
36 rldicl r0,r0,48,1; \
37 rotldi r0,r0,16; \
38 mtmsrd r0,1; \
39
40#elif defined(CONFIG_PPC_BOOK3S_32) 32#elif defined(CONFIG_PPC_BOOK3S_32)
41 33
42#define ULONG_SIZE 4 34#define ULONG_SIZE 4
43#define FUNC(name) name 35#define FUNC(name) name
44 36
45#define GET_SHADOW_VCPU_R13 \
46 lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2)
47
48#define DISABLE_INTERRUPTS \
49 mfmsr r0; \
50 rlwinm r0,r0,0,17,15; \
51 mtmsr r0; \
52
53#endif /* CONFIG_PPC_BOOK3S_XX */ 37#endif /* CONFIG_PPC_BOOK3S_XX */
54 38
55 39
@@ -108,44 +92,17 @@ kvm_start_entry:
108 92
109kvm_start_lightweight: 93kvm_start_lightweight:
110 94
111 GET_SHADOW_VCPU_R13
112 PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4)
113 PPC_STL r3, HSTATE_VMHANDLER(r13)
114
115 PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
116
117 DISABLE_INTERRUPTS
118
119#ifdef CONFIG_PPC_BOOK3S_64 95#ifdef CONFIG_PPC_BOOK3S_64
120 /* Some guests may need to have dcbz set to 32 byte length.
121 *
122 * Usually we ensure that by patching the guest's instructions
123 * to trap on dcbz and emulate it in the hypervisor.
124 *
125 * If we can, we should tell the CPU to use 32 byte dcbz though,
126 * because that's a lot faster.
127 */
128
129 PPC_LL r3, VCPU_HFLAGS(r4) 96 PPC_LL r3, VCPU_HFLAGS(r4)
130 rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ 97 rldicl r3, r3, 0, 63 /* r3 &= 1 */
131 beq no_dcbz32_on 98 stb r3, HSTATE_RESTORE_HID5(r13)
132
133 mfspr r3,SPRN_HID5
134 ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */
135 mtspr SPRN_HID5,r3
136
137no_dcbz32_on:
138
139#endif /* CONFIG_PPC_BOOK3S_64 */ 99#endif /* CONFIG_PPC_BOOK3S_64 */
140 100
141 PPC_LL r6, VCPU_RMCALL(r4) 101 PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
142 mtctr r6
143
144 PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4)
145 LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR))
146 102
147 /* Jump to segment patching handler and into our guest */ 103 /* Jump to segment patching handler and into our guest */
148 bctr 104 bl FUNC(kvmppc_entry_trampoline)
105 nop
149 106
150/* 107/*
151 * This is the handler in module memory. It gets jumped at from the 108 * This is the handler in module memory. It gets jumped at from the
@@ -170,21 +127,6 @@ kvmppc_handler_highmem:
170 /* R7 = vcpu */ 127 /* R7 = vcpu */
171 PPC_LL r7, GPR4(r1) 128 PPC_LL r7, GPR4(r1)
172 129
173#ifdef CONFIG_PPC_BOOK3S_64
174
175 PPC_LL r5, VCPU_HFLAGS(r7)
176 rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */
177 beq no_dcbz32_off
178
179 li r4, 0
180 mfspr r5,SPRN_HID5
181 rldimi r5,r4,6,56
182 mtspr SPRN_HID5,r5
183
184no_dcbz32_off:
185
186#endif /* CONFIG_PPC_BOOK3S_64 */
187
188 PPC_STL r14, VCPU_GPR(r14)(r7) 130 PPC_STL r14, VCPU_GPR(r14)(r7)
189 PPC_STL r15, VCPU_GPR(r15)(r7) 131 PPC_STL r15, VCPU_GPR(r15)(r7)
190 PPC_STL r16, VCPU_GPR(r16)(r7) 132 PPC_STL r16, VCPU_GPR(r16)(r7)
@@ -204,67 +146,6 @@ no_dcbz32_off:
204 PPC_STL r30, VCPU_GPR(r30)(r7) 146 PPC_STL r30, VCPU_GPR(r30)(r7)
205 PPC_STL r31, VCPU_GPR(r31)(r7) 147 PPC_STL r31, VCPU_GPR(r31)(r7)
206 148
207 /* Restore host msr -> SRR1 */
208 PPC_LL r6, VCPU_HOST_MSR(r7)
209
210 /*
211 * For some interrupts, we need to call the real Linux
212 * handler, so it can do work for us. This has to happen
213 * as if the interrupt arrived from the kernel though,
214 * so let's fake it here where most state is restored.
215 *
216 * Call Linux for hardware interrupts/decrementer
217 * r3 = address of interrupt handler (exit reason)
218 */
219
220 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
221 beq call_linux_handler
222 cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
223 beq call_linux_handler
224 cmpwi r12, BOOK3S_INTERRUPT_PERFMON
225 beq call_linux_handler
226
227 /* Back to EE=1 */
228 mtmsr r6
229 sync
230 b kvm_return_point
231
232call_linux_handler:
233
234 /*
235 * If we land here we need to jump back to the handler we
236 * came from.
237 *
238 * We have a page that we can access from real mode, so let's
239 * jump back to that and use it as a trampoline to get back into the
240 * interrupt handler!
241 *
242 * R3 still contains the exit code,
243 * R5 VCPU_HOST_RETIP and
244 * R6 VCPU_HOST_MSR
245 */
246
247 /* Restore host IP -> SRR0 */
248 PPC_LL r5, VCPU_HOST_RETIP(r7)
249
250 /* XXX Better move to a safe function?
251 * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */
252
253 mtlr r12
254
255 PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7)
256 mtsrr0 r4
257 LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
258 mtsrr1 r3
259
260 RFI
261
262.global kvm_return_point
263kvm_return_point:
264
265 /* Jump back to lightweight entry if we're supposed to */
266 /* go back into the guest */
267
268 /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ 149 /* Pass the exit number as 3rd argument to kvmppc_handle_exit */
269 mr r5, r12 150 mr r5, r12
270 151
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 0c0d3f274437..d417511abfb1 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -150,16 +150,22 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
150#ifdef CONFIG_PPC_BOOK3S_64 150#ifdef CONFIG_PPC_BOOK3S_64
151 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { 151 if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
152 kvmppc_mmu_book3s_64_init(vcpu); 152 kvmppc_mmu_book3s_64_init(vcpu);
153 to_book3s(vcpu)->hior = 0xfff00000; 153 if (!to_book3s(vcpu)->hior_sregs)
154 to_book3s(vcpu)->hior = 0xfff00000;
154 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; 155 to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
156 vcpu->arch.cpu_type = KVM_CPU_3S_64;
155 } else 157 } else
156#endif 158#endif
157 { 159 {
158 kvmppc_mmu_book3s_32_init(vcpu); 160 kvmppc_mmu_book3s_32_init(vcpu);
159 to_book3s(vcpu)->hior = 0; 161 if (!to_book3s(vcpu)->hior_sregs)
162 to_book3s(vcpu)->hior = 0;
160 to_book3s(vcpu)->msr_mask = 0xffffffffULL; 163 to_book3s(vcpu)->msr_mask = 0xffffffffULL;
164 vcpu->arch.cpu_type = KVM_CPU_3S_32;
161 } 165 }
162 166
167 kvmppc_sanity_check(vcpu);
168
163 /* If we are in hypervisor level on 970, we can tell the CPU to 169 /* If we are in hypervisor level on 970, we can tell the CPU to
164 * treat DCBZ as 32 bytes store */ 170 * treat DCBZ as 32 bytes store */
165 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; 171 vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
@@ -646,7 +652,27 @@ program_interrupt:
646 break; 652 break;
647 } 653 }
648 case BOOK3S_INTERRUPT_SYSCALL: 654 case BOOK3S_INTERRUPT_SYSCALL:
649 if (vcpu->arch.osi_enabled && 655 if (vcpu->arch.papr_enabled &&
656 (kvmppc_get_last_inst(vcpu) == 0x44000022) &&
657 !(vcpu->arch.shared->msr & MSR_PR)) {
658 /* SC 1 papr hypercalls */
659 ulong cmd = kvmppc_get_gpr(vcpu, 3);
660 int i;
661
662 if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
663 r = RESUME_GUEST;
664 break;
665 }
666
667 run->papr_hcall.nr = cmd;
668 for (i = 0; i < 9; ++i) {
669 ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
670 run->papr_hcall.args[i] = gpr;
671 }
672 run->exit_reason = KVM_EXIT_PAPR_HCALL;
673 vcpu->arch.hcall_needed = 1;
674 r = RESUME_HOST;
675 } else if (vcpu->arch.osi_enabled &&
650 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && 676 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
651 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { 677 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
652 /* MOL hypercalls */ 678 /* MOL hypercalls */
@@ -770,6 +796,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
770 } 796 }
771 } 797 }
772 798
799 if (sregs->u.s.flags & KVM_SREGS_S_HIOR)
800 sregs->u.s.hior = to_book3s(vcpu)->hior;
801
773 return 0; 802 return 0;
774} 803}
775 804
@@ -806,6 +835,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
806 /* Flush the MMU after messing with the segments */ 835 /* Flush the MMU after messing with the segments */
807 kvmppc_mmu_pte_flush(vcpu, 0, 0); 836 kvmppc_mmu_pte_flush(vcpu, 0, 0);
808 837
838 if (sregs->u.s.flags & KVM_SREGS_S_HIOR) {
839 to_book3s(vcpu)->hior_sregs = true;
840 to_book3s(vcpu)->hior = sregs->u.s.hior;
841 }
842
809 return 0; 843 return 0;
810} 844}
811 845
@@ -841,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
841 if (!p) 875 if (!p)
842 goto uninit_vcpu; 876 goto uninit_vcpu;
843 877
844 vcpu->arch.host_retip = kvm_return_point;
845 vcpu->arch.host_msr = mfmsr();
846#ifdef CONFIG_PPC_BOOK3S_64 878#ifdef CONFIG_PPC_BOOK3S_64
847 /* default to book3s_64 (970fx) */ 879 /* default to book3s_64 (970fx) */
848 vcpu->arch.pvr = 0x3C0301; 880 vcpu->arch.pvr = 0x3C0301;
@@ -853,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
853 kvmppc_set_pvr(vcpu, vcpu->arch.pvr); 885 kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
854 vcpu->arch.slb_nr = 64; 886 vcpu->arch.slb_nr = 64;
855 887
856 /* remember where some real-mode handlers are */
857 vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline);
858 vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter);
859 vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
860#ifdef CONFIG_PPC_BOOK3S_64
861 vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
862#else
863 vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
864#endif
865
866 vcpu->arch.shadow_msr = MSR_USER64; 888 vcpu->arch.shadow_msr = MSR_USER64;
867 889
868 err = kvmppc_mmu_init(vcpu); 890 err = kvmppc_mmu_init(vcpu);
@@ -908,6 +930,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
908#endif 930#endif
909 ulong ext_msr; 931 ulong ext_msr;
910 932
933 /* Check if we can run the vcpu at all */
934 if (!vcpu->arch.sane) {
935 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
936 return -EINVAL;
937 }
938
911 /* No need to go into the guest when all we do is going out */ 939 /* No need to go into the guest when all we do is going out */
912 if (signal_pending(current)) { 940 if (signal_pending(current)) {
913 kvm_run->exit_reason = KVM_EXIT_INTR; 941 kvm_run->exit_reason = KVM_EXIT_INTR;
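Hypercalls that the in-kernel kvmppc_h_pr() handler declines are forwarded to user space through the new KVM_EXIT_PAPR_HCALL exit, with the hypercall number and up to nine arguments placed in kvm_run. A hedged sketch of the user-space side, assuming headers from a kernel carrying this series; the H_PUT_TERM_CHAR case and its handler are purely illustrative, only the kvm_run fields follow the interface added here:

#include <linux/kvm.h>

#define H_SUCCESS	0
#define H_FUNCTION	(-2)	/* hypercall not implemented */

/* Made-up example handler; a real VMM dispatches on the full hcall table. */
static __u64 handle_h_put_term_char(__u64 *args)
{
	(void)args;
	return H_SUCCESS;
}

static void handle_papr_hcall(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_PAPR_HCALL)
		return;

	switch (run->papr_hcall.nr) {
	case 0x58:	/* H_PUT_TERM_CHAR, as an illustration */
		run->papr_hcall.ret = handle_h_put_term_char(run->papr_hcall.args);
		break;
	default:
		run->papr_hcall.ret = (__u64)H_FUNCTION;
		break;
	}
	/* The next KVM_RUN is expected to copy papr_hcall.ret into guest r3. */
}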
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
new file mode 100644
index 000000000000..b9589324797b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2011. Freescale Inc. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 * Paul Mackerras <paulus@samba.org>
7 *
8 * Description:
9 *
10 * Hypercall handling for running PAPR guests in PR KVM on Book 3S
11 * processors.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License, version 2, as
15 * published by the Free Software Foundation.
16 */
17
18#include <asm/uaccess.h>
19#include <asm/kvm_ppc.h>
20#include <asm/kvm_book3s.h>
21
22static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
23{
24 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
25 unsigned long pteg_addr;
26
27 pte_index <<= 4;
28 pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
29 pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
30 pteg_addr |= pte_index;
31
32 return pteg_addr;
33}
34
35static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
36{
37 long flags = kvmppc_get_gpr(vcpu, 4);
38 long pte_index = kvmppc_get_gpr(vcpu, 5);
39 unsigned long pteg[2 * 8];
40 unsigned long pteg_addr, i, *hpte;
41
42 pte_index &= ~7UL;
43 pteg_addr = get_pteg_addr(vcpu, pte_index);
44
45 copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
46 hpte = pteg;
47
48 if (likely((flags & H_EXACT) == 0)) {
49 pte_index &= ~7UL;
50 for (i = 0; ; ++i) {
51 if (i == 8)
52 return H_PTEG_FULL;
53 if ((*hpte & HPTE_V_VALID) == 0)
54 break;
55 hpte += 2;
56 }
57 } else {
58 i = kvmppc_get_gpr(vcpu, 5) & 7UL;
59 hpte += i * 2;
60 }
61
62 hpte[0] = kvmppc_get_gpr(vcpu, 6);
63 hpte[1] = kvmppc_get_gpr(vcpu, 7);
64 copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
65 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
66 kvmppc_set_gpr(vcpu, 4, pte_index | i);
67
68 return EMULATE_DONE;
69}
70
71static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
72{
 73	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
74 unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
75 unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
76 unsigned long v = 0, pteg, rb;
77 unsigned long pte[2];
78
79 pteg = get_pteg_addr(vcpu, pte_index);
80 copy_from_user(pte, (void __user *)pteg, sizeof(pte));
81
82 if ((pte[0] & HPTE_V_VALID) == 0 ||
83 ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
84 ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) {
85 kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
86 return EMULATE_DONE;
87 }
88
89 copy_to_user((void __user *)pteg, &v, sizeof(v));
90
91 rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
92 vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
93
94 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
95 kvmppc_set_gpr(vcpu, 4, pte[0]);
96 kvmppc_set_gpr(vcpu, 5, pte[1]);
97
98 return EMULATE_DONE;
99}
100
101static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
102{
103 unsigned long flags = kvmppc_get_gpr(vcpu, 4);
104 unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
105 unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
106 unsigned long rb, pteg, r, v;
107 unsigned long pte[2];
108
109 pteg = get_pteg_addr(vcpu, pte_index);
110 copy_from_user(pte, (void __user *)pteg, sizeof(pte));
111
112 if ((pte[0] & HPTE_V_VALID) == 0 ||
113 ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) {
114 kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND);
115 return EMULATE_DONE;
116 }
117
118 v = pte[0];
119 r = pte[1];
120 r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
121 HPTE_R_KEY_LO);
122 r |= (flags << 55) & HPTE_R_PP0;
123 r |= (flags << 48) & HPTE_R_KEY_HI;
124 r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
125
126 pte[1] = r;
127
128 rb = compute_tlbie_rb(v, r, pte_index);
129 vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
130 copy_to_user((void __user *)pteg, pte, sizeof(pte));
131
132 kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
133
134 return EMULATE_DONE;
135}
136
137int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
138{
139 switch (cmd) {
140 case H_ENTER:
141 return kvmppc_h_pr_enter(vcpu);
142 case H_REMOVE:
143 return kvmppc_h_pr_remove(vcpu);
144 case H_PROTECT:
145 return kvmppc_h_pr_protect(vcpu);
146 case H_BULK_REMOVE:
147 /* We just flush all PTEs, so user space can
148 handle the HPT modifications */
149 kvmppc_mmu_pte_flush(vcpu, 0, 0);
150 break;
151 case H_CEDE:
152 kvm_vcpu_block(vcpu);
153 vcpu->stat.halt_wakeup++;
154 return EMULATE_DONE;
155 }
156
157 return EMULATE_FAIL;
158}
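get_pteg_addr() above turns SDR1 plus a PTE index into the real address of a PTE group: the high SDR1 bits give the hashed page table origin and the low 5 bits its encoded size, which bounds the group offset. A stand-alone restatement of that arithmetic (same masking expression, done with a 64-bit shift; the values in main() are arbitrary):

#include <stdio.h>

static unsigned long pteg_addr(unsigned long sdr1, long pte_index)
{
	unsigned long htaborg  = sdr1 & 0xfffffffffffc0000UL;	/* HTAB origin */
	unsigned long htabsize = sdr1 & 0x1f;			/* encoded size */
	unsigned long offset   = (unsigned long)pte_index << 4;	/* 16 bytes per PTE */

	/* Clamp the group offset to the table size encoded in SDR1. */
	offset &= (((1UL << (htabsize + 11)) - 1) << 7) | 0x70;
	return htaborg | offset;
}

int main(void)
{
	unsigned long sdr1 = 0x0000000001000000UL | 0x03;	/* arbitrary example */

	printf("PTEG for index 0x123: 0x%lx\n", pteg_addr(sdr1, 0x123));
	return 0;
}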
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index c1f877c4a884..34187585c507 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -20,6 +20,7 @@
20#include <asm/ppc_asm.h> 20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h> 21#include <asm/kvm_asm.h>
22#include <asm/reg.h> 22#include <asm/reg.h>
23#include <asm/mmu.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
25 26
@@ -35,10 +36,10 @@
35 36
36#if defined(CONFIG_PPC_BOOK3S_64) 37#if defined(CONFIG_PPC_BOOK3S_64)
37 38
38#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg)
39#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR)
40#define FUNC(name) GLUE(.,name) 39#define FUNC(name) GLUE(.,name)
40#define MTMSR_EERI(reg) mtmsrd (reg),1
41 41
42 .globl kvmppc_skip_interrupt
42kvmppc_skip_interrupt: 43kvmppc_skip_interrupt:
43 /* 44 /*
44 * Here all GPRs are unchanged from when the interrupt happened 45 * Here all GPRs are unchanged from when the interrupt happened
@@ -51,6 +52,7 @@ kvmppc_skip_interrupt:
51 rfid 52 rfid
52 b . 53 b .
53 54
55 .globl kvmppc_skip_Hinterrupt
54kvmppc_skip_Hinterrupt: 56kvmppc_skip_Hinterrupt:
55 /* 57 /*
56 * Here all GPRs are unchanged from when the interrupt happened 58 * Here all GPRs are unchanged from when the interrupt happened
@@ -65,8 +67,8 @@ kvmppc_skip_Hinterrupt:
65 67
66#elif defined(CONFIG_PPC_BOOK3S_32) 68#elif defined(CONFIG_PPC_BOOK3S_32)
67 69
68#define MSR_NOIRQ MSR_KERNEL
69#define FUNC(name) name 70#define FUNC(name) name
71#define MTMSR_EERI(reg) mtmsr (reg)
70 72
71.macro INTERRUPT_TRAMPOLINE intno 73.macro INTERRUPT_TRAMPOLINE intno
72 74
@@ -167,40 +169,24 @@ kvmppc_handler_skip_ins:
167#endif 169#endif
168 170
169/* 171/*
170 * This trampoline brings us back to a real mode handler 172 * Call kvmppc_handler_trampoline_enter in real mode
171 *
172 * Input Registers:
173 *
174 * R5 = SRR0
175 * R6 = SRR1
176 * LR = real-mode IP
177 * 173 *
174 * On entry, r4 contains the guest shadow MSR
178 */ 175 */
179.global kvmppc_handler_lowmem_trampoline 176_GLOBAL(kvmppc_entry_trampoline)
180kvmppc_handler_lowmem_trampoline: 177 mfmsr r5
181 178 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
182 mtsrr0 r5 179 toreal(r7)
180
181 li r9, MSR_RI
182 ori r9, r9, MSR_EE
183 andc r9, r5, r9 /* Clear EE and RI in MSR value */
184 li r6, MSR_IR | MSR_DR
185 ori r6, r6, MSR_EE
186 andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */
187 MTMSR_EERI(r9) /* Clear EE and RI in MSR */
188 mtsrr0 r7 /* before we set srr0/1 */
183 mtsrr1 r6 189 mtsrr1 r6
184 blr
185kvmppc_handler_lowmem_trampoline_end:
186
187/*
188 * Call a function in real mode
189 *
190 * Input Registers:
191 *
192 * R3 = function
193 * R4 = MSR
194 * R5 = scratch register
195 *
196 */
197_GLOBAL(kvmppc_rmcall)
198 LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ)
199 mtmsr r5 /* Disable relocation and interrupts, so mtsrr
200 doesn't get interrupted */
201 sync
202 mtsrr0 r3
203 mtsrr1 r4
204 RFI 190 RFI
205 191
206#if defined(CONFIG_PPC_BOOK3S_32) 192#if defined(CONFIG_PPC_BOOK3S_32)
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index aed32e517212..0676ae249b9f 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -23,6 +23,7 @@
23 23
24#define GET_SHADOW_VCPU(reg) \ 24#define GET_SHADOW_VCPU(reg) \
25 mr reg, r13 25 mr reg, r13
26#define MTMSR_EERI(reg) mtmsrd (reg),1
26 27
27#elif defined(CONFIG_PPC_BOOK3S_32) 28#elif defined(CONFIG_PPC_BOOK3S_32)
28 29
@@ -30,6 +31,7 @@
30 tophys(reg, r2); \ 31 tophys(reg, r2); \
31 lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ 32 lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \
32 tophys(reg, reg) 33 tophys(reg, reg)
34#define MTMSR_EERI(reg) mtmsr (reg)
33 35
34#endif 36#endif
35 37
@@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter:
57 /* Required state: 59 /* Required state:
58 * 60 *
59 * MSR = ~IR|DR 61 * MSR = ~IR|DR
60 * R13 = PACA
61 * R1 = host R1 62 * R1 = host R1
62 * R2 = host R2 63 * R2 = host R2
63 * R10 = guest MSR 64 * R4 = guest shadow MSR
65 * R5 = normal host MSR
66 * R6 = current host MSR (EE, IR, DR off)
67 * LR = highmem guest exit code
64 * all other volatile GPRS = free 68 * all other volatile GPRS = free
65 * SVCPU[CR] = guest CR 69 * SVCPU[CR] = guest CR
66 * SVCPU[XER] = guest XER 70 * SVCPU[XER] = guest XER
@@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter:
71 /* r3 = shadow vcpu */ 75 /* r3 = shadow vcpu */
72 GET_SHADOW_VCPU(r3) 76 GET_SHADOW_VCPU(r3)
73 77
78 /* Save guest exit handler address and MSR */
79 mflr r0
80 PPC_STL r0, HSTATE_VMHANDLER(r3)
81 PPC_STL r5, HSTATE_HOST_MSR(r3)
82
74 /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ 83 /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
75 PPC_STL r1, HSTATE_HOST_R1(r3) 84 PPC_STL r1, HSTATE_HOST_R1(r3)
76 PPC_STL r2, HSTATE_HOST_R2(r3) 85 PPC_STL r2, HSTATE_HOST_R2(r3)
77 86
78 /* Move SRR0 and SRR1 into the respective regs */
79 PPC_LL r9, SVCPU_PC(r3)
80 mtsrr0 r9
81 mtsrr1 r10
82
83 /* Activate guest mode, so faults get handled by KVM */ 87 /* Activate guest mode, so faults get handled by KVM */
84 li r11, KVM_GUEST_MODE_GUEST 88 li r11, KVM_GUEST_MODE_GUEST
85 stb r11, HSTATE_IN_GUEST(r3) 89 stb r11, HSTATE_IN_GUEST(r3)
@@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter:
87 /* Switch to guest segment. This is subarch specific. */ 91 /* Switch to guest segment. This is subarch specific. */
88 LOAD_GUEST_SEGMENTS 92 LOAD_GUEST_SEGMENTS
89 93
94#ifdef CONFIG_PPC_BOOK3S_64
95 /* Some guests may need to have dcbz set to 32 byte length.
96 *
97 * Usually we ensure that by patching the guest's instructions
98 * to trap on dcbz and emulate it in the hypervisor.
99 *
100 * If we can, we should tell the CPU to use 32 byte dcbz though,
101 * because that's a lot faster.
102 */
103 lbz r0, HSTATE_RESTORE_HID5(r3)
104 cmpwi r0, 0
105 beq no_dcbz32_on
106
107 mfspr r0,SPRN_HID5
108 ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */
109 mtspr SPRN_HID5,r0
110no_dcbz32_on:
111
112#endif /* CONFIG_PPC_BOOK3S_64 */
113
90 /* Enter guest */ 114 /* Enter guest */
91 115
92 PPC_LL r4, SVCPU_CTR(r3) 116 PPC_LL r8, SVCPU_CTR(r3)
93 PPC_LL r5, SVCPU_LR(r3) 117 PPC_LL r9, SVCPU_LR(r3)
94 lwz r6, SVCPU_CR(r3) 118 lwz r10, SVCPU_CR(r3)
95 lwz r7, SVCPU_XER(r3) 119 lwz r11, SVCPU_XER(r3)
120
121 mtctr r8
122 mtlr r9
123 mtcr r10
124 mtxer r11
96 125
97 mtctr r4 126 /* Move SRR0 and SRR1 into the respective regs */
98 mtlr r5 127 PPC_LL r9, SVCPU_PC(r3)
99 mtcr r6 128 /* First clear RI in our current MSR value */
100 mtxer r7 129 li r0, MSR_RI
130 andc r6, r6, r0
131 MTMSR_EERI(r6)
132 mtsrr0 r9
133 mtsrr1 r4
101 134
102 PPC_LL r0, SVCPU_R0(r3) 135 PPC_LL r0, SVCPU_R0(r3)
103 PPC_LL r1, SVCPU_R1(r3) 136 PPC_LL r1, SVCPU_R1(r3)
@@ -213,11 +246,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
213 beq ld_last_inst 246 beq ld_last_inst
214 cmpwi r12, BOOK3S_INTERRUPT_PROGRAM 247 cmpwi r12, BOOK3S_INTERRUPT_PROGRAM
215 beq ld_last_inst 248 beq ld_last_inst
249 cmpwi r12, BOOK3S_INTERRUPT_SYSCALL
250 beq ld_last_prev_inst
216 cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT 251 cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT
217 beq- ld_last_inst 252 beq- ld_last_inst
218 253
219 b no_ld_last_inst 254 b no_ld_last_inst
220 255
256ld_last_prev_inst:
257 addi r3, r3, -4
258
221ld_last_inst: 259ld_last_inst:
222 /* Save off the guest instruction we're at */ 260 /* Save off the guest instruction we're at */
223 261
@@ -254,6 +292,43 @@ no_ld_last_inst:
254 /* Switch back to host MMU */ 292 /* Switch back to host MMU */
255 LOAD_HOST_SEGMENTS 293 LOAD_HOST_SEGMENTS
256 294
295#ifdef CONFIG_PPC_BOOK3S_64
296
297 lbz r5, HSTATE_RESTORE_HID5(r13)
298 cmpwi r5, 0
299 beq no_dcbz32_off
300
301 li r4, 0
302 mfspr r5,SPRN_HID5
303 rldimi r5,r4,6,56
304 mtspr SPRN_HID5,r5
305
306no_dcbz32_off:
307
308#endif /* CONFIG_PPC_BOOK3S_64 */
309
310 /*
311 * For some interrupts, we need to call the real Linux
312 * handler, so it can do work for us. This has to happen
313 * as if the interrupt arrived from the kernel though,
314 * so let's fake it here where most state is restored.
315 *
316 * Having set up SRR0/1 with the address where we want
317 * to continue with relocation on (potentially in module
318 * space), we either just go straight there with rfi[d],
319 * or we jump to an interrupt handler with bctr if there
320 * is an interrupt to be handled first. In the latter
321 * case, the rfi[d] at the end of the interrupt handler
322 * will get us back to where we want to continue.
323 */
324
325 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
326 beq 1f
327 cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER
328 beq 1f
329 cmpwi r12, BOOK3S_INTERRUPT_PERFMON
3301: mtctr r12
331
257 /* Register usage at this point: 332 /* Register usage at this point:
258 * 333 *
259 * R1 = host R1 334 * R1 = host R1
@@ -264,13 +339,15 @@ no_ld_last_inst:
264 * 339 *
265 */ 340 */
266 341
267 /* RFI into the highmem handler */ 342 PPC_LL r6, HSTATE_HOST_MSR(r13)
268 mfmsr r7
269 ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */
270 mtsrr1 r7
271 /* Load highmem handler address */
272 PPC_LL r8, HSTATE_VMHANDLER(r13) 343 PPC_LL r8, HSTATE_VMHANDLER(r13)
344
345 /* Restore host msr -> SRR1 */
346 mtsrr1 r6
347 /* Load highmem handler address */
273 mtsrr0 r8 348 mtsrr0 r8
274 349
350 /* RFI into the highmem handler, or jump to interrupt handler */
351 beqctr
275 RFI 352 RFI
276kvmppc_handler_trampoline_exit_end: 353kvmppc_handler_trampoline_exit_end:
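The new tail of the exit trampoline picks between two ways of leaving real mode: external, decrementer and performance-monitor interrupts are bounced into the host's own vector with beqctr (the handler's rfid then resumes the highmem handler), everything else goes straight back via RFI. Restated as plain C control flow; the stubs and vector values below are placeholders for illustration, not kernel code:

#include <stdio.h>

enum { INT_EXTERNAL = 0x500, INT_DECREMENTER = 0x900, INT_PERFMON = 0xf00 };

static void call_host_irq_handler(int trap) { printf("bounce via host handler 0x%x\n", trap); }
static void return_to_highmem_handler(void) { printf("return straight to highmem handler\n"); }

static void leave_guest(int trap)
{
	switch (trap) {
	case INT_EXTERNAL:
	case INT_DECREMENTER:
	case INT_PERFMON:
		call_host_irq_handler(trap);	/* its rfid resumes us afterwards */
		break;
	default:
		return_to_highmem_handler();
		break;
	}
}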
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee45fa01220e..bb6c988f010a 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
316{ 316{
317 int ret; 317 int ret;
318 318
319 if (!vcpu->arch.sane) {
320 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
321 return -EINVAL;
322 }
323
319 local_irq_disable(); 324 local_irq_disable();
320 kvm_guest_enter(); 325 kvm_guest_enter();
321 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 326 ret = __kvmppc_vcpu_run(kvm_run, vcpu);
@@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
618int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 623int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
619{ 624{
620 int i; 625 int i;
626 int r;
621 627
622 vcpu->arch.pc = 0; 628 vcpu->arch.pc = 0;
623 vcpu->arch.shared->msr = 0; 629 vcpu->arch.shared->msr = 0;
@@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
634 640
635 kvmppc_init_timing_stats(vcpu); 641 kvmppc_init_timing_stats(vcpu);
636 642
637 return kvmppc_core_vcpu_setup(vcpu); 643 r = kvmppc_core_vcpu_setup(vcpu);
644 kvmppc_sanity_check(vcpu);
645 return r;
638} 646}
639 647
640int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 648int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 797a7447c268..26d20903f2bc 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
73 /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ 73 /* Since booke kvm only support one core, update all vcpus' PIR to 0 */
74 vcpu->vcpu_id = 0; 74 vcpu->vcpu_id = 0;
75 75
76 vcpu->arch.cpu_type = KVM_CPU_E500V2;
77
76 return 0; 78 return 0;
77} 79}
78 80
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a107c9be0fb1..0d843c6ba315 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -39,12 +39,8 @@
39 39
40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 40int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
41{ 41{
42#ifndef CONFIG_KVM_BOOK3S_64_HV
43 return !(v->arch.shared->msr & MSR_WE) || 42 return !(v->arch.shared->msr & MSR_WE) ||
44 !!(v->arch.pending_exceptions); 43 !!(v->arch.pending_exceptions);
45#else
46 return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
47#endif
48} 44}
49 45
50int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 46int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -95,6 +91,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
95 return r; 91 return r;
96} 92}
97 93
94int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
95{
96 int r = false;
97
98 /* We have to know what CPU to virtualize */
99 if (!vcpu->arch.pvr)
100 goto out;
101
102 /* PAPR only works with book3s_64 */
103 if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
104 goto out;
105
106#ifdef CONFIG_KVM_BOOK3S_64_HV
107 /* HV KVM can only do PAPR mode for now */
108 if (!vcpu->arch.papr_enabled)
109 goto out;
110#endif
111
112 r = true;
113
114out:
115 vcpu->arch.sane = r;
116 return r ? 0 : -EINVAL;
117}
118
98int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) 119int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
99{ 120{
100 enum emulation_result er; 121 enum emulation_result er;
@@ -188,6 +209,8 @@ int kvm_dev_ioctl_check_extension(long ext)
188 case KVM_CAP_PPC_BOOKE_SREGS: 209 case KVM_CAP_PPC_BOOKE_SREGS:
189#else 210#else
190 case KVM_CAP_PPC_SEGSTATE: 211 case KVM_CAP_PPC_SEGSTATE:
212 case KVM_CAP_PPC_HIOR:
213 case KVM_CAP_PPC_PAPR:
191#endif 214#endif
192 case KVM_CAP_PPC_UNSET_IRQ: 215 case KVM_CAP_PPC_UNSET_IRQ:
193 case KVM_CAP_PPC_IRQ_LEVEL: 216 case KVM_CAP_PPC_IRQ_LEVEL:
@@ -258,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
258{ 281{
259 struct kvm_vcpu *vcpu; 282 struct kvm_vcpu *vcpu;
260 vcpu = kvmppc_core_vcpu_create(kvm, id); 283 vcpu = kvmppc_core_vcpu_create(kvm, id);
284 vcpu->arch.wqp = &vcpu->wq;
261 if (!IS_ERR(vcpu)) 285 if (!IS_ERR(vcpu))
262 kvmppc_create_vcpu_debugfs(vcpu, id); 286 kvmppc_create_vcpu_debugfs(vcpu, id);
263 return vcpu; 287 return vcpu;
@@ -289,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data)
289 313
290 kvmppc_core_queue_dec(vcpu); 314 kvmppc_core_queue_dec(vcpu);
291 315
292 if (waitqueue_active(&vcpu->wq)) { 316 if (waitqueue_active(vcpu->arch.wqp)) {
293 wake_up_interruptible(&vcpu->wq); 317 wake_up_interruptible(vcpu->arch.wqp);
294 vcpu->stat.halt_wakeup++; 318 vcpu->stat.halt_wakeup++;
295 } 319 }
296} 320}
@@ -543,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
543 567
544int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) 568int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
545{ 569{
546 if (irq->irq == KVM_INTERRUPT_UNSET) 570 if (irq->irq == KVM_INTERRUPT_UNSET) {
547 kvmppc_core_dequeue_external(vcpu, irq); 571 kvmppc_core_dequeue_external(vcpu, irq);
548 else 572 return 0;
549 kvmppc_core_queue_external(vcpu, irq); 573 }
574
575 kvmppc_core_queue_external(vcpu, irq);
550 576
551 if (waitqueue_active(&vcpu->wq)) { 577 if (waitqueue_active(vcpu->arch.wqp)) {
552 wake_up_interruptible(&vcpu->wq); 578 wake_up_interruptible(vcpu->arch.wqp);
553 vcpu->stat.halt_wakeup++; 579 vcpu->stat.halt_wakeup++;
554 } else if (vcpu->cpu != -1) { 580 } else if (vcpu->cpu != -1) {
555 smp_send_reschedule(vcpu->cpu); 581 smp_send_reschedule(vcpu->cpu);
@@ -571,11 +597,18 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
571 r = 0; 597 r = 0;
572 vcpu->arch.osi_enabled = true; 598 vcpu->arch.osi_enabled = true;
573 break; 599 break;
600 case KVM_CAP_PPC_PAPR:
601 r = 0;
602 vcpu->arch.papr_enabled = true;
603 break;
574 default: 604 default:
575 r = -EINVAL; 605 r = -EINVAL;
576 break; 606 break;
577 } 607 }
578 608
609 if (!r)
610 r = kvmppc_sanity_check(vcpu);
611
579 return r; 612 return r;
580} 613}
581 614
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 00ff00dfb24c..1ca5de07ac36 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -119,6 +119,7 @@ struct kvm_vcpu_stat {
119 u32 instruction_lctlg; 119 u32 instruction_lctlg;
120 u32 exit_program_interruption; 120 u32 exit_program_interruption;
121 u32 exit_instr_and_program; 121 u32 exit_instr_and_program;
122 u32 deliver_external_call;
122 u32 deliver_emergency_signal; 123 u32 deliver_emergency_signal;
123 u32 deliver_service_signal; 124 u32 deliver_service_signal;
124 u32 deliver_virtio_interrupt; 125 u32 deliver_virtio_interrupt;
@@ -138,6 +139,7 @@ struct kvm_vcpu_stat {
138 u32 instruction_stfl; 139 u32 instruction_stfl;
139 u32 instruction_tprot; 140 u32 instruction_tprot;
140 u32 instruction_sigp_sense; 141 u32 instruction_sigp_sense;
142 u32 instruction_sigp_external_call;
141 u32 instruction_sigp_emergency; 143 u32 instruction_sigp_emergency;
142 u32 instruction_sigp_stop; 144 u32 instruction_sigp_stop;
143 u32 instruction_sigp_arch; 145 u32 instruction_sigp_arch;
@@ -174,6 +176,10 @@ struct kvm_s390_prefix_info {
174 __u32 address; 176 __u32 address;
175}; 177};
176 178
179struct kvm_s390_extcall_info {
180 __u16 code;
181};
182
177struct kvm_s390_emerg_info { 183struct kvm_s390_emerg_info {
178 __u16 code; 184 __u16 code;
179}; 185};
@@ -186,6 +192,7 @@ struct kvm_s390_interrupt_info {
186 struct kvm_s390_ext_info ext; 192 struct kvm_s390_ext_info ext;
187 struct kvm_s390_pgm_info pgm; 193 struct kvm_s390_pgm_info pgm;
188 struct kvm_s390_emerg_info emerg; 194 struct kvm_s390_emerg_info emerg;
195 struct kvm_s390_extcall_info extcall;
189 struct kvm_s390_prefix_info prefix; 196 struct kvm_s390_prefix_info prefix;
190 }; 197 };
191}; 198};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index c9aeb4b4d0b8..87c16705b381 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -38,6 +38,11 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
38 struct kvm_s390_interrupt_info *inti) 38 struct kvm_s390_interrupt_info *inti)
39{ 39{
40 switch (inti->type) { 40 switch (inti->type) {
41 case KVM_S390_INT_EXTERNAL_CALL:
42 if (psw_extint_disabled(vcpu))
43 return 0;
44 if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
45 return 1;
41 case KVM_S390_INT_EMERGENCY: 46 case KVM_S390_INT_EMERGENCY:
42 if (psw_extint_disabled(vcpu)) 47 if (psw_extint_disabled(vcpu))
43 return 0; 48 return 0;
@@ -98,6 +103,7 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
98 struct kvm_s390_interrupt_info *inti) 103 struct kvm_s390_interrupt_info *inti)
99{ 104{
100 switch (inti->type) { 105 switch (inti->type) {
106 case KVM_S390_INT_EXTERNAL_CALL:
101 case KVM_S390_INT_EMERGENCY: 107 case KVM_S390_INT_EMERGENCY:
102 case KVM_S390_INT_SERVICE: 108 case KVM_S390_INT_SERVICE:
103 case KVM_S390_INT_VIRTIO: 109 case KVM_S390_INT_VIRTIO:
@@ -143,6 +149,28 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
143 exception = 1; 149 exception = 1;
144 break; 150 break;
145 151
152 case KVM_S390_INT_EXTERNAL_CALL:
153 VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
154 vcpu->stat.deliver_external_call++;
155 rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
156 if (rc == -EFAULT)
157 exception = 1;
158
159 rc = put_guest_u16(vcpu, __LC_CPU_ADDRESS, inti->extcall.code);
160 if (rc == -EFAULT)
161 exception = 1;
162
163 rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
164 &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
165 if (rc == -EFAULT)
166 exception = 1;
167
168 rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
169 __LC_EXT_NEW_PSW, sizeof(psw_t));
170 if (rc == -EFAULT)
171 exception = 1;
172 break;
173
146 case KVM_S390_INT_SERVICE: 174 case KVM_S390_INT_SERVICE:
147 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", 175 VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
148 inti->ext.ext_params); 176 inti->ext.ext_params);
@@ -522,6 +550,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
522 break; 550 break;
523 case KVM_S390_PROGRAM_INT: 551 case KVM_S390_PROGRAM_INT:
524 case KVM_S390_SIGP_STOP: 552 case KVM_S390_SIGP_STOP:
553 case KVM_S390_INT_EXTERNAL_CALL:
525 case KVM_S390_INT_EMERGENCY: 554 case KVM_S390_INT_EMERGENCY:
526 default: 555 default:
527 kfree(inti); 556 kfree(inti);
@@ -581,6 +610,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
581 break; 610 break;
582 case KVM_S390_SIGP_STOP: 611 case KVM_S390_SIGP_STOP:
583 case KVM_S390_RESTART: 612 case KVM_S390_RESTART:
613 case KVM_S390_INT_EXTERNAL_CALL:
584 case KVM_S390_INT_EMERGENCY: 614 case KVM_S390_INT_EMERGENCY:
585 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 615 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
586 inti->type = s390int->type; 616 inti->type = s390int->type;
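The new KVM_S390_INT_EXTERNAL_CALL interrupt is delivered as external interrupt code 0x1202 with the calling CPU's address in the low core, and it is only deliverable when the guest PSW allows external interrupts and the external-call submask (CR0 bit 0x2000) is set. The check, restated as a tiny predicate with simplified types:

#include <stdbool.h>
#include <stdint.h>

static bool extcall_deliverable(bool psw_extint_disabled, uint64_t guest_cr0)
{
	if (psw_extint_disabled)
		return false;
	if (guest_cr0 & 0x2000UL)	/* external-call submask */
		return true;
	return false;	/* the kernel falls through to the emergency-signal check here */
}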
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index dc2b580e27bc..9610ba41b974 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -46,6 +46,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
46 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, 46 { "instruction_lctlg", VCPU_STAT(instruction_lctlg) },
47 { "instruction_lctl", VCPU_STAT(instruction_lctl) }, 47 { "instruction_lctl", VCPU_STAT(instruction_lctl) },
48 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, 48 { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) },
49 { "deliver_external_call", VCPU_STAT(deliver_external_call) },
49 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, 50 { "deliver_service_signal", VCPU_STAT(deliver_service_signal) },
50 { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) }, 51 { "deliver_virtio_interrupt", VCPU_STAT(deliver_virtio_interrupt) },
51 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, 52 { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) },
@@ -64,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
64 { "instruction_stfl", VCPU_STAT(instruction_stfl) }, 65 { "instruction_stfl", VCPU_STAT(instruction_stfl) },
65 { "instruction_tprot", VCPU_STAT(instruction_tprot) }, 66 { "instruction_tprot", VCPU_STAT(instruction_tprot) },
66 { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, 67 { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
68 { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
67 { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, 69 { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) },
68 { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, 70 { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) },
69 { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, 71 { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) },
@@ -175,6 +177,8 @@ int kvm_arch_init_vm(struct kvm *kvm)
175 if (rc) 177 if (rc)
176 goto out_err; 178 goto out_err;
177 179
180 rc = -ENOMEM;
181
178 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL); 182 kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
179 if (!kvm->arch.sca) 183 if (!kvm->arch.sca)
180 goto out_err; 184 goto out_err;
@@ -312,11 +316,17 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
312struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 316struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
313 unsigned int id) 317 unsigned int id)
314{ 318{
315 struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); 319 struct kvm_vcpu *vcpu;
316 int rc = -ENOMEM; 320 int rc = -EINVAL;
321
322 if (id >= KVM_MAX_VCPUS)
323 goto out;
317 324
325 rc = -ENOMEM;
326
327 vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
318 if (!vcpu) 328 if (!vcpu)
319 goto out_nomem; 329 goto out;
320 330
321 vcpu->arch.sie_block = (struct kvm_s390_sie_block *) 331 vcpu->arch.sie_block = (struct kvm_s390_sie_block *)
322 get_zeroed_page(GFP_KERNEL); 332 get_zeroed_page(GFP_KERNEL);
@@ -352,7 +362,7 @@ out_free_sie_block:
352 free_page((unsigned long)(vcpu->arch.sie_block)); 362 free_page((unsigned long)(vcpu->arch.sie_block));
353out_free_cpu: 363out_free_cpu:
354 kfree(vcpu); 364 kfree(vcpu);
355out_nomem: 365out:
356 return ERR_PTR(rc); 366 return ERR_PTR(rc);
357} 367}
358 368
@@ -386,6 +396,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
386{ 396{
387 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 397 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
388 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 398 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
399 restore_access_regs(vcpu->arch.guest_acrs);
389 return 0; 400 return 0;
390} 401}
391 402
@@ -401,6 +412,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
401{ 412{
402 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 413 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
403 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 414 vcpu->arch.guest_fpregs.fpc = fpu->fpc;
415 restore_fp_regs(&vcpu->arch.guest_fpregs);
404 return 0; 416 return 0;
405} 417}
406 418
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index d6a50c1fb2e6..f815118835f3 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -87,6 +87,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
87 return -ENOMEM; 87 return -ENOMEM;
88 88
89 inti->type = KVM_S390_INT_EMERGENCY; 89 inti->type = KVM_S390_INT_EMERGENCY;
90 inti->emerg.code = vcpu->vcpu_id;
90 91
91 spin_lock(&fi->lock); 92 spin_lock(&fi->lock);
92 li = fi->local_int[cpu_addr]; 93 li = fi->local_int[cpu_addr];
@@ -103,9 +104,47 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
103 wake_up_interruptible(&li->wq); 104 wake_up_interruptible(&li->wq);
104 spin_unlock_bh(&li->lock); 105 spin_unlock_bh(&li->lock);
105 rc = 0; /* order accepted */ 106 rc = 0; /* order accepted */
107 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
108unlock:
109 spin_unlock(&fi->lock);
110 return rc;
111}
112
113static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
114{
115 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
116 struct kvm_s390_local_interrupt *li;
117 struct kvm_s390_interrupt_info *inti;
118 int rc;
119
120 if (cpu_addr >= KVM_MAX_VCPUS)
121 return 3; /* not operational */
122
123 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
124 if (!inti)
125 return -ENOMEM;
126
127 inti->type = KVM_S390_INT_EXTERNAL_CALL;
128 inti->extcall.code = vcpu->vcpu_id;
129
130 spin_lock(&fi->lock);
131 li = fi->local_int[cpu_addr];
132 if (li == NULL) {
133 rc = 3; /* not operational */
134 kfree(inti);
135 goto unlock;
136 }
137 spin_lock_bh(&li->lock);
138 list_add_tail(&inti->list, &li->list);
139 atomic_set(&li->active, 1);
140 atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
141 if (waitqueue_active(&li->wq))
142 wake_up_interruptible(&li->wq);
143 spin_unlock_bh(&li->lock);
144 rc = 0; /* order accepted */
145 VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
106unlock: 146unlock:
107 spin_unlock(&fi->lock); 147 spin_unlock(&fi->lock);
108 VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
109 return rc; 148 return rc;
110} 149}
111 150
@@ -267,6 +306,10 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
267 rc = __sigp_sense(vcpu, cpu_addr, 306 rc = __sigp_sense(vcpu, cpu_addr,
268 &vcpu->arch.guest_gprs[r1]); 307 &vcpu->arch.guest_gprs[r1]);
269 break; 308 break;
309 case SIGP_EXTERNAL_CALL:
310 vcpu->stat.instruction_sigp_external_call++;
311 rc = __sigp_external_call(vcpu, cpu_addr);
312 break;
270 case SIGP_EMERGENCY: 313 case SIGP_EMERGENCY:
271 vcpu->stat.instruction_sigp_emergency++; 314 vcpu->stat.instruction_sigp_emergency++;
272 rc = __sigp_emergency(vcpu, cpu_addr); 315 rc = __sigp_emergency(vcpu, cpu_addr);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 34595d5e1038..3925d8007864 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -100,7 +100,9 @@
100#define APIC_TIMER_BASE_CLKIN 0x0 100#define APIC_TIMER_BASE_CLKIN 0x0
101#define APIC_TIMER_BASE_TMBASE 0x1 101#define APIC_TIMER_BASE_TMBASE 0x1
102#define APIC_TIMER_BASE_DIV 0x2 102#define APIC_TIMER_BASE_DIV 0x2
103#define APIC_LVT_TIMER_ONESHOT (0 << 17)
103#define APIC_LVT_TIMER_PERIODIC (1 << 17) 104#define APIC_LVT_TIMER_PERIODIC (1 << 17)
105#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
104#define APIC_LVT_MASKED (1 << 16) 106#define APIC_LVT_MASKED (1 << 16)
105#define APIC_LVT_LEVEL_TRIGGER (1 << 15) 107#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
106#define APIC_LVT_REMOTE_IRR (1 << 14) 108#define APIC_LVT_REMOTE_IRR (1 << 14)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aa6a488cd075..2f84a433b6a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -121,6 +121,7 @@
121#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ 121#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
122#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ 122#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
123#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ 123#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
124#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */
124#define X86_FEATURE_AES (4*32+25) /* AES instructions */ 125#define X86_FEATURE_AES (4*32+25) /* AES instructions */
125#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ 126#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
126#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 127#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 6040d115ef51..a026507893e9 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -262,7 +262,7 @@ struct x86_emulate_ctxt {
262 struct operand dst; 262 struct operand dst;
263 bool has_seg_override; 263 bool has_seg_override;
264 u8 seg_override; 264 u8 seg_override;
265 unsigned int d; 265 u64 d;
266 int (*execute)(struct x86_emulate_ctxt *ctxt); 266 int (*execute)(struct x86_emulate_ctxt *ctxt);
267 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 267 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
268 /* modrm */ 268 /* modrm */
@@ -275,6 +275,8 @@ struct x86_emulate_ctxt {
275 unsigned long _eip; 275 unsigned long _eip;
276 /* Fields above regs are cleared together. */ 276 /* Fields above regs are cleared together. */
277 unsigned long regs[NR_VCPU_REGS]; 277 unsigned long regs[NR_VCPU_REGS];
278 struct operand memop;
279 struct operand *memopp;
278 struct fetch_cache fetch; 280 struct fetch_cache fetch;
279 struct read_cache io_read; 281 struct read_cache io_read;
280 struct read_cache mem_read; 282 struct read_cache mem_read;
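Widening ctxt->d to u64 is what makes room for the OpXX operand encoding introduced in the emulate.c hunk further down: each operand slot is a 5-bit field (OpBits/OpMask) at a fixed shift, e.g. DstShift = 1 and SrcShift = 6, so extracting an operand type is just a shift and mask. A small illustration using those constants:

#include <stdint.h>

#define OpBits   5
#define OpMask   ((1ull << OpBits) - 1)
#define DstShift 1
#define SrcShift 6

/* Pull the destination/source operand types back out of the packed
 * 64-bit decode flags. */
static unsigned int dst_optype(uint64_t d) { return (d >> DstShift) & OpMask; }
static unsigned int src_optype(uint64_t d) { return (d >> SrcShift) & OpMask; }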
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd51c83aa5de..b4973f4dab98 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -26,7 +26,8 @@
26#include <asm/mtrr.h> 26#include <asm/mtrr.h>
27#include <asm/msr-index.h> 27#include <asm/msr-index.h>
28 28
29#define KVM_MAX_VCPUS 64 29#define KVM_MAX_VCPUS 254
30#define KVM_SOFT_MAX_VCPUS 64
30#define KVM_MEMORY_SLOTS 32 31#define KVM_MEMORY_SLOTS 32
31/* memory slots that does not exposed to userspace */ 32/* memory slots that does not exposed to userspace */
32#define KVM_PRIVATE_MEM_SLOTS 4 33#define KVM_PRIVATE_MEM_SLOTS 4
@@ -264,6 +265,7 @@ struct kvm_mmu {
264 void (*new_cr3)(struct kvm_vcpu *vcpu); 265 void (*new_cr3)(struct kvm_vcpu *vcpu);
265 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 266 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
266 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 267 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
268 u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
267 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, 269 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
268 bool prefault); 270 bool prefault);
269 void (*inject_page_fault)(struct kvm_vcpu *vcpu, 271 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
@@ -411,8 +413,9 @@ struct kvm_vcpu_arch {
411 u32 tsc_catchup_mult; 413 u32 tsc_catchup_mult;
412 s8 tsc_catchup_shift; 414 s8 tsc_catchup_shift;
413 415
414 bool nmi_pending; 416 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
415 bool nmi_injected; 417 unsigned nmi_pending; /* NMI queued after currently running handler */
418 bool nmi_injected; /* Trying to inject an NMI this entry */
416 419
417 struct mtrr_state_type mtrr_state; 420 struct mtrr_state_type mtrr_state;
418 u32 pat; 421 u32 pat;
@@ -628,14 +631,13 @@ struct kvm_x86_ops {
628 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 631 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
629 632
630 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 633 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
634 u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
631 635
632 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 636 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
633 637
634 int (*check_intercept)(struct kvm_vcpu *vcpu, 638 int (*check_intercept)(struct kvm_vcpu *vcpu,
635 struct x86_instruction_info *info, 639 struct x86_instruction_info *info,
636 enum x86_intercept_stage stage); 640 enum x86_intercept_stage stage);
637
638 const struct trace_print_flags *exit_reasons_str;
639}; 641};
640 642
641struct kvm_arch_async_pf { 643struct kvm_arch_async_pf {
@@ -672,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
672 674
673extern bool tdp_enabled; 675extern bool tdp_enabled;
674 676
677u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
678
675/* control of guest tsc rate supported? */ 679/* control of guest tsc rate supported? */
676extern bool kvm_has_tsc_control; 680extern bool kvm_has_tsc_control;
677/* minimum supported tsc_khz for guests */ 681/* minimum supported tsc_khz for guests */
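The nmi_pending bool becomes a counter and gains an atomic companion, nmi_queued: injection paths only bump nmi_queued, and the vcpu later folds the queue into nmi_pending while honouring the architectural rule that at most one NMI can be latched while another is being handled. A hedged sketch of that folding step; field and helper names are illustrative, not the exact x86.c code:

#include <stdbool.h>

struct vcpu_nmi_state {
	int nmi_queued;		/* atomic_t in the kernel */
	unsigned int nmi_pending;
	bool nmi_injected;	/* an NMI is currently being handled */
};

static void fold_queued_nmis(struct vcpu_nmi_state *v)
{
	unsigned int limit = v->nmi_injected ? 1 : 2;	/* one running + one latched */

	v->nmi_pending += __atomic_exchange_n(&v->nmi_queued, 0, __ATOMIC_ACQ_REL);
	if (v->nmi_pending > limit)
		v->nmi_pending = limit;
}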
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d52609aeeab8..a6962d9161a0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -229,6 +229,8 @@
229#define MSR_IA32_APICBASE_ENABLE (1<<11) 229#define MSR_IA32_APICBASE_ENABLE (1<<11)
230#define MSR_IA32_APICBASE_BASE (0xfffff<<12) 230#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
231 231
232#define MSR_IA32_TSCDEADLINE 0x000006e0
233
232#define MSR_IA32_UCODE_WRITE 0x00000079 234#define MSR_IA32_UCODE_WRITE 0x00000079
233#define MSR_IA32_UCODE_REV 0x0000008b 235#define MSR_IA32_UCODE_REV 0x0000008b
234 236
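MSR_IA32_TSCDEADLINE (0x6e0) together with the APIC_LVT_TIMER_TSCDEADLINE mode bit above is the guest-visible interface of the emulated TSC-deadline timer: the guest selects deadline mode in the LVT timer entry, then writes an absolute TSC value to the MSR to arm it. A sketch of the guest side, assuming a mapped xAPIC page and ring-0 context; the 0x320 LVT timer offset and the rdtsc/wrmsr helpers are written out here only for illustration:

#include <stdint.h>

#define MSR_IA32_TSCDEADLINE		0x000006e0
#define APIC_LVTT_OFFSET		0x320		/* xAPIC LVT Timer register */
#define APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static inline void wrmsr(uint32_t msr, uint64_t val)
{
	__asm__ volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
			 "d"((uint32_t)(val >> 32)));
}

static void arm_tsc_deadline(volatile uint32_t *lapic_base,
			     uint8_t vector, uint64_t delta_cycles)
{
	/* Select TSC-deadline mode in the LVT timer entry. */
	lapic_base[APIC_LVTT_OFFSET / 4] = APIC_LVT_TIMER_TSCDEADLINE | vector;
	/* Writing the deadline MSR arms the timer; writing 0 disarms it. */
	wrmsr(MSR_IA32_TSCDEADLINE, rdtsc() + delta_cycles);
}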
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2caf290e9895..31f180c21ce9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -350,6 +350,18 @@ enum vmcs_field {
350#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ 350#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
351 351
352 352
353/*
354 * Exit Qualifications for APIC-Access
355 */
356#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */
357#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */
358#define TYPE_LINEAR_APIC_INST_READ (0 << 12)
359#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12)
360#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12)
361#define TYPE_LINEAR_APIC_EVENT (3 << 12)
362#define TYPE_PHYSICAL_APIC_EVENT (10 << 12)
363#define TYPE_PHYSICAL_APIC_INST (15 << 12)
364
353/* segment AR */ 365/* segment AR */
354#define SEGMENT_AR_L_MASK (1 << 13) 366#define SEGMENT_AR_L_MASK (1 << 13)
355 367
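The new exit-qualification masks describe APIC-access VM exits: bits 11:0 give the offset into the APIC page that was touched and bits 15:12 the access type (linear read/write/fetch, or an event/physical access). A helper of the sort a VMM would use to interpret them, shown for illustration only:

#include <stdbool.h>
#include <stdint.h>

#define APIC_ACCESS_OFFSET		0xfff
#define APIC_ACCESS_TYPE		0xf000
#define TYPE_LINEAR_APIC_INST_WRITE	(1 << 12)

static bool is_linear_write_to(uint64_t exit_qual, uint32_t reg_offset)
{
	return (exit_qual & APIC_ACCESS_TYPE) == TYPE_LINEAR_APIC_INST_WRITE &&
	       (exit_qual & APIC_ACCESS_OFFSET) == reg_offset;
}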
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8b4cc5f067de..f1e3be18a08f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,39 @@
29#include "tss.h" 29#include "tss.h"
30 30
31/* 31/*
32 * Operand types
33 */
34#define OpNone 0ull
35#define OpImplicit 1ull /* No generic decode */
36#define OpReg 2ull /* Register */
37#define OpMem 3ull /* Memory */
38#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
39#define OpDI 5ull /* ES:DI/EDI/RDI */
40#define OpMem64 6ull /* Memory, 64-bit */
41#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
42#define OpDX 8ull /* DX register */
43#define OpCL 9ull /* CL register (for shifts) */
44#define OpImmByte 10ull /* 8-bit sign extended immediate */
45#define OpOne 11ull /* Implied 1 */
46#define OpImm 12ull /* Sign extended immediate */
47#define OpMem16 13ull /* Memory operand (16-bit). */
48#define OpMem32 14ull /* Memory operand (32-bit). */
49#define OpImmU 15ull /* Immediate operand, zero extended */
50#define OpSI 16ull /* SI/ESI/RSI */
51#define OpImmFAddr 17ull /* Immediate far address */
52#define OpMemFAddr 18ull /* Far address in memory */
53#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
54#define OpES 20ull /* ES */
55#define OpCS 21ull /* CS */
56#define OpSS 22ull /* SS */
57#define OpDS 23ull /* DS */
58#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */
60
61#define OpBits 5 /* Width of operand field */
62#define OpMask ((1ull << OpBits) - 1)
63
64/*
32 * Opcode effective-address decode tables. 65 * Opcode effective-address decode tables.
33 * Note that we only emulate instructions that have at least one memory 66 * Note that we only emulate instructions that have at least one memory
34 * operand (excluding implicit stack references). We assume that stack 67 * operand (excluding implicit stack references). We assume that stack
@@ -40,37 +73,35 @@
40/* Operand sizes: 8-bit operands or specified/overridden size. */ 73/* Operand sizes: 8-bit operands or specified/overridden size. */
41#define ByteOp (1<<0) /* 8-bit operands. */ 74#define ByteOp (1<<0) /* 8-bit operands. */
42/* Destination operand type. */ 75/* Destination operand type. */
43#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 76#define DstShift 1
44#define DstReg (2<<1) /* Register operand. */ 77#define ImplicitOps (OpImplicit << DstShift)
45#define DstMem (3<<1) /* Memory operand. */ 78#define DstReg (OpReg << DstShift)
46#define DstAcc (4<<1) /* Destination Accumulator */ 79#define DstMem (OpMem << DstShift)
47#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 80#define DstAcc (OpAcc << DstShift)
48#define DstMem64 (6<<1) /* 64bit memory operand */ 81#define DstDI (OpDI << DstShift)
49#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ 82#define DstMem64 (OpMem64 << DstShift)
50#define DstDX (8<<1) /* Destination is in DX register */ 83#define DstImmUByte (OpImmUByte << DstShift)
51#define DstMask (0xf<<1) 84#define DstDX (OpDX << DstShift)
85#define DstMask (OpMask << DstShift)
52/* Source operand type. */ 86/* Source operand type. */
53#define SrcNone (0<<5) /* No source operand. */ 87#define SrcShift 6
54#define SrcReg (1<<5) /* Register operand. */ 88#define SrcNone (OpNone << SrcShift)
55#define SrcMem (2<<5) /* Memory operand. */ 89#define SrcReg (OpReg << SrcShift)
56#define SrcMem16 (3<<5) /* Memory operand (16-bit). */ 90#define SrcMem (OpMem << SrcShift)
57#define SrcMem32 (4<<5) /* Memory operand (32-bit). */ 91#define SrcMem16 (OpMem16 << SrcShift)
58#define SrcImm (5<<5) /* Immediate operand. */ 92#define SrcMem32 (OpMem32 << SrcShift)
59#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */ 93#define SrcImm (OpImm << SrcShift)
60#define SrcOne (7<<5) /* Implied '1' */ 94#define SrcImmByte (OpImmByte << SrcShift)
61#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */ 95#define SrcOne (OpOne << SrcShift)
62#define SrcImmU (9<<5) /* Immediate operand, unsigned */ 96#define SrcImmUByte (OpImmUByte << SrcShift)
63#define SrcSI (0xa<<5) /* Source is in the DS:RSI */ 97#define SrcImmU (OpImmU << SrcShift)
64#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */ 98#define SrcSI (OpSI << SrcShift)
65#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */ 99#define SrcImmFAddr (OpImmFAddr << SrcShift)
66#define SrcAcc (0xd<<5) /* Source Accumulator */ 100#define SrcMemFAddr (OpMemFAddr << SrcShift)
67#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ 101#define SrcAcc (OpAcc << SrcShift)
68#define SrcDX (0xf<<5) /* Source is in DX register */ 102#define SrcImmU16 (OpImmU16 << SrcShift)
69#define SrcMask (0xf<<5) 103#define SrcDX (OpDX << SrcShift)
70/* Generic ModRM decode. */ 104#define SrcMask (OpMask << SrcShift)
71#define ModRM (1<<9)
72/* Destination is only written; never read. */
73#define Mov (1<<10)
74#define BitOp (1<<11) 105#define BitOp (1<<11)
75#define MemAbs (1<<12) /* Memory operand is absolute displacement */ 106#define MemAbs (1<<12) /* Memory operand is absolute displacement */
76#define String (1<<13) /* String instruction (rep capable) */ 107#define String (1<<13) /* String instruction (rep capable) */
@@ -81,6 +112,10 @@
81#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ 112#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
82#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 113#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
83#define Sse (1<<18) /* SSE Vector instruction */ 114#define Sse (1<<18) /* SSE Vector instruction */
115/* Generic ModRM decode. */
116#define ModRM (1<<19)
117/* Destination is only written; never read. */
118#define Mov (1<<20)
84/* Misc flags */ 119/* Misc flags */
85#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ 120#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
86#define VendorSpecific (1<<22) /* Vendor specific instruction */ 121#define VendorSpecific (1<<22) /* Vendor specific instruction */
@@ -91,12 +126,19 @@
91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 126#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
92#define No64 (1<<28) 127#define No64 (1<<28)
93/* Source 2 operand type */ 128/* Source 2 operand type */
94#define Src2None (0<<29) 129#define Src2Shift (29)
95#define Src2CL (1<<29) 130#define Src2None (OpNone << Src2Shift)
96#define Src2ImmByte (2<<29) 131#define Src2CL (OpCL << Src2Shift)
97#define Src2One (3<<29) 132#define Src2ImmByte (OpImmByte << Src2Shift)
98#define Src2Imm (4<<29) 133#define Src2One (OpOne << Src2Shift)
99#define Src2Mask (7<<29) 134#define Src2Imm (OpImm << Src2Shift)
135#define Src2ES (OpES << Src2Shift)
136#define Src2CS (OpCS << Src2Shift)
137#define Src2SS (OpSS << Src2Shift)
138#define Src2DS (OpDS << Src2Shift)
139#define Src2FS (OpFS << Src2Shift)
140#define Src2GS (OpGS << Src2Shift)
141#define Src2Mask (OpMask << Src2Shift)
100 142
101#define X2(x...) x, x 143#define X2(x...) x, x
102#define X3(x...) X2(x), x 144#define X3(x...) X2(x), x
@@ -108,8 +150,8 @@
108#define X16(x...) X8(x), X8(x) 150#define X16(x...) X8(x), X8(x)
109 151
110struct opcode { 152struct opcode {
111 u32 flags; 153 u64 flags : 56;
112 u8 intercept; 154 u64 intercept : 8;
113 union { 155 union {
114 int (*execute)(struct x86_emulate_ctxt *ctxt); 156 int (*execute)(struct x86_emulate_ctxt *ctxt);
115 struct opcode *group; 157 struct opcode *group;
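The old flag layout gave each operand slot its own ad-hoc bit patterns; the new one defines a single 5-bit operand-type namespace (OpNone .. OpGS) and places the same codes at DstShift, SrcShift and Src2Shift. With Src2Shift at 29 and a 5-bit field, the encoding no longer fits in 32 bits, which is why ctxt->d grows to 64 bits and struct opcode packs flags into a 56-bit bitfield beside the 8-bit intercept. A toy program showing the packing and the shift-and-mask extraction the decoder relies on (the field positions are copied from the patch; the program itself is only an illustration).

#include <stdint.h>
#include <assert.h>

#define OpReg    2ull
#define OpMem    3ull
#define OpCL     9ull

#define OpBits   5
#define OpMask   ((1ull << OpBits) - 1)

#define DstShift  1
#define SrcShift  6
#define Src2Shift 29

#define DstMem  (OpMem << DstShift)
#define SrcReg  (OpReg << SrcShift)
#define Src2CL  (OpCL  << Src2Shift)

int main(void)
{
	uint64_t d = DstMem | SrcReg | Src2CL;	/* e.g. a shld-like insn */

	/* The decoder recovers each operand kind with one shift and mask,
	 * so a single decode_operand()-style switch can serve dst, src
	 * and src2 alike. */
	assert(((d >> DstShift)  & OpMask) == OpMem);
	assert(((d >> SrcShift)  & OpMask) == OpReg);
	assert(((d >> Src2Shift) & OpMask) == OpCL);
	return 0;
}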
@@ -205,105 +247,100 @@ struct gprefix {
205#define ON64(x) 247#define ON64(x)
206#endif 248#endif
207 249
208#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ 250#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
209 do { \ 251 do { \
210 __asm__ __volatile__ ( \ 252 __asm__ __volatile__ ( \
211 _PRE_EFLAGS("0", "4", "2") \ 253 _PRE_EFLAGS("0", "4", "2") \
212 _op _suffix " %"_x"3,%1; " \ 254 _op _suffix " %"_x"3,%1; " \
213 _POST_EFLAGS("0", "4", "2") \ 255 _POST_EFLAGS("0", "4", "2") \
214 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ 256 : "=m" ((ctxt)->eflags), \
257 "+q" (*(_dsttype*)&(ctxt)->dst.val), \
215 "=&r" (_tmp) \ 258 "=&r" (_tmp) \
216 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 259 : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
217 } while (0) 260 } while (0)
218 261
219 262
220/* Raw emulation: instruction has two explicit operands. */ 263/* Raw emulation: instruction has two explicit operands. */
221#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ 264#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
222 do { \ 265 do { \
223 unsigned long _tmp; \ 266 unsigned long _tmp; \
224 \ 267 \
225 switch ((_dst).bytes) { \ 268 switch ((ctxt)->dst.bytes) { \
226 case 2: \ 269 case 2: \
227 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ 270 ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
228 break; \ 271 break; \
229 case 4: \ 272 case 4: \
230 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ 273 ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
231 break; \ 274 break; \
232 case 8: \ 275 case 8: \
233 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ 276 ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
234 break; \ 277 break; \
235 } \ 278 } \
236 } while (0) 279 } while (0)
237 280
238#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 281#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
239 do { \ 282 do { \
240 unsigned long _tmp; \ 283 unsigned long _tmp; \
241 switch ((_dst).bytes) { \ 284 switch ((ctxt)->dst.bytes) { \
242 case 1: \ 285 case 1: \
243 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ 286 ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
244 break; \ 287 break; \
245 default: \ 288 default: \
246 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 289 __emulate_2op_nobyte(ctxt, _op, \
247 _wx, _wy, _lx, _ly, _qx, _qy); \ 290 _wx, _wy, _lx, _ly, _qx, _qy); \
248 break; \ 291 break; \
249 } \ 292 } \
250 } while (0) 293 } while (0)
251 294
252/* Source operand is byte-sized and may be restricted to just %cl. */ 295/* Source operand is byte-sized and may be restricted to just %cl. */
253#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ 296#define emulate_2op_SrcB(ctxt, _op) \
254 __emulate_2op(_op, _src, _dst, _eflags, \ 297 __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
255 "b", "c", "b", "c", "b", "c", "b", "c")
256 298
257/* Source operand is byte, word, long or quad sized. */ 299/* Source operand is byte, word, long or quad sized. */
258#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ 300#define emulate_2op_SrcV(ctxt, _op) \
259 __emulate_2op(_op, _src, _dst, _eflags, \ 301 __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
260 "b", "q", "w", "r", _LO32, "r", "", "r")
261 302
262/* Source operand is word, long or quad sized. */ 303/* Source operand is word, long or quad sized. */
263#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ 304#define emulate_2op_SrcV_nobyte(ctxt, _op) \
264 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 305 __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
265 "w", "r", _LO32, "r", "", "r")
266 306
267/* Instruction has three operands and one operand is stored in ECX register */ 307/* Instruction has three operands and one operand is stored in ECX register */
268#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 308#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
269 do { \ 309 do { \
270 unsigned long _tmp; \ 310 unsigned long _tmp; \
271 _type _clv = (_cl).val; \ 311 _type _clv = (ctxt)->src2.val; \
272 _type _srcv = (_src).val; \ 312 _type _srcv = (ctxt)->src.val; \
273 _type _dstv = (_dst).val; \ 313 _type _dstv = (ctxt)->dst.val; \
274 \ 314 \
275 __asm__ __volatile__ ( \ 315 __asm__ __volatile__ ( \
276 _PRE_EFLAGS("0", "5", "2") \ 316 _PRE_EFLAGS("0", "5", "2") \
277 _op _suffix " %4,%1 \n" \ 317 _op _suffix " %4,%1 \n" \
278 _POST_EFLAGS("0", "5", "2") \ 318 _POST_EFLAGS("0", "5", "2") \
279 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 319 : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
280 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 320 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
281 ); \ 321 ); \
282 \ 322 \
283 (_cl).val = (unsigned long) _clv; \ 323 (ctxt)->src2.val = (unsigned long) _clv; \
284 (_src).val = (unsigned long) _srcv; \ 324 (ctxt)->src2.val = (unsigned long) _srcv; \
285 (_dst).val = (unsigned long) _dstv; \ 325 (ctxt)->dst.val = (unsigned long) _dstv; \
286 } while (0) 326 } while (0)
287 327
288#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 328#define emulate_2op_cl(ctxt, _op) \
289 do { \ 329 do { \
290 switch ((_dst).bytes) { \ 330 switch ((ctxt)->dst.bytes) { \
291 case 2: \ 331 case 2: \
292 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 332 __emulate_2op_cl(ctxt, _op, "w", u16); \
293 "w", unsigned short); \
294 break; \ 333 break; \
295 case 4: \ 334 case 4: \
296 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 335 __emulate_2op_cl(ctxt, _op, "l", u32); \
297 "l", unsigned int); \
298 break; \ 336 break; \
299 case 8: \ 337 case 8: \
300 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 338 ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
301 "q", unsigned long)); \
302 break; \ 339 break; \
303 } \ 340 } \
304 } while (0) 341 } while (0)
305 342
306#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 343#define __emulate_1op(ctxt, _op, _suffix) \
307 do { \ 344 do { \
308 unsigned long _tmp; \ 345 unsigned long _tmp; \
309 \ 346 \
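The emulate_2op/1op macros now take only the emulation context; source, destination and EFLAGS all live in ctxt, so call sites shrink to emulate_2op_SrcV(ctxt, "add") and the like. The underlying trick is unchanged: run the host's own ALU instruction on the guest's operand values and harvest the resulting flags. A stand-alone x86-64 sketch of that trick using GCC/Clang extended asm; it captures only CF via setc to stay simple, whereas the kernel macros collect the guest-visible EFLAGS bits for every operand width, and toy_ctxt/toy_emulate_add are illustrative names.

#include <stdint.h>
#include <stdio.h>

struct toy_ctxt {
	uint64_t src;
	uint64_t dst;
	uint64_t eflags;
};

static void toy_emulate_add(struct toy_ctxt *ctxt)
{
	uint8_t cf;

	/* Execute the real add on the guest's values, then read back CF. */
	asm("addq %2, %0\n\t"
	    "setc %1"
	    : "+r"(ctxt->dst), "=q"(cf)
	    : "r"(ctxt->src)
	    : "cc");
	ctxt->eflags = cf;			/* CF only, for the demo */
}

int main(void)
{
	struct toy_ctxt c = { .src = 1, .dst = UINT64_MAX };

	toy_emulate_add(&c);
	printf("dst=%llu CF=%llu\n",
	       (unsigned long long)c.dst, (unsigned long long)c.eflags);
	return 0;
}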
@@ -311,39 +348,27 @@ struct gprefix {
311 _PRE_EFLAGS("0", "3", "2") \ 348 _PRE_EFLAGS("0", "3", "2") \
312 _op _suffix " %1; " \ 349 _op _suffix " %1; " \
313 _POST_EFLAGS("0", "3", "2") \ 350 _POST_EFLAGS("0", "3", "2") \
314 : "=m" (_eflags), "+m" ((_dst).val), \ 351 : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
315 "=&r" (_tmp) \ 352 "=&r" (_tmp) \
316 : "i" (EFLAGS_MASK)); \ 353 : "i" (EFLAGS_MASK)); \
317 } while (0) 354 } while (0)
318 355
319/* Instruction has only one explicit operand (no source operand). */ 356/* Instruction has only one explicit operand (no source operand). */
320#define emulate_1op(_op, _dst, _eflags) \ 357#define emulate_1op(ctxt, _op) \
321 do { \ 358 do { \
322 switch ((_dst).bytes) { \ 359 switch ((ctxt)->dst.bytes) { \
323 case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ 360 case 1: __emulate_1op(ctxt, _op, "b"); break; \
324 case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ 361 case 2: __emulate_1op(ctxt, _op, "w"); break; \
325 case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ 362 case 4: __emulate_1op(ctxt, _op, "l"); break; \
326 case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ 363 case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
327 } \ 364 } \
328 } while (0) 365 } while (0)
329 366
330#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ 367#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
331 do { \
332 unsigned long _tmp; \
333 \
334 __asm__ __volatile__ ( \
335 _PRE_EFLAGS("0", "4", "1") \
336 _op _suffix " %5; " \
337 _POST_EFLAGS("0", "4", "1") \
338 : "=m" (_eflags), "=&r" (_tmp), \
339 "+a" (_rax), "+d" (_rdx) \
340 : "i" (EFLAGS_MASK), "m" ((_src).val), \
341 "a" (_rax), "d" (_rdx)); \
342 } while (0)
343
344#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
345 do { \ 368 do { \
346 unsigned long _tmp; \ 369 unsigned long _tmp; \
370 ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \
371 ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \
347 \ 372 \
348 __asm__ __volatile__ ( \ 373 __asm__ __volatile__ ( \
349 _PRE_EFLAGS("0", "5", "1") \ 374 _PRE_EFLAGS("0", "5", "1") \
@@ -356,53 +381,27 @@ struct gprefix {
356 "jmp 2b \n\t" \ 381 "jmp 2b \n\t" \
357 ".popsection \n\t" \ 382 ".popsection \n\t" \
358 _ASM_EXTABLE(1b, 3b) \ 383 _ASM_EXTABLE(1b, 3b) \
359 : "=m" (_eflags), "=&r" (_tmp), \ 384 : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
360 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ 385 "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
361 : "i" (EFLAGS_MASK), "m" ((_src).val), \ 386 : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \
362 "a" (_rax), "d" (_rdx)); \ 387 "a" (*rax), "d" (*rdx)); \
363 } while (0) 388 } while (0)
364 389
365/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ 390/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
366#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ 391#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
367 do { \ 392 do { \
368 switch((_src).bytes) { \ 393 switch((ctxt)->src.bytes) { \
369 case 1: \ 394 case 1: \
370 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 395 __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
371 _eflags, "b"); \
372 break; \ 396 break; \
373 case 2: \ 397 case 2: \
374 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 398 __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
375 _eflags, "w"); \
376 break; \ 399 break; \
377 case 4: \ 400 case 4: \
378 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ 401 __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
379 _eflags, "l"); \
380 break; \
381 case 8: \
382 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
383 _eflags, "q")); \
384 break; \
385 } \
386 } while (0)
387
388#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
389 do { \
390 switch((_src).bytes) { \
391 case 1: \
392 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
393 _eflags, "b", _ex); \
394 break; \
395 case 2: \
396 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
397 _eflags, "w", _ex); \
398 break; \
399 case 4: \
400 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
401 _eflags, "l", _ex); \
402 break; \ 402 break; \
403 case 8: ON64( \ 403 case 8: ON64( \
404 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ 404 __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
405 _eflags, "q", _ex)); \
406 break; \ 405 break; \
407 } \ 406 } \
408 } while (0) 407 } while (0)
@@ -651,41 +650,50 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
651 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); 650 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
652} 651}
653 652
654static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, 653/*
655 unsigned long eip, u8 *dest) 654 * Fetch the next byte of the instruction being emulated which is pointed to
655 * by ctxt->_eip, then increment ctxt->_eip.
656 *
657 * Also prefetch the remaining bytes of the instruction without crossing page
658 * boundary if they are not in fetch_cache yet.
659 */
660static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
656{ 661{
657 struct fetch_cache *fc = &ctxt->fetch; 662 struct fetch_cache *fc = &ctxt->fetch;
658 int rc; 663 int rc;
659 int size, cur_size; 664 int size, cur_size;
660 665
661 if (eip == fc->end) { 666 if (ctxt->_eip == fc->end) {
662 unsigned long linear; 667 unsigned long linear;
663 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; 668 struct segmented_address addr = { .seg = VCPU_SREG_CS,
669 .ea = ctxt->_eip };
664 cur_size = fc->end - fc->start; 670 cur_size = fc->end - fc->start;
665 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 671 size = min(15UL - cur_size,
672 PAGE_SIZE - offset_in_page(ctxt->_eip));
666 rc = __linearize(ctxt, addr, size, false, true, &linear); 673 rc = __linearize(ctxt, addr, size, false, true, &linear);
667 if (rc != X86EMUL_CONTINUE) 674 if (unlikely(rc != X86EMUL_CONTINUE))
668 return rc; 675 return rc;
669 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, 676 rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
670 size, &ctxt->exception); 677 size, &ctxt->exception);
671 if (rc != X86EMUL_CONTINUE) 678 if (unlikely(rc != X86EMUL_CONTINUE))
672 return rc; 679 return rc;
673 fc->end += size; 680 fc->end += size;
674 } 681 }
675 *dest = fc->data[eip - fc->start]; 682 *dest = fc->data[ctxt->_eip - fc->start];
683 ctxt->_eip++;
676 return X86EMUL_CONTINUE; 684 return X86EMUL_CONTINUE;
677} 685}
678 686
679static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 687static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
680 unsigned long eip, void *dest, unsigned size) 688 void *dest, unsigned size)
681{ 689{
682 int rc; 690 int rc;
683 691
684 /* x86 instructions are limited to 15 bytes. */ 692 /* x86 instructions are limited to 15 bytes. */
685 if (eip + size - ctxt->eip > 15) 693 if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
686 return X86EMUL_UNHANDLEABLE; 694 return X86EMUL_UNHANDLEABLE;
687 while (size--) { 695 while (size--) {
688 rc = do_insn_fetch_byte(ctxt, eip++, dest++); 696 rc = do_insn_fetch_byte(ctxt, dest++);
689 if (rc != X86EMUL_CONTINUE) 697 if (rc != X86EMUL_CONTINUE)
690 return rc; 698 return rc;
691 } 699 }
@@ -693,20 +701,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
693} 701}
694 702
695/* Fetch next part of the instruction being emulated. */ 703/* Fetch next part of the instruction being emulated. */
696#define insn_fetch(_type, _size, _eip) \ 704#define insn_fetch(_type, _ctxt) \
697({ unsigned long _x; \ 705({ unsigned long _x; \
698 rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ 706 rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
699 if (rc != X86EMUL_CONTINUE) \ 707 if (rc != X86EMUL_CONTINUE) \
700 goto done; \ 708 goto done; \
701 (_eip) += (_size); \
702 (_type)_x; \ 709 (_type)_x; \
703}) 710})
704 711
705#define insn_fetch_arr(_arr, _size, _eip) \ 712#define insn_fetch_arr(_arr, _size, _ctxt) \
706({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ 713({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
707 if (rc != X86EMUL_CONTINUE) \ 714 if (rc != X86EMUL_CONTINUE) \
708 goto done; \ 715 goto done; \
709 (_eip) += (_size); \
710}) 716})
711 717
712/* 718/*
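do_insn_fetch_byte() now reads and advances ctxt->_eip itself and prefetches the rest of the instruction bytes (never past the 15-byte architectural limit or across a page boundary), so insn_fetch() can derive the size from the requested type alone. A simplified stand-alone model of the fetch cache; the page-boundary handling is omitted and all names are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct toy_fetch_cache {
	uint8_t  data[15];	/* x86 instructions are at most 15 bytes */
	uint64_t start;		/* guest address of data[0] */
	uint64_t end;		/* first address not yet cached */
};

/* Stand-in for ctxt->ops->fetch(): copy from a flat buffer. */
static void guest_fetch(const uint8_t *guest, uint64_t addr,
			uint8_t *dst, size_t len)
{
	memcpy(dst, guest + addr, len);
}

static int fetch_byte(struct toy_fetch_cache *fc, const uint8_t *guest,
		      uint64_t *eip, uint8_t *dest)
{
	if (*eip == fc->end) {
		size_t cur = fc->end - fc->start;
		size_t size = sizeof(fc->data) - cur;

		if (!size)
			return -1;	/* would exceed the 15-byte limit */
		guest_fetch(guest, fc->end, fc->data + cur, size);
		fc->end += size;
	}
	*dest = fc->data[*eip - fc->start];
	(*eip)++;
	return 0;
}

int main(void)
{
	const uint8_t code[16] = { 0x48, 0x01, 0xd8 };	/* add %rbx,%rax + pad */
	struct toy_fetch_cache fc = { .start = 0, .end = 0 };
	uint64_t eip = 0;
	uint8_t b;
	int i;

	for (i = 0; i < 3; i++)		/* each call hands out one byte */
		if (fetch_byte(&fc, code, &eip, &b))
			return 1;
	return 0;
}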
@@ -894,7 +900,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
894 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ 900 ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
895 } 901 }
896 902
897 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 903 ctxt->modrm = insn_fetch(u8, ctxt);
898 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; 904 ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
899 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; 905 ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
900 ctxt->modrm_rm |= (ctxt->modrm & 0x07); 906 ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -928,13 +934,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
928 switch (ctxt->modrm_mod) { 934 switch (ctxt->modrm_mod) {
929 case 0: 935 case 0:
930 if (ctxt->modrm_rm == 6) 936 if (ctxt->modrm_rm == 6)
931 modrm_ea += insn_fetch(u16, 2, ctxt->_eip); 937 modrm_ea += insn_fetch(u16, ctxt);
932 break; 938 break;
933 case 1: 939 case 1:
934 modrm_ea += insn_fetch(s8, 1, ctxt->_eip); 940 modrm_ea += insn_fetch(s8, ctxt);
935 break; 941 break;
936 case 2: 942 case 2:
937 modrm_ea += insn_fetch(u16, 2, ctxt->_eip); 943 modrm_ea += insn_fetch(u16, ctxt);
938 break; 944 break;
939 } 945 }
940 switch (ctxt->modrm_rm) { 946 switch (ctxt->modrm_rm) {
@@ -971,13 +977,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
971 } else { 977 } else {
972 /* 32/64-bit ModR/M decode. */ 978 /* 32/64-bit ModR/M decode. */
973 if ((ctxt->modrm_rm & 7) == 4) { 979 if ((ctxt->modrm_rm & 7) == 4) {
974 sib = insn_fetch(u8, 1, ctxt->_eip); 980 sib = insn_fetch(u8, ctxt);
975 index_reg |= (sib >> 3) & 7; 981 index_reg |= (sib >> 3) & 7;
976 base_reg |= sib & 7; 982 base_reg |= sib & 7;
977 scale = sib >> 6; 983 scale = sib >> 6;
978 984
979 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 985 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
980 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 986 modrm_ea += insn_fetch(s32, ctxt);
981 else 987 else
982 modrm_ea += ctxt->regs[base_reg]; 988 modrm_ea += ctxt->regs[base_reg];
983 if (index_reg != 4) 989 if (index_reg != 4)
@@ -990,13 +996,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
990 switch (ctxt->modrm_mod) { 996 switch (ctxt->modrm_mod) {
991 case 0: 997 case 0:
992 if (ctxt->modrm_rm == 5) 998 if (ctxt->modrm_rm == 5)
993 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 999 modrm_ea += insn_fetch(s32, ctxt);
994 break; 1000 break;
995 case 1: 1001 case 1:
996 modrm_ea += insn_fetch(s8, 1, ctxt->_eip); 1002 modrm_ea += insn_fetch(s8, ctxt);
997 break; 1003 break;
998 case 2: 1004 case 2:
999 modrm_ea += insn_fetch(s32, 4, ctxt->_eip); 1005 modrm_ea += insn_fetch(s32, ctxt);
1000 break; 1006 break;
1001 } 1007 }
1002 } 1008 }
@@ -1013,13 +1019,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
1013 op->type = OP_MEM; 1019 op->type = OP_MEM;
1014 switch (ctxt->ad_bytes) { 1020 switch (ctxt->ad_bytes) {
1015 case 2: 1021 case 2:
1016 op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); 1022 op->addr.mem.ea = insn_fetch(u16, ctxt);
1017 break; 1023 break;
1018 case 4: 1024 case 4:
1019 op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); 1025 op->addr.mem.ea = insn_fetch(u32, ctxt);
1020 break; 1026 break;
1021 case 8: 1027 case 8:
1022 op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); 1028 op->addr.mem.ea = insn_fetch(u64, ctxt);
1023 break; 1029 break;
1024 } 1030 }
1025done: 1031done:
@@ -1452,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
1452 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); 1458 return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
1453} 1459}
1454 1460
1455static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1461static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
1456{ 1462{
1463 int seg = ctxt->src2.val;
1464
1457 ctxt->src.val = get_segment_selector(ctxt, seg); 1465 ctxt->src.val = get_segment_selector(ctxt, seg);
1458 1466
1459 return em_push(ctxt); 1467 return em_push(ctxt);
1460} 1468}
1461 1469
1462static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1470static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
1463{ 1471{
1472 int seg = ctxt->src2.val;
1464 unsigned long selector; 1473 unsigned long selector;
1465 int rc; 1474 int rc;
1466 1475
@@ -1674,64 +1683,74 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
1674{ 1683{
1675 switch (ctxt->modrm_reg) { 1684 switch (ctxt->modrm_reg) {
1676 case 0: /* rol */ 1685 case 0: /* rol */
1677 emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); 1686 emulate_2op_SrcB(ctxt, "rol");
1678 break; 1687 break;
1679 case 1: /* ror */ 1688 case 1: /* ror */
1680 emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); 1689 emulate_2op_SrcB(ctxt, "ror");
1681 break; 1690 break;
1682 case 2: /* rcl */ 1691 case 2: /* rcl */
1683 emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); 1692 emulate_2op_SrcB(ctxt, "rcl");
1684 break; 1693 break;
1685 case 3: /* rcr */ 1694 case 3: /* rcr */
1686 emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); 1695 emulate_2op_SrcB(ctxt, "rcr");
1687 break; 1696 break;
1688 case 4: /* sal/shl */ 1697 case 4: /* sal/shl */
1689 case 6: /* sal/shl */ 1698 case 6: /* sal/shl */
1690 emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); 1699 emulate_2op_SrcB(ctxt, "sal");
1691 break; 1700 break;
1692 case 5: /* shr */ 1701 case 5: /* shr */
1693 emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); 1702 emulate_2op_SrcB(ctxt, "shr");
1694 break; 1703 break;
1695 case 7: /* sar */ 1704 case 7: /* sar */
1696 emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); 1705 emulate_2op_SrcB(ctxt, "sar");
1697 break; 1706 break;
1698 } 1707 }
1699 return X86EMUL_CONTINUE; 1708 return X86EMUL_CONTINUE;
1700} 1709}
1701 1710
1702static int em_grp3(struct x86_emulate_ctxt *ctxt) 1711static int em_not(struct x86_emulate_ctxt *ctxt)
1712{
1713 ctxt->dst.val = ~ctxt->dst.val;
1714 return X86EMUL_CONTINUE;
1715}
1716
1717static int em_neg(struct x86_emulate_ctxt *ctxt)
1718{
1719 emulate_1op(ctxt, "neg");
1720 return X86EMUL_CONTINUE;
1721}
1722
1723static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
1724{
1725 u8 ex = 0;
1726
1727 emulate_1op_rax_rdx(ctxt, "mul", ex);
1728 return X86EMUL_CONTINUE;
1729}
1730
1731static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
1732{
1733 u8 ex = 0;
1734
1735 emulate_1op_rax_rdx(ctxt, "imul", ex);
1736 return X86EMUL_CONTINUE;
1737}
1738
1739static int em_div_ex(struct x86_emulate_ctxt *ctxt)
1703{ 1740{
1704 unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
1705 unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
1706 u8 de = 0; 1741 u8 de = 0;
1707 1742
1708 switch (ctxt->modrm_reg) { 1743 emulate_1op_rax_rdx(ctxt, "div", de);
1709 case 0 ... 1: /* test */ 1744 if (de)
1710 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); 1745 return emulate_de(ctxt);
1711 break; 1746 return X86EMUL_CONTINUE;
1712 case 2: /* not */ 1747}
1713 ctxt->dst.val = ~ctxt->dst.val; 1748
1714 break; 1749static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
1715 case 3: /* neg */ 1750{
1716 emulate_1op("neg", ctxt->dst, ctxt->eflags); 1751 u8 de = 0;
1717 break; 1752
1718 case 4: /* mul */ 1753 emulate_1op_rax_rdx(ctxt, "idiv", de);
1719 emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
1720 break;
1721 case 5: /* imul */
1722 emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
1723 break;
1724 case 6: /* div */
1725 emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
1726 ctxt->eflags, de);
1727 break;
1728 case 7: /* idiv */
1729 emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
1730 ctxt->eflags, de);
1731 break;
1732 default:
1733 return X86EMUL_UNHANDLEABLE;
1734 }
1735 if (de) 1754 if (de)
1736 return emulate_de(ctxt); 1755 return emulate_de(ctxt);
1737 return X86EMUL_CONTINUE; 1756 return X86EMUL_CONTINUE;
@@ -1743,10 +1762,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
1743 1762
1744 switch (ctxt->modrm_reg) { 1763 switch (ctxt->modrm_reg) {
1745 case 0: /* inc */ 1764 case 0: /* inc */
1746 emulate_1op("inc", ctxt->dst, ctxt->eflags); 1765 emulate_1op(ctxt, "inc");
1747 break; 1766 break;
1748 case 1: /* dec */ 1767 case 1: /* dec */
1749 emulate_1op("dec", ctxt->dst, ctxt->eflags); 1768 emulate_1op(ctxt, "dec");
1750 break; 1769 break;
1751 case 2: /* call near abs */ { 1770 case 2: /* call near abs */ {
1752 long int old_eip; 1771 long int old_eip;
@@ -1812,8 +1831,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
1812 return rc; 1831 return rc;
1813} 1832}
1814 1833
1815static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) 1834static int em_lseg(struct x86_emulate_ctxt *ctxt)
1816{ 1835{
1836 int seg = ctxt->src2.val;
1817 unsigned short sel; 1837 unsigned short sel;
1818 int rc; 1838 int rc;
1819 1839
@@ -2452,7 +2472,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2452 ctxt->src.type = OP_IMM; 2472 ctxt->src.type = OP_IMM;
2453 ctxt->src.val = 0; 2473 ctxt->src.val = 0;
2454 ctxt->src.bytes = 1; 2474 ctxt->src.bytes = 1;
2455 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); 2475 emulate_2op_SrcV(ctxt, "or");
2456 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2476 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2457 if (cf) 2477 if (cf)
2458 ctxt->eflags |= X86_EFLAGS_CF; 2478 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2502,49 +2522,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2502 2522
2503static int em_add(struct x86_emulate_ctxt *ctxt) 2523static int em_add(struct x86_emulate_ctxt *ctxt)
2504{ 2524{
2505 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); 2525 emulate_2op_SrcV(ctxt, "add");
2506 return X86EMUL_CONTINUE; 2526 return X86EMUL_CONTINUE;
2507} 2527}
2508 2528
2509static int em_or(struct x86_emulate_ctxt *ctxt) 2529static int em_or(struct x86_emulate_ctxt *ctxt)
2510{ 2530{
2511 emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); 2531 emulate_2op_SrcV(ctxt, "or");
2512 return X86EMUL_CONTINUE; 2532 return X86EMUL_CONTINUE;
2513} 2533}
2514 2534
2515static int em_adc(struct x86_emulate_ctxt *ctxt) 2535static int em_adc(struct x86_emulate_ctxt *ctxt)
2516{ 2536{
2517 emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); 2537 emulate_2op_SrcV(ctxt, "adc");
2518 return X86EMUL_CONTINUE; 2538 return X86EMUL_CONTINUE;
2519} 2539}
2520 2540
2521static int em_sbb(struct x86_emulate_ctxt *ctxt) 2541static int em_sbb(struct x86_emulate_ctxt *ctxt)
2522{ 2542{
2523 emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); 2543 emulate_2op_SrcV(ctxt, "sbb");
2524 return X86EMUL_CONTINUE; 2544 return X86EMUL_CONTINUE;
2525} 2545}
2526 2546
2527static int em_and(struct x86_emulate_ctxt *ctxt) 2547static int em_and(struct x86_emulate_ctxt *ctxt)
2528{ 2548{
2529 emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); 2549 emulate_2op_SrcV(ctxt, "and");
2530 return X86EMUL_CONTINUE; 2550 return X86EMUL_CONTINUE;
2531} 2551}
2532 2552
2533static int em_sub(struct x86_emulate_ctxt *ctxt) 2553static int em_sub(struct x86_emulate_ctxt *ctxt)
2534{ 2554{
2535 emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); 2555 emulate_2op_SrcV(ctxt, "sub");
2536 return X86EMUL_CONTINUE; 2556 return X86EMUL_CONTINUE;
2537} 2557}
2538 2558
2539static int em_xor(struct x86_emulate_ctxt *ctxt) 2559static int em_xor(struct x86_emulate_ctxt *ctxt)
2540{ 2560{
2541 emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); 2561 emulate_2op_SrcV(ctxt, "xor");
2542 return X86EMUL_CONTINUE; 2562 return X86EMUL_CONTINUE;
2543} 2563}
2544 2564
2545static int em_cmp(struct x86_emulate_ctxt *ctxt) 2565static int em_cmp(struct x86_emulate_ctxt *ctxt)
2546{ 2566{
2547 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); 2567 emulate_2op_SrcV(ctxt, "cmp");
2548 /* Disable writeback. */ 2568 /* Disable writeback. */
2549 ctxt->dst.type = OP_NONE; 2569 ctxt->dst.type = OP_NONE;
2550 return X86EMUL_CONTINUE; 2570 return X86EMUL_CONTINUE;
@@ -2552,7 +2572,9 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt)
2552 2572
2553static int em_test(struct x86_emulate_ctxt *ctxt) 2573static int em_test(struct x86_emulate_ctxt *ctxt)
2554{ 2574{
2555 emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); 2575 emulate_2op_SrcV(ctxt, "test");
2576 /* Disable writeback. */
2577 ctxt->dst.type = OP_NONE;
2556 return X86EMUL_CONTINUE; 2578 return X86EMUL_CONTINUE;
2557} 2579}
2558 2580
@@ -2570,7 +2592,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
2570 2592
2571static int em_imul(struct x86_emulate_ctxt *ctxt) 2593static int em_imul(struct x86_emulate_ctxt *ctxt)
2572{ 2594{
2573 emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); 2595 emulate_2op_SrcV_nobyte(ctxt, "imul");
2574 return X86EMUL_CONTINUE; 2596 return X86EMUL_CONTINUE;
2575} 2597}
2576 2598
@@ -3025,9 +3047,14 @@ static struct opcode group1A[] = {
3025}; 3047};
3026 3048
3027static struct opcode group3[] = { 3049static struct opcode group3[] = {
3028 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), 3050 I(DstMem | SrcImm | ModRM, em_test),
3029 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), 3051 I(DstMem | SrcImm | ModRM, em_test),
3030 X4(D(SrcMem | ModRM)), 3052 I(DstMem | SrcNone | ModRM | Lock, em_not),
3053 I(DstMem | SrcNone | ModRM | Lock, em_neg),
3054 I(SrcMem | ModRM, em_mul_ex),
3055 I(SrcMem | ModRM, em_imul_ex),
3056 I(SrcMem | ModRM, em_div_ex),
3057 I(SrcMem | ModRM, em_idiv_ex),
3031}; 3058};
3032 3059
3033static struct opcode group4[] = { 3060static struct opcode group4[] = {
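group3 switches from bare D() descriptors to I() entries with one execute callback per ModRM /r variant (em_test, em_not, em_neg, em_mul_ex and friends, introduced in the hunks above), so the big switch in the old em_grp3() can go away. The selection key is the ModRM reg field, as in this toy model of group decoding; the handler bodies here are placeholders, not the emulator's.

#include <stdint.h>

typedef int (*handler_t)(void);

static int em_test(void) { return 0; }
static int em_not(void)  { return 1; }
static int em_neg(void)  { return 2; }
static int em_mul(void)  { return 3; }
static int em_imul(void) { return 4; }
static int em_div(void)  { return 5; }
static int em_idiv(void) { return 6; }

/* Sub-table indexed by ModRM bits 5:3, mirroring the group3 layout. */
static handler_t group3[8] = {
	em_test, em_test, em_not, em_neg, em_mul, em_imul, em_div, em_idiv,
};

static handler_t decode_group3(uint8_t modrm)
{
	return group3[(modrm >> 3) & 7];
}

int main(void)
{
	/* f7 /3 is "neg r/m32"; reg field 3 selects em_neg. */
	return decode_group3(0xd8)() == 2 ? 0 : 1;
}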
@@ -3090,16 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = {
3090static struct opcode opcode_table[256] = { 3117static struct opcode opcode_table[256] = {
3091 /* 0x00 - 0x07 */ 3118 /* 0x00 - 0x07 */
3092 I6ALU(Lock, em_add), 3119 I6ALU(Lock, em_add),
3093 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3120 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3121 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3094 /* 0x08 - 0x0F */ 3122 /* 0x08 - 0x0F */
3095 I6ALU(Lock, em_or), 3123 I6ALU(Lock, em_or),
3096 D(ImplicitOps | Stack | No64), N, 3124 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3125 N,
3097 /* 0x10 - 0x17 */ 3126 /* 0x10 - 0x17 */
3098 I6ALU(Lock, em_adc), 3127 I6ALU(Lock, em_adc),
3099 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3128 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
3129 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
3100 /* 0x18 - 0x1F */ 3130 /* 0x18 - 0x1F */
3101 I6ALU(Lock, em_sbb), 3131 I6ALU(Lock, em_sbb),
3102 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3132 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3133 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3103 /* 0x20 - 0x27 */ 3134 /* 0x20 - 0x27 */
3104 I6ALU(Lock, em_and), N, N, 3135 I6ALU(Lock, em_and), N, N,
3105 /* 0x28 - 0x2F */ 3136 /* 0x28 - 0x2F */
@@ -3167,7 +3198,8 @@ static struct opcode opcode_table[256] = {
3167 D2bv(DstMem | SrcImmByte | ModRM), 3198 D2bv(DstMem | SrcImmByte | ModRM),
3168 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3199 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3169 I(ImplicitOps | Stack, em_ret), 3200 I(ImplicitOps | Stack, em_ret),
3170 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), 3201 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
3202 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
3171 G(ByteOp, group11), G(0, group11), 3203 G(ByteOp, group11), G(0, group11),
3172 /* 0xC8 - 0xCF */ 3204 /* 0xC8 - 0xCF */
3173 N, N, N, I(ImplicitOps | Stack, em_ret_far), 3205 N, N, N, I(ImplicitOps | Stack, em_ret_far),
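push/pop of ES/CS/SS/DS here (and FS/GS in the two-byte table below) now share the em_push_sreg/em_pop_sreg handlers: the segment index travels as the Src2 operand (Src2ES .. Src2GS), which the generic decode turns into ctxt->src2.val, so no per-opcode cases remain in x86_emulate_insn(). A toy version of that table-driven pattern, with illustrative names; em_pop_sreg would be wired up the same way, differing only in the callback.

#include <stdio.h>

enum seg { SEG_ES, SEG_CS, SEG_SS, SEG_DS };

struct toy_ctxt {
	enum seg src2;		/* filled in by generic operand decode */
};

struct toy_opcode {
	enum seg src2;
	int (*execute)(struct toy_ctxt *ctxt);
};

/* One handler serves every "push sreg" opcode. */
static int em_push_sreg(struct toy_ctxt *ctxt)
{
	printf("push sreg %d\n", ctxt->src2);
	return 0;
}

static const struct toy_opcode table[] = {
	[0x06] = { SEG_ES, em_push_sreg },	/* push %es */
	[0x0e] = { SEG_CS, em_push_sreg },	/* push %cs */
	[0x16] = { SEG_SS, em_push_sreg },	/* push %ss */
	[0x1e] = { SEG_DS, em_push_sreg },	/* push %ds */
};

static int dispatch(unsigned char opcode)
{
	struct toy_ctxt ctxt = { .src2 = table[opcode].src2 };

	return table[opcode].execute(&ctxt);
}

int main(void)
{
	return dispatch(0x16);		/* prints "push sreg 2" */
}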
@@ -3242,20 +3274,22 @@ static struct opcode twobyte_table[256] = {
3242 /* 0x90 - 0x9F */ 3274 /* 0x90 - 0x9F */
3243 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3275 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3244 /* 0xA0 - 0xA7 */ 3276 /* 0xA0 - 0xA7 */
3245 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3277 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3246 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), 3278 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
3247 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3279 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3248 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3280 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3249 /* 0xA8 - 0xAF */ 3281 /* 0xA8 - 0xAF */
3250 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3282 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3251 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3283 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3252 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3284 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3253 D(DstMem | SrcReg | Src2CL | ModRM), 3285 D(DstMem | SrcReg | Src2CL | ModRM),
3254 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 3286 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
3255 /* 0xB0 - 0xB7 */ 3287 /* 0xB0 - 0xB7 */
3256 D2bv(DstMem | SrcReg | ModRM | Lock), 3288 D2bv(DstMem | SrcReg | ModRM | Lock),
3257 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), 3289 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3258 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), 3290 D(DstMem | SrcReg | ModRM | BitOp | Lock),
3291 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3292 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3259 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 3293 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3260 /* 0xB8 - 0xBF */ 3294 /* 0xB8 - 0xBF */
3261 N, N, 3295 N, N,
@@ -3309,13 +3343,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3309 /* NB. Immediates are sign-extended as necessary. */ 3343 /* NB. Immediates are sign-extended as necessary. */
3310 switch (op->bytes) { 3344 switch (op->bytes) {
3311 case 1: 3345 case 1:
3312 op->val = insn_fetch(s8, 1, ctxt->_eip); 3346 op->val = insn_fetch(s8, ctxt);
3313 break; 3347 break;
3314 case 2: 3348 case 2:
3315 op->val = insn_fetch(s16, 2, ctxt->_eip); 3349 op->val = insn_fetch(s16, ctxt);
3316 break; 3350 break;
3317 case 4: 3351 case 4:
3318 op->val = insn_fetch(s32, 4, ctxt->_eip); 3352 op->val = insn_fetch(s32, ctxt);
3319 break; 3353 break;
3320 } 3354 }
3321 if (!sign_extension) { 3355 if (!sign_extension) {
@@ -3335,6 +3369,125 @@ done:
3335 return rc; 3369 return rc;
3336} 3370}
3337 3371
3372static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3373 unsigned d)
3374{
3375 int rc = X86EMUL_CONTINUE;
3376
3377 switch (d) {
3378 case OpReg:
3379 decode_register_operand(ctxt, op,
3380 op == &ctxt->dst &&
3381 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3382 break;
3383 case OpImmUByte:
3384 rc = decode_imm(ctxt, op, 1, false);
3385 break;
3386 case OpMem:
3387 ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3388 mem_common:
3389 *op = ctxt->memop;
3390 ctxt->memopp = op;
3391 if ((ctxt->d & BitOp) && op == &ctxt->dst)
3392 fetch_bit_operand(ctxt);
3393 op->orig_val = op->val;
3394 break;
3395 case OpMem64:
3396 ctxt->memop.bytes = 8;
3397 goto mem_common;
3398 case OpAcc:
3399 op->type = OP_REG;
3400 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3401 op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3402 fetch_register_operand(op);
3403 op->orig_val = op->val;
3404 break;
3405 case OpDI:
3406 op->type = OP_MEM;
3407 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3408 op->addr.mem.ea =
3409 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3410 op->addr.mem.seg = VCPU_SREG_ES;
3411 op->val = 0;
3412 break;
3413 case OpDX:
3414 op->type = OP_REG;
3415 op->bytes = 2;
3416 op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3417 fetch_register_operand(op);
3418 break;
3419 case OpCL:
3420 op->bytes = 1;
3421 op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
3422 break;
3423 case OpImmByte:
3424 rc = decode_imm(ctxt, op, 1, true);
3425 break;
3426 case OpOne:
3427 op->bytes = 1;
3428 op->val = 1;
3429 break;
3430 case OpImm:
3431 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
3432 break;
3433 case OpMem16:
3434 ctxt->memop.bytes = 2;
3435 goto mem_common;
3436 case OpMem32:
3437 ctxt->memop.bytes = 4;
3438 goto mem_common;
3439 case OpImmU16:
3440 rc = decode_imm(ctxt, op, 2, false);
3441 break;
3442 case OpImmU:
3443 rc = decode_imm(ctxt, op, imm_size(ctxt), false);
3444 break;
3445 case OpSI:
3446 op->type = OP_MEM;
3447 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3448 op->addr.mem.ea =
3449 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3450 op->addr.mem.seg = seg_override(ctxt);
3451 op->val = 0;
3452 break;
3453 case OpImmFAddr:
3454 op->type = OP_IMM;
3455 op->addr.mem.ea = ctxt->_eip;
3456 op->bytes = ctxt->op_bytes + 2;
3457 insn_fetch_arr(op->valptr, op->bytes, ctxt);
3458 break;
3459 case OpMemFAddr:
3460 ctxt->memop.bytes = ctxt->op_bytes + 2;
3461 goto mem_common;
3462 case OpES:
3463 op->val = VCPU_SREG_ES;
3464 break;
3465 case OpCS:
3466 op->val = VCPU_SREG_CS;
3467 break;
3468 case OpSS:
3469 op->val = VCPU_SREG_SS;
3470 break;
3471 case OpDS:
3472 op->val = VCPU_SREG_DS;
3473 break;
3474 case OpFS:
3475 op->val = VCPU_SREG_FS;
3476 break;
3477 case OpGS:
3478 op->val = VCPU_SREG_GS;
3479 break;
3480 case OpImplicit:
3481 /* Special instructions do their own operand decoding. */
3482 default:
3483 op->type = OP_NONE; /* Disable writeback. */
3484 break;
3485 }
3486
3487done:
3488 return rc;
3489}
3490
3338int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) 3491int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3339{ 3492{
3340 int rc = X86EMUL_CONTINUE; 3493 int rc = X86EMUL_CONTINUE;
@@ -3342,8 +3495,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3342 int def_op_bytes, def_ad_bytes, goffset, simd_prefix; 3495 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
3343 bool op_prefix = false; 3496 bool op_prefix = false;
3344 struct opcode opcode; 3497 struct opcode opcode;
3345 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3346 3498
3499 ctxt->memop.type = OP_NONE;
3500 ctxt->memopp = NULL;
3347 ctxt->_eip = ctxt->eip; 3501 ctxt->_eip = ctxt->eip;
3348 ctxt->fetch.start = ctxt->_eip; 3502 ctxt->fetch.start = ctxt->_eip;
3349 ctxt->fetch.end = ctxt->fetch.start + insn_len; 3503 ctxt->fetch.end = ctxt->fetch.start + insn_len;
@@ -3366,7 +3520,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3366 break; 3520 break;
3367#endif 3521#endif
3368 default: 3522 default:
3369 return -1; 3523 return EMULATION_FAILED;
3370 } 3524 }
3371 3525
3372 ctxt->op_bytes = def_op_bytes; 3526 ctxt->op_bytes = def_op_bytes;
@@ -3374,7 +3528,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
3374 3528
3375 /* Legacy prefixes. */ 3529 /* Legacy prefixes. */
3376 for (;;) { 3530 for (;;) {
3377 switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { 3531 switch (ctxt->b = insn_fetch(u8, ctxt)) {
3378 case 0x66: /* operand-size override */ 3532 case 0x66: /* operand-size override */
3379 op_prefix = true; 3533 op_prefix = true;
3380 /* switch between 2/4 bytes */ 3534 /* switch between 2/4 bytes */
@@ -3430,7 +3584,7 @@ done_prefixes:
3430 /* Two-byte opcode? */ 3584 /* Two-byte opcode? */
3431 if (ctxt->b == 0x0f) { 3585 if (ctxt->b == 0x0f) {
3432 ctxt->twobyte = 1; 3586 ctxt->twobyte = 1;
3433 ctxt->b = insn_fetch(u8, 1, ctxt->_eip); 3587 ctxt->b = insn_fetch(u8, ctxt);
3434 opcode = twobyte_table[ctxt->b]; 3588 opcode = twobyte_table[ctxt->b];
3435 } 3589 }
3436 ctxt->d = opcode.flags; 3590 ctxt->d = opcode.flags;
@@ -3438,13 +3592,13 @@ done_prefixes:
3438 while (ctxt->d & GroupMask) { 3592 while (ctxt->d & GroupMask) {
3439 switch (ctxt->d & GroupMask) { 3593 switch (ctxt->d & GroupMask) {
3440 case Group: 3594 case Group:
3441 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 3595 ctxt->modrm = insn_fetch(u8, ctxt);
3442 --ctxt->_eip; 3596 --ctxt->_eip;
3443 goffset = (ctxt->modrm >> 3) & 7; 3597 goffset = (ctxt->modrm >> 3) & 7;
3444 opcode = opcode.u.group[goffset]; 3598 opcode = opcode.u.group[goffset];
3445 break; 3599 break;
3446 case GroupDual: 3600 case GroupDual:
3447 ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); 3601 ctxt->modrm = insn_fetch(u8, ctxt);
3448 --ctxt->_eip; 3602 --ctxt->_eip;
3449 goffset = (ctxt->modrm >> 3) & 7; 3603 goffset = (ctxt->modrm >> 3) & 7;
3450 if ((ctxt->modrm >> 6) == 3) 3604 if ((ctxt->modrm >> 6) == 3)
@@ -3458,7 +3612,7 @@ done_prefixes:
3458 break; 3612 break;
3459 case Prefix: 3613 case Prefix:
3460 if (ctxt->rep_prefix && op_prefix) 3614 if (ctxt->rep_prefix && op_prefix)
3461 return X86EMUL_UNHANDLEABLE; 3615 return EMULATION_FAILED;
3462 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; 3616 simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
3463 switch (simd_prefix) { 3617 switch (simd_prefix) {
3464 case 0x00: opcode = opcode.u.gprefix->pfx_no; break; 3618 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
@@ -3468,10 +3622,10 @@ done_prefixes:
3468 } 3622 }
3469 break; 3623 break;
3470 default: 3624 default:
3471 return X86EMUL_UNHANDLEABLE; 3625 return EMULATION_FAILED;
3472 } 3626 }
3473 3627
3474 ctxt->d &= ~GroupMask; 3628 ctxt->d &= ~(u64)GroupMask;
3475 ctxt->d |= opcode.flags; 3629 ctxt->d |= opcode.flags;
3476 } 3630 }
3477 3631
@@ -3481,10 +3635,10 @@ done_prefixes:
3481 3635
3482 /* Unrecognised? */ 3636 /* Unrecognised? */
3483 if (ctxt->d == 0 || (ctxt->d & Undefined)) 3637 if (ctxt->d == 0 || (ctxt->d & Undefined))
3484 return -1; 3638 return EMULATION_FAILED;
3485 3639
3486 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 3640 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3487 return -1; 3641 return EMULATION_FAILED;
3488 3642
3489 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) 3643 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
3490 ctxt->op_bytes = 8; 3644 ctxt->op_bytes = 8;
@@ -3501,96 +3655,27 @@ done_prefixes:
3501 3655
3502 /* ModRM and SIB bytes. */ 3656 /* ModRM and SIB bytes. */
3503 if (ctxt->d & ModRM) { 3657 if (ctxt->d & ModRM) {
3504 rc = decode_modrm(ctxt, &memop); 3658 rc = decode_modrm(ctxt, &ctxt->memop);
3505 if (!ctxt->has_seg_override) 3659 if (!ctxt->has_seg_override)
3506 set_seg_override(ctxt, ctxt->modrm_seg); 3660 set_seg_override(ctxt, ctxt->modrm_seg);
3507 } else if (ctxt->d & MemAbs) 3661 } else if (ctxt->d & MemAbs)
3508 rc = decode_abs(ctxt, &memop); 3662 rc = decode_abs(ctxt, &ctxt->memop);
3509 if (rc != X86EMUL_CONTINUE) 3663 if (rc != X86EMUL_CONTINUE)
3510 goto done; 3664 goto done;
3511 3665
3512 if (!ctxt->has_seg_override) 3666 if (!ctxt->has_seg_override)
3513 set_seg_override(ctxt, VCPU_SREG_DS); 3667 set_seg_override(ctxt, VCPU_SREG_DS);
3514 3668
3515 memop.addr.mem.seg = seg_override(ctxt); 3669 ctxt->memop.addr.mem.seg = seg_override(ctxt);
3516 3670
3517 if (memop.type == OP_MEM && ctxt->ad_bytes != 8) 3671 if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
3518 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3672 ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
3519 3673
3520 /* 3674 /*
3521 * Decode and fetch the source operand: register, memory 3675 * Decode and fetch the source operand: register, memory
3522 * or immediate. 3676 * or immediate.
3523 */ 3677 */
3524 switch (ctxt->d & SrcMask) { 3678 rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
3525 case SrcNone:
3526 break;
3527 case SrcReg:
3528 decode_register_operand(ctxt, &ctxt->src, 0);
3529 break;
3530 case SrcMem16:
3531 memop.bytes = 2;
3532 goto srcmem_common;
3533 case SrcMem32:
3534 memop.bytes = 4;
3535 goto srcmem_common;
3536 case SrcMem:
3537 memop.bytes = (ctxt->d & ByteOp) ? 1 :
3538 ctxt->op_bytes;
3539 srcmem_common:
3540 ctxt->src = memop;
3541 memopp = &ctxt->src;
3542 break;
3543 case SrcImmU16:
3544 rc = decode_imm(ctxt, &ctxt->src, 2, false);
3545 break;
3546 case SrcImm:
3547 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
3548 break;
3549 case SrcImmU:
3550 rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
3551 break;
3552 case SrcImmByte:
3553 rc = decode_imm(ctxt, &ctxt->src, 1, true);
3554 break;
3555 case SrcImmUByte:
3556 rc = decode_imm(ctxt, &ctxt->src, 1, false);
3557 break;
3558 case SrcAcc:
3559 ctxt->src.type = OP_REG;
3560 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3561 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3562 fetch_register_operand(&ctxt->src);
3563 break;
3564 case SrcOne:
3565 ctxt->src.bytes = 1;
3566 ctxt->src.val = 1;
3567 break;
3568 case SrcSI:
3569 ctxt->src.type = OP_MEM;
3570 ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3571 ctxt->src.addr.mem.ea =
3572 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
3573 ctxt->src.addr.mem.seg = seg_override(ctxt);
3574 ctxt->src.val = 0;
3575 break;
3576 case SrcImmFAddr:
3577 ctxt->src.type = OP_IMM;
3578 ctxt->src.addr.mem.ea = ctxt->_eip;
3579 ctxt->src.bytes = ctxt->op_bytes + 2;
3580 insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
3581 break;
3582 case SrcMemFAddr:
3583 memop.bytes = ctxt->op_bytes + 2;
3584 goto srcmem_common;
3585 break;
3586 case SrcDX:
3587 ctxt->src.type = OP_REG;
3588 ctxt->src.bytes = 2;
3589 ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3590 fetch_register_operand(&ctxt->src);
3591 break;
3592 }
3593
3594 if (rc != X86EMUL_CONTINUE) 3679 if (rc != X86EMUL_CONTINUE)
3595 goto done; 3680 goto done;
3596 3681
@@ -3598,85 +3683,18 @@ done_prefixes:
3598 * Decode and fetch the second source operand: register, memory 3683 * Decode and fetch the second source operand: register, memory
3599 * or immediate. 3684 * or immediate.
3600 */ 3685 */
3601 switch (ctxt->d & Src2Mask) { 3686 rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
3602 case Src2None:
3603 break;
3604 case Src2CL:
3605 ctxt->src2.bytes = 1;
3606 ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
3607 break;
3608 case Src2ImmByte:
3609 rc = decode_imm(ctxt, &ctxt->src2, 1, true);
3610 break;
3611 case Src2One:
3612 ctxt->src2.bytes = 1;
3613 ctxt->src2.val = 1;
3614 break;
3615 case Src2Imm:
3616 rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
3617 break;
3618 }
3619
3620 if (rc != X86EMUL_CONTINUE) 3687 if (rc != X86EMUL_CONTINUE)
3621 goto done; 3688 goto done;
3622 3689
3623 /* Decode and fetch the destination operand: register or memory. */ 3690 /* Decode and fetch the destination operand: register or memory. */
3624 switch (ctxt->d & DstMask) { 3691 rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
3625 case DstReg:
3626 decode_register_operand(ctxt, &ctxt->dst,
3627 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
3628 break;
3629 case DstImmUByte:
3630 ctxt->dst.type = OP_IMM;
3631 ctxt->dst.addr.mem.ea = ctxt->_eip;
3632 ctxt->dst.bytes = 1;
3633 ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
3634 break;
3635 case DstMem:
3636 case DstMem64:
3637 ctxt->dst = memop;
3638 memopp = &ctxt->dst;
3639 if ((ctxt->d & DstMask) == DstMem64)
3640 ctxt->dst.bytes = 8;
3641 else
3642 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3643 if (ctxt->d & BitOp)
3644 fetch_bit_operand(ctxt);
3645 ctxt->dst.orig_val = ctxt->dst.val;
3646 break;
3647 case DstAcc:
3648 ctxt->dst.type = OP_REG;
3649 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3650 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
3651 fetch_register_operand(&ctxt->dst);
3652 ctxt->dst.orig_val = ctxt->dst.val;
3653 break;
3654 case DstDI:
3655 ctxt->dst.type = OP_MEM;
3656 ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3657 ctxt->dst.addr.mem.ea =
3658 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
3659 ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
3660 ctxt->dst.val = 0;
3661 break;
3662 case DstDX:
3663 ctxt->dst.type = OP_REG;
3664 ctxt->dst.bytes = 2;
3665 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
3666 fetch_register_operand(&ctxt->dst);
3667 break;
3668 case ImplicitOps:
3669 /* Special instructions do their own operand decoding. */
3670 default:
3671 ctxt->dst.type = OP_NONE; /* Disable writeback. */
3672 break;
3673 }
3674 3692
3675done: 3693done:
3676 if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) 3694 if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
3677 memopp->addr.mem.ea += ctxt->_eip; 3695 ctxt->memopp->addr.mem.ea += ctxt->_eip;
3678 3696
3679 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 3697 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
3680} 3698}
3681 3699
3682static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3700static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -3825,32 +3843,11 @@ special_insn:
3825 goto twobyte_insn; 3843 goto twobyte_insn;
3826 3844
3827 switch (ctxt->b) { 3845 switch (ctxt->b) {
3828 case 0x06: /* push es */
3829 rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
3830 break;
3831 case 0x07: /* pop es */
3832 rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
3833 break;
3834 case 0x0e: /* push cs */
3835 rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
3836 break;
3837 case 0x16: /* push ss */
3838 rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
3839 break;
3840 case 0x17: /* pop ss */
3841 rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
3842 break;
3843 case 0x1e: /* push ds */
3844 rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
3845 break;
3846 case 0x1f: /* pop ds */
3847 rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
3848 break;
3849 case 0x40 ... 0x47: /* inc r16/r32 */ 3846 case 0x40 ... 0x47: /* inc r16/r32 */
3850 emulate_1op("inc", ctxt->dst, ctxt->eflags); 3847 emulate_1op(ctxt, "inc");
3851 break; 3848 break;
3852 case 0x48 ... 0x4f: /* dec r16/r32 */ 3849 case 0x48 ... 0x4f: /* dec r16/r32 */
3853 emulate_1op("dec", ctxt->dst, ctxt->eflags); 3850 emulate_1op(ctxt, "dec");
3854 break; 3851 break;
3855 case 0x63: /* movsxd */ 3852 case 0x63: /* movsxd */
3856 if (ctxt->mode != X86EMUL_MODE_PROT64) 3853 if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -3891,12 +3888,6 @@ special_insn:
3891 case 0xc0 ... 0xc1: 3888 case 0xc0 ... 0xc1:
3892 rc = em_grp2(ctxt); 3889 rc = em_grp2(ctxt);
3893 break; 3890 break;
3894 case 0xc4: /* les */
3895 rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
3896 break;
3897 case 0xc5: /* lds */
3898 rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
3899 break;
3900 case 0xcc: /* int3 */ 3891 case 0xcc: /* int3 */
3901 rc = emulate_int(ctxt, 3); 3892 rc = emulate_int(ctxt, 3);
3902 break; 3893 break;
@@ -3953,9 +3944,6 @@ special_insn:
3953 /* complement carry flag from eflags reg */ 3944 /* complement carry flag from eflags reg */
3954 ctxt->eflags ^= EFLG_CF; 3945 ctxt->eflags ^= EFLG_CF;
3955 break; 3946 break;
3956 case 0xf6 ... 0xf7: /* Grp3 */
3957 rc = em_grp3(ctxt);
3958 break;
3959 case 0xf8: /* clc */ 3947 case 0xf8: /* clc */
3960 ctxt->eflags &= ~EFLG_CF; 3948 ctxt->eflags &= ~EFLG_CF;
3961 break; 3949 break;
@@ -4103,36 +4091,24 @@ twobyte_insn:
4103 case 0x90 ... 0x9f: /* setcc r/m8 */ 4091 case 0x90 ... 0x9f: /* setcc r/m8 */
4104 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4092 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4105 break; 4093 break;
4106 case 0xa0: /* push fs */
4107 rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
4108 break;
4109 case 0xa1: /* pop fs */
4110 rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
4111 break;
4112 case 0xa3: 4094 case 0xa3:
4113 bt: /* bt */ 4095 bt: /* bt */
4114 ctxt->dst.type = OP_NONE; 4096 ctxt->dst.type = OP_NONE;
4115 /* only subword offset */ 4097 /* only subword offset */
4116 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; 4098 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
4117 emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); 4099 emulate_2op_SrcV_nobyte(ctxt, "bt");
4118 break; 4100 break;
4119 case 0xa4: /* shld imm8, r, r/m */ 4101 case 0xa4: /* shld imm8, r, r/m */
4120 case 0xa5: /* shld cl, r, r/m */ 4102 case 0xa5: /* shld cl, r, r/m */
4121 emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); 4103 emulate_2op_cl(ctxt, "shld");
4122 break;
4123 case 0xa8: /* push gs */
4124 rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
4125 break;
4126 case 0xa9: /* pop gs */
4127 rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
4128 break; 4104 break;
4129 case 0xab: 4105 case 0xab:
4130 bts: /* bts */ 4106 bts: /* bts */
4131 emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); 4107 emulate_2op_SrcV_nobyte(ctxt, "bts");
4132 break; 4108 break;
4133 case 0xac: /* shrd imm8, r, r/m */ 4109 case 0xac: /* shrd imm8, r, r/m */
4134 case 0xad: /* shrd cl, r, r/m */ 4110 case 0xad: /* shrd cl, r, r/m */
4135 emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); 4111 emulate_2op_cl(ctxt, "shrd");
4136 break; 4112 break;
4137 case 0xae: /* clflush */ 4113 case 0xae: /* clflush */
4138 break; 4114 break;
@@ -4143,7 +4119,7 @@ twobyte_insn:
4143 */ 4119 */
4144 ctxt->src.orig_val = ctxt->src.val; 4120 ctxt->src.orig_val = ctxt->src.val;
4145 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; 4121 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
4146 emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); 4122 emulate_2op_SrcV(ctxt, "cmp");
4147 if (ctxt->eflags & EFLG_ZF) { 4123 if (ctxt->eflags & EFLG_ZF) {
4148 /* Success: write back to memory. */ 4124 /* Success: write back to memory. */
4149 ctxt->dst.val = ctxt->src.orig_val; 4125 ctxt->dst.val = ctxt->src.orig_val;
@@ -4153,18 +4129,9 @@ twobyte_insn:
4153 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; 4129 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
4154 } 4130 }
4155 break; 4131 break;
4156 case 0xb2: /* lss */
4157 rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
4158 break;
4159 case 0xb3: 4132 case 0xb3:
4160 btr: /* btr */ 4133 btr: /* btr */
4161 emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); 4134 emulate_2op_SrcV_nobyte(ctxt, "btr");
4162 break;
4163 case 0xb4: /* lfs */
4164 rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
4165 break;
4166 case 0xb5: /* lgs */
4167 rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
4168 break; 4135 break;
4169 case 0xb6 ... 0xb7: /* movzx */ 4136 case 0xb6 ... 0xb7: /* movzx */
4170 ctxt->dst.bytes = ctxt->op_bytes; 4137 ctxt->dst.bytes = ctxt->op_bytes;
@@ -4185,7 +4152,7 @@ twobyte_insn:
4185 break; 4152 break;
4186 case 0xbb: 4153 case 0xbb:
4187 btc: /* btc */ 4154 btc: /* btc */
4188 emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); 4155 emulate_2op_SrcV_nobyte(ctxt, "btc");
4189 break; 4156 break;
4190 case 0xbc: { /* bsf */ 4157 case 0xbc: { /* bsf */
4191 u8 zf; 4158 u8 zf;
@@ -4217,7 +4184,7 @@ twobyte_insn:
4217 (s16) ctxt->src.val; 4184 (s16) ctxt->src.val;
4218 break; 4185 break;
4219 case 0xc0 ... 0xc1: /* xadd */ 4186 case 0xc0 ... 0xc1: /* xadd */
4220 emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); 4187 emulate_2op_SrcV(ctxt, "add");
4221 /* Write back the register source. */ 4188 /* Write back the register source. */
4222 ctxt->src.val = ctxt->dst.orig_val; 4189 ctxt->src.val = ctxt->dst.orig_val;
4223 write_register_operand(&ctxt->src); 4190 write_register_operand(&ctxt->src);
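
The emulate.c hunks above are part of the move to a generic decode_operand() path: each operand kind collapses to a small struct fill, and the one-off push/pop-sreg and lds/les-style cases disappear from special_insn. As a rough feel for that switch-driven style, here is a minimal user-space sketch; the types, flag names and register indices are invented for illustration and are not the emulator's own.

#include <stdint.h>
#include <stdio.h>

enum op_kind { OP_NONE, OP_REG, OP_MEM };
enum dst_flag { DST_ACC, DST_DI, DST_DX, DST_IMPLICIT };

struct operand {
	enum op_kind type;
	unsigned bytes;
	unsigned reg;		/* register index when type == OP_REG */
	uint64_t mem_ea;	/* effective address when type == OP_MEM */
};

static void decode_dst(struct operand *dst, enum dst_flag d,
		       int byteop, unsigned op_bytes, uint64_t rdi)
{
	switch (d) {
	case DST_ACC:				/* accumulator (rAX) */
		dst->type = OP_REG;
		dst->bytes = byteop ? 1 : op_bytes;
		dst->reg = 0;
		break;
	case DST_DI:				/* string destination ES:rDI */
		dst->type = OP_MEM;
		dst->bytes = byteop ? 1 : op_bytes;
		dst->mem_ea = rdi;
		break;
	case DST_DX:				/* fixed 16-bit DX */
		dst->type = OP_REG;
		dst->bytes = 2;
		dst->reg = 2;
		break;
	default:				/* instruction decodes itself */
		dst->type = OP_NONE;
		break;
	}
}

int main(void)
{
	struct operand dst = { 0 };

	decode_dst(&dst, DST_DI, 1, 4, 0x1000);
	printf("type=%d bytes=%u ea=%#llx\n", dst.type, dst.bytes,
	       (unsigned long long)dst.mem_ea);
	return 0;
}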
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad72385058..76e3f1cd0369 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
713 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 713 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
714 714
715 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 715 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
716 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); 716 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
717 KVM_PIT_MEM_LENGTH, &pit->dev);
717 if (ret < 0) 718 if (ret < 0)
718 goto fail; 719 goto fail;
719 720
720 if (flags & KVM_PIT_SPEAKER_DUMMY) { 721 if (flags & KVM_PIT_SPEAKER_DUMMY) {
721 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 722 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
722 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 723 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
723 &pit->speaker_dev); 724 KVM_SPEAKER_BASE_ADDRESS, 4,
725 &pit->speaker_dev);
724 if (ret < 0) 726 if (ret < 0)
725 goto fail_unregister; 727 goto fail_unregister;
726 } 728 }
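
The i8254 change passes an explicit base address and length when registering the PIT and speaker devices, so the I/O bus can route accesses by port range up front. A rough user-space sketch of that range-keyed dispatch idea, with made-up structures rather than the kvm_io_bus API:

#include <stdio.h>

struct io_dev {
	unsigned long base;
	unsigned long len;
	void (*write)(struct io_dev *dev, unsigned long addr, unsigned char val);
};

static void pit_write(struct io_dev *dev, unsigned long addr, unsigned char val)
{
	(void)dev;
	printf("PIT write: port %#lx val %#x\n", addr, val);
}

#define MAX_DEVS 8
static struct io_dev *bus[MAX_DEVS];
static int ndevs;

/* Register a device together with the port range it claims. */
static int bus_register(struct io_dev *dev)
{
	if (ndevs >= MAX_DEVS)
		return -1;
	bus[ndevs++] = dev;
	return 0;
}

/* Route a write to whichever registered device covers the port. */
static int bus_write(unsigned long addr, unsigned char val)
{
	for (int i = 0; i < ndevs; i++) {
		struct io_dev *dev = bus[i];

		if (addr >= dev->base && addr < dev->base + dev->len) {
			dev->write(dev, addr, val);
			return 0;
		}
	}
	return -1;	/* no device claims this port */
}

int main(void)
{
	struct io_dev pit = { .base = 0x40, .len = 4, .write = pit_write };

	bus_register(&pit);
	bus_write(0x43, 0x34);	/* lands in the PIT's range */
	return 0;
}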
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 19fe855e7953..cac4746d7ffb 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -34,6 +34,9 @@
34#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
35#include "trace.h" 35#include "trace.h"
36 36
37#define pr_pic_unimpl(fmt, ...) \
38 pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
39
37static void pic_irq_request(struct kvm *kvm, int level); 40static void pic_irq_request(struct kvm *kvm, int level);
38 41
39static void pic_lock(struct kvm_pic *s) 42static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
306 } 309 }
307 s->init_state = 1; 310 s->init_state = 1;
308 if (val & 0x02) 311 if (val & 0x02)
309 printk(KERN_ERR "single mode not supported"); 312 pr_pic_unimpl("single mode not supported");
310 if (val & 0x08) 313 if (val & 0x08)
311 printk(KERN_ERR 314 pr_pic_unimpl(
312 "level sensitive irq not supported"); 315 "level sensitive irq not supported");
313 } else if (val & 0x08) { 316 } else if (val & 0x08) {
314 if (val & 0x04) 317 if (val & 0x04)
315 s->poll = 1; 318 s->poll = 1;
@@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr)
459 } 462 }
460} 463}
461 464
462static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) 465static int picdev_write(struct kvm_pic *s,
463{
464 return container_of(dev, struct kvm_pic, dev);
465}
466
467static int picdev_write(struct kvm_io_device *this,
468 gpa_t addr, int len, const void *val) 466 gpa_t addr, int len, const void *val)
469{ 467{
470 struct kvm_pic *s = to_pic(this);
471 unsigned char data = *(unsigned char *)val; 468 unsigned char data = *(unsigned char *)val;
472 if (!picdev_in_range(addr)) 469 if (!picdev_in_range(addr))
473 return -EOPNOTSUPP; 470 return -EOPNOTSUPP;
474 471
475 if (len != 1) { 472 if (len != 1) {
476 if (printk_ratelimit()) 473 pr_pic_unimpl("non byte write\n");
477 printk(KERN_ERR "PIC: non byte write\n");
478 return 0; 474 return 0;
479 } 475 }
480 pic_lock(s); 476 pic_lock(s);
@@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this,
494 return 0; 490 return 0;
495} 491}
496 492
497static int picdev_read(struct kvm_io_device *this, 493static int picdev_read(struct kvm_pic *s,
498 gpa_t addr, int len, void *val) 494 gpa_t addr, int len, void *val)
499{ 495{
500 struct kvm_pic *s = to_pic(this);
501 unsigned char data = 0; 496 unsigned char data = 0;
502 if (!picdev_in_range(addr)) 497 if (!picdev_in_range(addr))
503 return -EOPNOTSUPP; 498 return -EOPNOTSUPP;
504 499
505 if (len != 1) { 500 if (len != 1) {
506 if (printk_ratelimit()) 501 pr_pic_unimpl("non byte read\n");
507 printk(KERN_ERR "PIC: non byte read\n");
508 return 0; 502 return 0;
509 } 503 }
510 pic_lock(s); 504 pic_lock(s);
@@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this,
525 return 0; 519 return 0;
526} 520}
527 521
522static int picdev_master_write(struct kvm_io_device *dev,
523 gpa_t addr, int len, const void *val)
524{
525 return picdev_write(container_of(dev, struct kvm_pic, dev_master),
526 addr, len, val);
527}
528
529static int picdev_master_read(struct kvm_io_device *dev,
530 gpa_t addr, int len, void *val)
531{
532 return picdev_read(container_of(dev, struct kvm_pic, dev_master),
533 addr, len, val);
534}
535
536static int picdev_slave_write(struct kvm_io_device *dev,
537 gpa_t addr, int len, const void *val)
538{
539 return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
540 addr, len, val);
541}
542
543static int picdev_slave_read(struct kvm_io_device *dev,
544 gpa_t addr, int len, void *val)
545{
546 return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
547 addr, len, val);
548}
549
550static int picdev_eclr_write(struct kvm_io_device *dev,
551 gpa_t addr, int len, const void *val)
552{
553 return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
554 addr, len, val);
555}
556
557static int picdev_eclr_read(struct kvm_io_device *dev,
558 gpa_t addr, int len, void *val)
559{
560 return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
561 addr, len, val);
562}
563
528/* 564/*
529 * callback when PIC0 irq status changed 565 * callback when PIC0 irq status changed
530 */ 566 */
@@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
537 s->output = level; 573 s->output = level;
538} 574}
539 575
540static const struct kvm_io_device_ops picdev_ops = { 576static const struct kvm_io_device_ops picdev_master_ops = {
541 .read = picdev_read, 577 .read = picdev_master_read,
542 .write = picdev_write, 578 .write = picdev_master_write,
579};
580
581static const struct kvm_io_device_ops picdev_slave_ops = {
582 .read = picdev_slave_read,
583 .write = picdev_slave_write,
584};
585
586static const struct kvm_io_device_ops picdev_eclr_ops = {
587 .read = picdev_eclr_read,
588 .write = picdev_eclr_write,
543}; 589};
544 590
545struct kvm_pic *kvm_create_pic(struct kvm *kvm) 591struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
560 /* 606 /*
561 * Initialize PIO device 607 * Initialize PIO device
562 */ 608 */
563 kvm_iodevice_init(&s->dev, &picdev_ops); 609 kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
610 kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
611 kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
564 mutex_lock(&kvm->slots_lock); 612 mutex_lock(&kvm->slots_lock);
565 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); 613 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
614 &s->dev_master);
615 if (ret < 0)
616 goto fail_unlock;
617
618 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
619 if (ret < 0)
620 goto fail_unreg_2;
621
622 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
623 if (ret < 0)
624 goto fail_unreg_1;
625
566 mutex_unlock(&kvm->slots_lock); 626 mutex_unlock(&kvm->slots_lock);
567 if (ret < 0) {
568 kfree(s);
569 return NULL;
570 }
571 627
572 return s; 628 return s;
629
630fail_unreg_1:
631 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
632
633fail_unreg_2:
634 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
635
636fail_unlock:
637 mutex_unlock(&kvm->slots_lock);
638
639 kfree(s);
640
641 return NULL;
573} 642}
574 643
575void kvm_destroy_pic(struct kvm *kvm) 644void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm)
577 struct kvm_pic *vpic = kvm->arch.vpic; 646 struct kvm_pic *vpic = kvm->arch.vpic;
578 647
579 if (vpic) { 648 if (vpic) {
580 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); 649 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
650 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
651 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
581 kvm->arch.vpic = NULL; 652 kvm->arch.vpic = NULL;
582 kfree(vpic); 653 kfree(vpic);
583 } 654 }
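
With the PIC now exposing separate dev_master, dev_slave and dev_eclr devices, each read/write wrapper recovers the owning kvm_pic via container_of() on the member it was registered for. A small standalone illustration of that pattern, using simplified stand-in types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct io_device {
	const char *name;
};

struct pic {
	int output;
	struct io_device dev_master;
	struct io_device dev_slave;
};

/* Each wrapper knows which member it was registered for, so it can
 * recover the enclosing struct pic from the embedded io_device. */
static void master_write(struct io_device *dev, unsigned char val)
{
	struct pic *s = container_of(dev, struct pic, dev_master);

	printf("master write %#x, pic output=%d\n", val, s->output);
}

static void slave_write(struct io_device *dev, unsigned char val)
{
	struct pic *s = container_of(dev, struct pic, dev_slave);

	printf("slave write %#x, pic output=%d\n", val, s->output);
}

int main(void)
{
	struct pic s = { .output = 1 };

	master_write(&s.dev_master, 0x20);
	slave_write(&s.dev_slave, 0x20);
	return 0;
}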
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d084bffb..2086f2bfba33 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -66,7 +66,9 @@ struct kvm_pic {
66 struct kvm *kvm; 66 struct kvm *kvm;
67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 67 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
68 int output; /* intr from master PIC */ 68 int output; /* intr from master PIC */
69 struct kvm_io_device dev; 69 struct kvm_io_device dev_master;
70 struct kvm_io_device dev_slave;
71 struct kvm_io_device dev_eclr;
70 void (*ack_notifier)(void *opaque, int irq); 72 void (*ack_notifier)(void *opaque, int irq);
71 unsigned long irq_states[16]; 73 unsigned long irq_states[16];
72}; 74};
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3377d53fcd36..544076c4f44b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
45 return vcpu->arch.walk_mmu->pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46} 46}
47 47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
53}
54
55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
56{ 49{
57 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; 50 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 64bc6ea78d90..497dbaa366d4 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -2,6 +2,8 @@
2struct kvm_timer { 2struct kvm_timer {
3 struct hrtimer timer; 3 struct hrtimer timer;
4 s64 period; /* unit: ns */ 4 s64 period; /* unit: ns */
5 u32 timer_mode_mask;
6 u64 tscdeadline;
5 atomic_t pending; /* accumulated triggered timers */ 7 atomic_t pending; /* accumulated triggered timers */
6 bool reinject; 8 bool reinject;
7 struct kvm_timer_ops *t_ops; 9 struct kvm_timer_ops *t_ops;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57dcbd4308fa..54abb40199d6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,9 @@
68#define VEC_POS(v) ((v) & (32 - 1)) 68#define VEC_POS(v) ((v) & (32 - 1))
69#define REG_POS(v) (((v) >> 5) << 4) 69#define REG_POS(v) (((v) >> 5) << 4)
70 70
71static unsigned int min_timer_period_us = 500;
72module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
73
71static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) 74static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
72{ 75{
73 return *((u32 *) (apic->regs + reg_off)); 76 return *((u32 *) (apic->regs + reg_off));
@@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
135 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; 138 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
136} 139}
137 140
141static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
142{
143 return ((apic_get_reg(apic, APIC_LVTT) &
144 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
145}
146
138static inline int apic_lvtt_period(struct kvm_lapic *apic) 147static inline int apic_lvtt_period(struct kvm_lapic *apic)
139{ 148{
140 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; 149 return ((apic_get_reg(apic, APIC_LVTT) &
150 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
151}
152
153static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
154{
155 return ((apic_get_reg(apic, APIC_LVTT) &
156 apic->lapic_timer.timer_mode_mask) ==
157 APIC_LVT_TIMER_TSCDEADLINE);
141} 158}
142 159
143static inline int apic_lvt_nmi_mode(u32 lvt_val) 160static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
166} 183}
167 184
168static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 185static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
169 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 186 LVT_MASK, /* partial LVTT mask; timer mode mask added at runtime */
170 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 187 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
171 LVT_MASK | APIC_MODE_MASK, /* LVTPC */ 188 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
172 LINT_MASK, LINT_MASK, /* LVT0-1 */ 189 LINT_MASK, LINT_MASK, /* LVT0-1 */
@@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
316 result = 1; 333 result = 1;
317 break; 334 break;
318 default: 335 default:
319 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", 336 apic_debug("Bad DFR vcpu %d: %08x\n",
320 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); 337 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
321 break; 338 break;
322 } 339 }
323 340
@@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
354 result = (target != source); 371 result = (target != source);
355 break; 372 break;
356 default: 373 default:
357 printk(KERN_WARNING "Bad dest shorthand value %x\n", 374 apic_debug("kvm: apic: Bad dest shorthand value %x\n",
358 short_hand); 375 short_hand);
359 break; 376 break;
360 } 377 }
361 378
@@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
401 break; 418 break;
402 419
403 case APIC_DM_REMRD: 420 case APIC_DM_REMRD:
404 printk(KERN_DEBUG "Ignoring delivery mode 3\n"); 421 apic_debug("Ignoring delivery mode 3\n");
405 break; 422 break;
406 423
407 case APIC_DM_SMI: 424 case APIC_DM_SMI:
408 printk(KERN_DEBUG "Ignoring guest SMI\n"); 425 apic_debug("Ignoring guest SMI\n");
409 break; 426 break;
410 427
411 case APIC_DM_NMI: 428 case APIC_DM_NMI:
@@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
565 val = kvm_apic_id(apic) << 24; 582 val = kvm_apic_id(apic) << 24;
566 break; 583 break;
567 case APIC_ARBPRI: 584 case APIC_ARBPRI:
568 printk(KERN_WARNING "Access APIC ARBPRI register " 585 apic_debug("Access APIC ARBPRI register which is for P6\n");
569 "which is for P6\n");
570 break; 586 break;
571 587
572 case APIC_TMCCT: /* Timer CCR */ 588 case APIC_TMCCT: /* Timer CCR */
589 if (apic_lvtt_tscdeadline(apic))
590 return 0;
591
573 val = apic_get_tmcct(apic); 592 val = apic_get_tmcct(apic);
574 break; 593 break;
575 594
@@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic)
664 683
665static void start_apic_timer(struct kvm_lapic *apic) 684static void start_apic_timer(struct kvm_lapic *apic)
666{ 685{
667 ktime_t now = apic->lapic_timer.timer.base->get_time(); 686 ktime_t now;
668
669 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
670 APIC_BUS_CYCLE_NS * apic->divide_count;
671 atomic_set(&apic->lapic_timer.pending, 0); 687 atomic_set(&apic->lapic_timer.pending, 0);
672 688
673 if (!apic->lapic_timer.period) 689 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
674 return; 690 /* lapic timer in oneshot or periodic mode */
675 /* 691 now = apic->lapic_timer.timer.base->get_time();
676 * Do not allow the guest to program periodic timers with small 692 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
677 * interval, since the hrtimers are not throttled by the host 693 * APIC_BUS_CYCLE_NS * apic->divide_count;
678 * scheduler. 694
679 */ 695 if (!apic->lapic_timer.period)
680 if (apic_lvtt_period(apic)) { 696 return;
681 if (apic->lapic_timer.period < NSEC_PER_MSEC/2) 697 /*
682 apic->lapic_timer.period = NSEC_PER_MSEC/2; 698 * Do not allow the guest to program periodic timers with small
683 } 699 * interval, since the hrtimers are not throttled by the host
700 * scheduler.
701 */
702 if (apic_lvtt_period(apic)) {
703 s64 min_period = min_timer_period_us * 1000LL;
704
705 if (apic->lapic_timer.period < min_period) {
706 pr_info_ratelimited(
707 "kvm: vcpu %i: requested %lld ns "
708 "lapic timer period limited to %lld ns\n",
709 apic->vcpu->vcpu_id,
710 apic->lapic_timer.period, min_period);
711 apic->lapic_timer.period = min_period;
712 }
713 }
684 714
685 hrtimer_start(&apic->lapic_timer.timer, 715 hrtimer_start(&apic->lapic_timer.timer,
686 ktime_add_ns(now, apic->lapic_timer.period), 716 ktime_add_ns(now, apic->lapic_timer.period),
687 HRTIMER_MODE_ABS); 717 HRTIMER_MODE_ABS);
688 718
689 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 719 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
690 PRIx64 ", " 720 PRIx64 ", "
691 "timer initial count 0x%x, period %lldns, " 721 "timer initial count 0x%x, period %lldns, "
692 "expire @ 0x%016" PRIx64 ".\n", __func__, 722 "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic)
695 apic->lapic_timer.period, 725 apic->lapic_timer.period,
696 ktime_to_ns(ktime_add_ns(now, 726 ktime_to_ns(ktime_add_ns(now,
697 apic->lapic_timer.period))); 727 apic->lapic_timer.period)));
728 } else if (apic_lvtt_tscdeadline(apic)) {
729 /* lapic timer in tsc deadline mode */
730 u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
731 u64 ns = 0;
732 struct kvm_vcpu *vcpu = apic->vcpu;
733 unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
734 unsigned long flags;
735
736 if (unlikely(!tscdeadline || !this_tsc_khz))
737 return;
738
739 local_irq_save(flags);
740
741 now = apic->lapic_timer.timer.base->get_time();
742 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
743 if (likely(tscdeadline > guest_tsc)) {
744 ns = (tscdeadline - guest_tsc) * 1000000ULL;
745 do_div(ns, this_tsc_khz);
746 }
747 hrtimer_start(&apic->lapic_timer.timer,
748 ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
749
750 local_irq_restore(flags);
751 }
698} 752}
699 753
700static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 754static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
782 836
783 case APIC_LVT0: 837 case APIC_LVT0:
784 apic_manage_nmi_watchdog(apic, val); 838 apic_manage_nmi_watchdog(apic, val);
785 case APIC_LVTT:
786 case APIC_LVTTHMR: 839 case APIC_LVTTHMR:
787 case APIC_LVTPC: 840 case APIC_LVTPC:
788 case APIC_LVT1: 841 case APIC_LVT1:
@@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
796 849
797 break; 850 break;
798 851
852 case APIC_LVTT:
853 if ((apic_get_reg(apic, APIC_LVTT) &
854 apic->lapic_timer.timer_mode_mask) !=
855 (val & apic->lapic_timer.timer_mode_mask))
856 hrtimer_cancel(&apic->lapic_timer.timer);
857
858 if (!apic_sw_enabled(apic))
859 val |= APIC_LVT_MASKED;
860 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
861 apic_set_reg(apic, APIC_LVTT, val);
862 break;
863
799 case APIC_TMICT: 864 case APIC_TMICT:
865 if (apic_lvtt_tscdeadline(apic))
866 break;
867
800 hrtimer_cancel(&apic->lapic_timer.timer); 868 hrtimer_cancel(&apic->lapic_timer.timer);
801 apic_set_reg(apic, APIC_TMICT, val); 869 apic_set_reg(apic, APIC_TMICT, val);
802 start_apic_timer(apic); 870 start_apic_timer(apic);
@@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
804 872
805 case APIC_TDCR: 873 case APIC_TDCR:
806 if (val & 4) 874 if (val & 4)
807 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); 875 apic_debug("KVM_WRITE:TDCR %x\n", val);
808 apic_set_reg(apic, APIC_TDCR, val); 876 apic_set_reg(apic, APIC_TDCR, val);
809 update_divide_count(apic); 877 update_divide_count(apic);
810 break; 878 break;
811 879
812 case APIC_ESR: 880 case APIC_ESR:
813 if (apic_x2apic_mode(apic) && val != 0) { 881 if (apic_x2apic_mode(apic) && val != 0) {
814 printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); 882 apic_debug("KVM_WRITE:ESR not zero %x\n", val);
815 ret = 1; 883 ret = 1;
816 } 884 }
817 break; 885 break;
@@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this,
864 return 0; 932 return 0;
865} 933}
866 934
935void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
936{
937 struct kvm_lapic *apic = vcpu->arch.apic;
938
939 if (apic)
940 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
941}
942EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
943
867void kvm_free_lapic(struct kvm_vcpu *vcpu) 944void kvm_free_lapic(struct kvm_vcpu *vcpu)
868{ 945{
869 if (!vcpu->arch.apic) 946 if (!vcpu->arch.apic)
@@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
883 *---------------------------------------------------------------------- 960 *----------------------------------------------------------------------
884 */ 961 */
885 962
963u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
964{
965 struct kvm_lapic *apic = vcpu->arch.apic;
966 if (!apic)
967 return 0;
968
969 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
970 return 0;
971
972 return apic->lapic_timer.tscdeadline;
973}
974
975void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
976{
977 struct kvm_lapic *apic = vcpu->arch.apic;
978 if (!apic)
979 return;
980
981 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
982 return;
983
984 hrtimer_cancel(&apic->lapic_timer.timer);
985 apic->lapic_timer.tscdeadline = data;
986 start_apic_timer(apic);
987}
988
886void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) 989void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
887{ 990{
888 struct kvm_lapic *apic = vcpu->arch.apic; 991 struct kvm_lapic *apic = vcpu->arch.apic;
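
For the new TSC-deadline path, the sleep length is just the tick delta scaled by the virtual TSC frequency: ns = (tscdeadline - guest_tsc) * 1000000 / tsc_khz. A self-contained sketch of that arithmetic; the frequency and TSC values below are placeholders:

#include <stdint.h>
#include <stdio.h>

/* Convert a TSC tick delta to nanoseconds for a frequency given in kHz:
 * ticks / (khz * 1000) seconds == ticks * 1000000 / khz nanoseconds. */
static uint64_t tsc_delta_to_ns(uint64_t deadline, uint64_t guest_tsc,
				unsigned long tsc_khz)
{
	if (!tsc_khz || deadline <= guest_tsc)
		return 0;	/* already expired, or no usable frequency */
	return (deadline - guest_tsc) * 1000000ULL / tsc_khz;
}

int main(void)
{
	/* e.g. a 2.5 GHz guest TSC with the deadline 5,000,000 ticks ahead */
	uint64_t ns = tsc_delta_to_ns(105000000ULL, 100000000ULL, 2500000UL);

	printf("timer fires in %llu ns\n", (unsigned long long)ns);	/* 2000000 */
	return 0;
}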
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 52c9e6b9e725..138e8cc6fea6 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
26void kvm_lapic_reset(struct kvm_vcpu *vcpu); 26void kvm_lapic_reset(struct kvm_vcpu *vcpu);
27u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 27u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
29void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
29void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 30void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
30u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
31void kvm_apic_set_version(struct kvm_vcpu *vcpu); 32void kvm_apic_set_version(struct kvm_vcpu *vcpu);
@@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
41bool kvm_apic_present(struct kvm_vcpu *vcpu); 42bool kvm_apic_present(struct kvm_vcpu *vcpu);
42int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
43 44
45u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
46void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
47
44void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); 48void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
45void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 49void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
46void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 50void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8e8da7960dbe..f1b36cf3e3d0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2770 2770
2771 ASSERT(!VALID_PAGE(root)); 2771 ASSERT(!VALID_PAGE(root));
2772 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2772 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2773 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); 2773 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
2774 if (!is_present_gpte(pdptr)) { 2774 if (!is_present_gpte(pdptr)) {
2775 vcpu->arch.mmu.pae_root[i] = 0; 2775 vcpu->arch.mmu.pae_root[i] = 0;
2776 continue; 2776 continue;
@@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3318 context->direct_map = true; 3318 context->direct_map = true;
3319 context->set_cr3 = kvm_x86_ops->set_tdp_cr3; 3319 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
3320 context->get_cr3 = get_cr3; 3320 context->get_cr3 = get_cr3;
3321 context->get_pdptr = kvm_pdptr_read;
3321 context->inject_page_fault = kvm_inject_page_fault; 3322 context->inject_page_fault = kvm_inject_page_fault;
3322 context->nx = is_nx(vcpu); 3323 context->nx = is_nx(vcpu);
3323 3324
@@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
3376 3377
3377 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3378 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3378 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3379 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3380 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
3379 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3381 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3380 3382
3381 return r; 3383 return r;
@@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3386 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 3388 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3387 3389
3388 g_context->get_cr3 = get_cr3; 3390 g_context->get_cr3 = get_cr3;
3391 g_context->get_pdptr = kvm_pdptr_read;
3389 g_context->inject_page_fault = kvm_inject_page_fault; 3392 g_context->inject_page_fault = kvm_inject_page_fault;
3390 3393
3391 /* 3394 /*
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2460a265be23..746ec259d024 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
121 121
122static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) 122static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
123{ 123{
124 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
124 unsigned long *rmapp; 125 unsigned long *rmapp;
125 struct kvm_mmu_page *rev_sp; 126 struct kvm_mmu_page *rev_sp;
126 gfn_t gfn; 127 gfn_t gfn;
127 128
128
129 rev_sp = page_header(__pa(sptep)); 129 rev_sp = page_header(__pa(sptep));
130 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 130 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
131 131
132 if (!gfn_to_memslot(kvm, gfn)) { 132 if (!gfn_to_memslot(kvm, gfn)) {
133 if (!printk_ratelimit()) 133 if (!__ratelimit(&ratelimit_state))
134 return; 134 return;
135 audit_printk(kvm, "no memslot for gfn %llx\n", gfn); 135 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
136 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", 136 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
141 141
142 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 142 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
143 if (!*rmapp) { 143 if (!*rmapp) {
144 if (!printk_ratelimit()) 144 if (!__ratelimit(&ratelimit_state))
145 return; 145 return;
146 audit_printk(kvm, "no rmap for writable spte %llx\n", 146 audit_printk(kvm, "no rmap for writable spte %llx\n",
147 *sptep); 147 *sptep);
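
DEFINE_RATELIMIT_STATE(5 * HZ, 10) lets at most a burst of 10 messages through per 5-second interval, replacing the global printk_ratelimit(). A rough user-space approximation of that interval-plus-burst policy (not the kernel's lib/ratelimit implementation):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
	double interval;	/* seconds */
	int burst;		/* messages allowed per interval */
	int printed;
	time_t begin;
};

static bool ratelimit_ok(struct ratelimit *rs)
{
	time_t now = time(NULL);

	if (rs->begin == 0 || now - rs->begin >= rs->interval) {
		rs->begin = now;	/* new interval: reset the budget */
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return false;		/* over budget: suppress */
	rs->printed++;
	return true;
}

int main(void)
{
	struct ratelimit rs = { .interval = 5, .burst = 10 };

	for (int i = 0; i < 25; i++)
		if (ratelimit_ok(&rs))
			printf("audit message %d\n", i);	/* only 10 printed */
	return 0;
}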
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 507e2b844cfa..92994100638b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
147 gfn_t table_gfn; 147 gfn_t table_gfn;
148 unsigned index, pt_access, uninitialized_var(pte_access); 148 unsigned index, pt_access, uninitialized_var(pte_access);
149 gpa_t pte_gpa; 149 gpa_t pte_gpa;
150 bool eperm; 150 bool eperm, last_gpte;
151 int offset; 151 int offset;
152 const int write_fault = access & PFERR_WRITE_MASK; 152 const int write_fault = access & PFERR_WRITE_MASK;
153 const int user_fault = access & PFERR_USER_MASK; 153 const int user_fault = access & PFERR_USER_MASK;
@@ -163,7 +163,7 @@ retry_walk:
163 163
164#if PTTYPE == 64 164#if PTTYPE == 64
165 if (walker->level == PT32E_ROOT_LEVEL) { 165 if (walker->level == PT32E_ROOT_LEVEL) {
166 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); 166 pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
167 trace_kvm_mmu_paging_element(pte, walker->level); 167 trace_kvm_mmu_paging_element(pte, walker->level);
168 if (!is_present_gpte(pte)) 168 if (!is_present_gpte(pte))
169 goto error; 169 goto error;
@@ -221,6 +221,17 @@ retry_walk:
221 eperm = true; 221 eperm = true;
222#endif 222#endif
223 223
224 last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
225 if (last_gpte) {
226 pte_access = pt_access &
227 FNAME(gpte_access)(vcpu, pte, true);
228 /* check if the kernel is fetching from user page */
229 if (unlikely(pte_access & PT_USER_MASK) &&
230 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
231 if (fetch_fault && !user_fault)
232 eperm = true;
233 }
234
224 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { 235 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
225 int ret; 236 int ret;
226 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 237 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
@@ -238,18 +249,12 @@ retry_walk:
238 249
239 walker->ptes[walker->level - 1] = pte; 250 walker->ptes[walker->level - 1] = pte;
240 251
241 if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { 252 if (last_gpte) {
242 int lvl = walker->level; 253 int lvl = walker->level;
243 gpa_t real_gpa; 254 gpa_t real_gpa;
244 gfn_t gfn; 255 gfn_t gfn;
245 u32 ac; 256 u32 ac;
246 257
247 /* check if the kernel is fetching from user page */
248 if (unlikely(pte_access & PT_USER_MASK) &&
249 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
250 if (fetch_fault && !user_fault)
251 eperm = true;
252
253 gfn = gpte_to_gfn_lvl(pte, lvl); 258 gfn = gpte_to_gfn_lvl(pte, lvl);
254 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; 259 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
255 260
@@ -295,7 +300,6 @@ retry_walk:
295 walker->ptes[walker->level - 1] = pte; 300 walker->ptes[walker->level - 1] = pte;
296 } 301 }
297 302
298 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
299 walker->pt_access = pt_access; 303 walker->pt_access = pt_access;
300 walker->pte_access = pte_access; 304 walker->pte_access = pte_access;
301 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 305 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
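
The walker change evaluates SMEP once, at the last-level gpte, after the final pte_access is known: a supervisor instruction fetch from a user-accessible page faults when CR4.SMEP is set. A tiny boolean sketch of that condition, with the flags passed in as plain booleans:

#include <stdbool.h>
#include <stdio.h>

/* Supervisor-Mode Execution Prevention: a kernel (supervisor) fetch
 * from a user-accessible page is an access violation when SMEP is on. */
static bool smep_fault(bool pte_user, bool cr4_smep,
		       bool fetch_fault, bool user_fault)
{
	return pte_user && cr4_smep && fetch_fault && !user_fault;
}

int main(void)
{
	printf("%d\n", smep_fault(true, true, true, false));	/* 1: faults */
	printf("%d\n", smep_fault(true, true, true, true));	/* 0: user fetch is fine */
	return 0;
}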
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 475d1c948501..e32243eac2f4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1084 if (npt_enabled) { 1084 if (npt_enabled) {
1085 /* Setup VMCB for Nested Paging */ 1085 /* Setup VMCB for Nested Paging */
1086 control->nested_ctl = 1; 1086 control->nested_ctl = 1;
1087 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
1088 clr_intercept(svm, INTERCEPT_INVLPG); 1087 clr_intercept(svm, INTERCEPT_INVLPG);
1089 clr_exception_intercept(svm, PF_VECTOR); 1088 clr_exception_intercept(svm, PF_VECTOR);
1090 clr_cr_intercept(svm, INTERCEPT_CR3_READ); 1089 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1844 return svm->nested.nested_cr3; 1843 return svm->nested.nested_cr3;
1845} 1844}
1846 1845
1846static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1847{
1848 struct vcpu_svm *svm = to_svm(vcpu);
1849 u64 cr3 = svm->nested.nested_cr3;
1850 u64 pdpte;
1851 int ret;
1852
1853 ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
1854 offset_in_page(cr3) + index * 8, 8);
1855 if (ret)
1856 return 0;
1857 return pdpte;
1858}
1859
1847static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, 1860static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1848 unsigned long root) 1861 unsigned long root)
1849{ 1862{
@@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1875 1888
1876 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 1889 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1877 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 1890 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1891 vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
1878 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; 1892 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1879 vcpu->arch.mmu.shadow_root_level = get_npt_level(); 1893 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1880 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 1894 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
@@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2182 vmcb->control.exit_info_1, 2196 vmcb->control.exit_info_1,
2183 vmcb->control.exit_info_2, 2197 vmcb->control.exit_info_2,
2184 vmcb->control.exit_int_info, 2198 vmcb->control.exit_int_info,
2185 vmcb->control.exit_int_info_err); 2199 vmcb->control.exit_int_info_err,
2200 KVM_ISA_SVM);
2186 2201
2187 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); 2202 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2188 if (!nested_vmcb) 2203 if (!nested_vmcb)
@@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm)
2894 return 0; 2909 return 0;
2895} 2910}
2896 2911
2912u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
2913{
2914 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
2915 return vmcb->control.tsc_offset +
2916 svm_scale_tsc(vcpu, native_read_tsc());
2917}
2918
2897static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 2919static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2898{ 2920{
2899 struct vcpu_svm *svm = to_svm(vcpu); 2921 struct vcpu_svm *svm = to_svm(vcpu);
2900 2922
2901 switch (ecx) { 2923 switch (ecx) {
2902 case MSR_IA32_TSC: { 2924 case MSR_IA32_TSC: {
2903 struct vmcb *vmcb = get_host_vmcb(svm); 2925 *data = svm->vmcb->control.tsc_offset +
2904
2905 *data = vmcb->control.tsc_offset +
2906 svm_scale_tsc(vcpu, native_read_tsc()); 2926 svm_scale_tsc(vcpu, native_read_tsc());
2907 2927
2908 break; 2928 break;
@@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3314 struct kvm_run *kvm_run = vcpu->run; 3334 struct kvm_run *kvm_run = vcpu->run;
3315 u32 exit_code = svm->vmcb->control.exit_code; 3335 u32 exit_code = svm->vmcb->control.exit_code;
3316 3336
3317 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3318
3319 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) 3337 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3320 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3338 vcpu->arch.cr0 = svm->vmcb->save.cr0;
3321 if (npt_enabled) 3339 if (npt_enabled)
@@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
3335 svm->vmcb->control.exit_info_1, 3353 svm->vmcb->control.exit_info_1,
3336 svm->vmcb->control.exit_info_2, 3354 svm->vmcb->control.exit_info_2,
3337 svm->vmcb->control.exit_int_info, 3355 svm->vmcb->control.exit_int_info,
3338 svm->vmcb->control.exit_int_info_err); 3356 svm->vmcb->control.exit_int_info_err,
3357 KVM_ISA_SVM);
3339 3358
3340 vmexit = nested_svm_exit_special(svm); 3359 vmexit = nested_svm_exit_special(svm);
3341 3360
@@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3768 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3787 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3769 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3788 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3770 3789
3790 trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
3791
3771 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3792 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3772 kvm_before_handle_nmi(&svm->vcpu); 3793 kvm_before_handle_nmi(&svm->vcpu);
3773 3794
@@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3897 } 3918 }
3898} 3919}
3899 3920
3900static const struct trace_print_flags svm_exit_reasons_str[] = {
3901 { SVM_EXIT_READ_CR0, "read_cr0" },
3902 { SVM_EXIT_READ_CR3, "read_cr3" },
3903 { SVM_EXIT_READ_CR4, "read_cr4" },
3904 { SVM_EXIT_READ_CR8, "read_cr8" },
3905 { SVM_EXIT_WRITE_CR0, "write_cr0" },
3906 { SVM_EXIT_WRITE_CR3, "write_cr3" },
3907 { SVM_EXIT_WRITE_CR4, "write_cr4" },
3908 { SVM_EXIT_WRITE_CR8, "write_cr8" },
3909 { SVM_EXIT_READ_DR0, "read_dr0" },
3910 { SVM_EXIT_READ_DR1, "read_dr1" },
3911 { SVM_EXIT_READ_DR2, "read_dr2" },
3912 { SVM_EXIT_READ_DR3, "read_dr3" },
3913 { SVM_EXIT_WRITE_DR0, "write_dr0" },
3914 { SVM_EXIT_WRITE_DR1, "write_dr1" },
3915 { SVM_EXIT_WRITE_DR2, "write_dr2" },
3916 { SVM_EXIT_WRITE_DR3, "write_dr3" },
3917 { SVM_EXIT_WRITE_DR5, "write_dr5" },
3918 { SVM_EXIT_WRITE_DR7, "write_dr7" },
3919 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
3920 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
3921 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
3922 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
3923 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
3924 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
3925 { SVM_EXIT_INTR, "interrupt" },
3926 { SVM_EXIT_NMI, "nmi" },
3927 { SVM_EXIT_SMI, "smi" },
3928 { SVM_EXIT_INIT, "init" },
3929 { SVM_EXIT_VINTR, "vintr" },
3930 { SVM_EXIT_CPUID, "cpuid" },
3931 { SVM_EXIT_INVD, "invd" },
3932 { SVM_EXIT_HLT, "hlt" },
3933 { SVM_EXIT_INVLPG, "invlpg" },
3934 { SVM_EXIT_INVLPGA, "invlpga" },
3935 { SVM_EXIT_IOIO, "io" },
3936 { SVM_EXIT_MSR, "msr" },
3937 { SVM_EXIT_TASK_SWITCH, "task_switch" },
3938 { SVM_EXIT_SHUTDOWN, "shutdown" },
3939 { SVM_EXIT_VMRUN, "vmrun" },
3940 { SVM_EXIT_VMMCALL, "hypercall" },
3941 { SVM_EXIT_VMLOAD, "vmload" },
3942 { SVM_EXIT_VMSAVE, "vmsave" },
3943 { SVM_EXIT_STGI, "stgi" },
3944 { SVM_EXIT_CLGI, "clgi" },
3945 { SVM_EXIT_SKINIT, "skinit" },
3946 { SVM_EXIT_WBINVD, "wbinvd" },
3947 { SVM_EXIT_MONITOR, "monitor" },
3948 { SVM_EXIT_MWAIT, "mwait" },
3949 { SVM_EXIT_XSETBV, "xsetbv" },
3950 { SVM_EXIT_NPF, "npf" },
3951 { -1, NULL }
3952};
3953
3954static int svm_get_lpage_level(void) 3921static int svm_get_lpage_level(void)
3955{ 3922{
3956 return PT_PDPE_LEVEL; 3923 return PT_PDPE_LEVEL;
@@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = {
4223 .get_mt_mask = svm_get_mt_mask, 4190 .get_mt_mask = svm_get_mt_mask,
4224 4191
4225 .get_exit_info = svm_get_exit_info, 4192 .get_exit_info = svm_get_exit_info,
4226 .exit_reasons_str = svm_exit_reasons_str,
4227 4193
4228 .get_lpage_level = svm_get_lpage_level, 4194 .get_lpage_level = svm_get_lpage_level,
4229 4195
@@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4239 .write_tsc_offset = svm_write_tsc_offset, 4205 .write_tsc_offset = svm_write_tsc_offset,
4240 .adjust_tsc_offset = svm_adjust_tsc_offset, 4206 .adjust_tsc_offset = svm_adjust_tsc_offset,
4241 .compute_tsc_offset = svm_compute_tsc_offset, 4207 .compute_tsc_offset = svm_compute_tsc_offset,
4208 .read_l1_tsc = svm_read_l1_tsc,
4242 4209
4243 .set_tdp_cr3 = set_tdp_cr3, 4210 .set_tdp_cr3 = set_tdp_cr3,
4244 4211
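
svm_read_l1_tsc() reports L1's view of the TSC even while a nested guest runs, by adding the host VMCB's tsc_offset to the (scaled) hardware TSC rather than the currently active offset. A stripped-down sketch of that bookkeeping; the scale ratio and values are placeholders, not the real svm_scale_tsc() math:

#include <stdint.h>
#include <stdio.h>

/* L1's TSC is the raw host TSC, scaled to the guest's frequency ratio,
 * plus the offset the hypervisor programmed for L1 (not L2's offset). */
static uint64_t read_l1_tsc(uint64_t host_tsc, uint64_t l1_tsc_offset,
			    double scale_ratio)
{
	return (uint64_t)(host_tsc * scale_ratio) + l1_tsc_offset;
}

int main(void)
{
	/* illustrative numbers only */
	printf("%llu\n", (unsigned long long)
	       read_l1_tsc(1000000ULL, 500ULL, 1.0));	/* 1000500 */
	return 0;
}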
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 3ff898c104f7..911d2641f14c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -2,6 +2,8 @@
2#define _TRACE_KVM_H 2#define _TRACE_KVM_H
3 3
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h>
6#include <asm/svm.h>
5 7
6#undef TRACE_SYSTEM 8#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 9#define TRACE_SYSTEM kvm
@@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic,
181#define KVM_ISA_VMX 1 183#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2 184#define KVM_ISA_SVM 2
183 185
186#define VMX_EXIT_REASONS \
187 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
188 { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
189 { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
190 { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
191 { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
192 { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
193 { EXIT_REASON_CPUID, "CPUID" }, \
194 { EXIT_REASON_HLT, "HLT" }, \
195 { EXIT_REASON_INVLPG, "INVLPG" }, \
196 { EXIT_REASON_RDPMC, "RDPMC" }, \
197 { EXIT_REASON_RDTSC, "RDTSC" }, \
198 { EXIT_REASON_VMCALL, "VMCALL" }, \
199 { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
200 { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
201 { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
202 { EXIT_REASON_VMPTRST, "VMPTRST" }, \
203 { EXIT_REASON_VMREAD, "VMREAD" }, \
204 { EXIT_REASON_VMRESUME, "VMRESUME" }, \
205 { EXIT_REASON_VMWRITE, "VMWRITE" }, \
206 { EXIT_REASON_VMOFF, "VMOFF" }, \
207 { EXIT_REASON_VMON, "VMON" }, \
208 { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
209 { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
210 { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
211 { EXIT_REASON_MSR_READ, "MSR_READ" }, \
212 { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
213 { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
214 { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
215 { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
216 { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
217 { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
218 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
219 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
220 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
221 { EXIT_REASON_WBINVD, "WBINVD" }
222
223#define SVM_EXIT_REASONS \
224 { SVM_EXIT_READ_CR0, "read_cr0" }, \
225 { SVM_EXIT_READ_CR3, "read_cr3" }, \
226 { SVM_EXIT_READ_CR4, "read_cr4" }, \
227 { SVM_EXIT_READ_CR8, "read_cr8" }, \
228 { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
229 { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
230 { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
231 { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
232 { SVM_EXIT_READ_DR0, "read_dr0" }, \
233 { SVM_EXIT_READ_DR1, "read_dr1" }, \
234 { SVM_EXIT_READ_DR2, "read_dr2" }, \
235 { SVM_EXIT_READ_DR3, "read_dr3" }, \
236 { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
237 { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
238 { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
239 { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
240 { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
241 { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
242 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
243 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
244 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
245 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
246 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
247 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
248 { SVM_EXIT_INTR, "interrupt" }, \
249 { SVM_EXIT_NMI, "nmi" }, \
250 { SVM_EXIT_SMI, "smi" }, \
251 { SVM_EXIT_INIT, "init" }, \
252 { SVM_EXIT_VINTR, "vintr" }, \
253 { SVM_EXIT_CPUID, "cpuid" }, \
254 { SVM_EXIT_INVD, "invd" }, \
255 { SVM_EXIT_HLT, "hlt" }, \
256 { SVM_EXIT_INVLPG, "invlpg" }, \
257 { SVM_EXIT_INVLPGA, "invlpga" }, \
258 { SVM_EXIT_IOIO, "io" }, \
259 { SVM_EXIT_MSR, "msr" }, \
260 { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
261 { SVM_EXIT_SHUTDOWN, "shutdown" }, \
262 { SVM_EXIT_VMRUN, "vmrun" }, \
263 { SVM_EXIT_VMMCALL, "hypercall" }, \
264 { SVM_EXIT_VMLOAD, "vmload" }, \
265 { SVM_EXIT_VMSAVE, "vmsave" }, \
266 { SVM_EXIT_STGI, "stgi" }, \
267 { SVM_EXIT_CLGI, "clgi" }, \
268 { SVM_EXIT_SKINIT, "skinit" }, \
269 { SVM_EXIT_WBINVD, "wbinvd" }, \
270 { SVM_EXIT_MONITOR, "monitor" }, \
271 { SVM_EXIT_MWAIT, "mwait" }, \
272 { SVM_EXIT_XSETBV, "xsetbv" }, \
273 { SVM_EXIT_NPF, "npf" }
274
184/* 275/*
185 * Tracepoint for kvm guest exit: 276 * Tracepoint for kvm guest exit:
186 */ 277 */
@@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit,
205 ), 296 ),
206 297
207 TP_printk("reason %s rip 0x%lx info %llx %llx", 298 TP_printk("reason %s rip 0x%lx info %llx %llx",
208 ftrace_print_symbols_seq(p, __entry->exit_reason, 299 (__entry->isa == KVM_ISA_VMX) ?
209 kvm_x86_ops->exit_reasons_str), 300 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
301 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
210 __entry->guest_rip, __entry->info1, __entry->info2) 302 __entry->guest_rip, __entry->info1, __entry->info2)
211); 303);
212 304
@@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts,
486TRACE_EVENT(kvm_nested_vmexit, 578TRACE_EVENT(kvm_nested_vmexit,
487 TP_PROTO(__u64 rip, __u32 exit_code, 579 TP_PROTO(__u64 rip, __u32 exit_code,
488 __u64 exit_info1, __u64 exit_info2, 580 __u64 exit_info1, __u64 exit_info2,
489 __u32 exit_int_info, __u32 exit_int_info_err), 581 __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
490 TP_ARGS(rip, exit_code, exit_info1, exit_info2, 582 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
491 exit_int_info, exit_int_info_err), 583 exit_int_info, exit_int_info_err, isa),
492 584
493 TP_STRUCT__entry( 585 TP_STRUCT__entry(
494 __field( __u64, rip ) 586 __field( __u64, rip )
@@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit,
497 __field( __u64, exit_info2 ) 589 __field( __u64, exit_info2 )
498 __field( __u32, exit_int_info ) 590 __field( __u32, exit_int_info )
499 __field( __u32, exit_int_info_err ) 591 __field( __u32, exit_int_info_err )
592 __field( __u32, isa )
500 ), 593 ),
501 594
502 TP_fast_assign( 595 TP_fast_assign(
@@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit,
506 __entry->exit_info2 = exit_info2; 599 __entry->exit_info2 = exit_info2;
507 __entry->exit_int_info = exit_int_info; 600 __entry->exit_int_info = exit_int_info;
508 __entry->exit_int_info_err = exit_int_info_err; 601 __entry->exit_int_info_err = exit_int_info_err;
602 __entry->isa = isa;
509 ), 603 ),
510 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 604 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
511 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", 605 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
512 __entry->rip, 606 __entry->rip,
513 ftrace_print_symbols_seq(p, __entry->exit_code, 607 (__entry->isa == KVM_ISA_VMX) ?
514 kvm_x86_ops->exit_reasons_str), 608 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
609 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
515 __entry->exit_info1, __entry->exit_info2, 610 __entry->exit_info1, __entry->exit_info2,
516 __entry->exit_int_info, __entry->exit_int_info_err) 611 __entry->exit_int_info, __entry->exit_int_info_err)
517); 612);
@@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit,
522TRACE_EVENT(kvm_nested_vmexit_inject, 617TRACE_EVENT(kvm_nested_vmexit_inject,
523 TP_PROTO(__u32 exit_code, 618 TP_PROTO(__u32 exit_code,
524 __u64 exit_info1, __u64 exit_info2, 619 __u64 exit_info1, __u64 exit_info2,
525 __u32 exit_int_info, __u32 exit_int_info_err), 620 __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
526 TP_ARGS(exit_code, exit_info1, exit_info2, 621 TP_ARGS(exit_code, exit_info1, exit_info2,
527 exit_int_info, exit_int_info_err), 622 exit_int_info, exit_int_info_err, isa),
528 623
529 TP_STRUCT__entry( 624 TP_STRUCT__entry(
530 __field( __u32, exit_code ) 625 __field( __u32, exit_code )
@@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
532 __field( __u64, exit_info2 ) 627 __field( __u64, exit_info2 )
533 __field( __u32, exit_int_info ) 628 __field( __u32, exit_int_info )
534 __field( __u32, exit_int_info_err ) 629 __field( __u32, exit_int_info_err )
630 __field( __u32, isa )
535 ), 631 ),
536 632
537 TP_fast_assign( 633 TP_fast_assign(
@@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
540 __entry->exit_info2 = exit_info2; 636 __entry->exit_info2 = exit_info2;
541 __entry->exit_int_info = exit_int_info; 637 __entry->exit_int_info = exit_int_info;
542 __entry->exit_int_info_err = exit_int_info_err; 638 __entry->exit_int_info_err = exit_int_info_err;
639 __entry->isa = isa;
543 ), 640 ),
544 641
545 TP_printk("reason: %s ext_inf1: 0x%016llx " 642 TP_printk("reason: %s ext_inf1: 0x%016llx "
546 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", 643 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
547 ftrace_print_symbols_seq(p, __entry->exit_code, 644 (__entry->isa == KVM_ISA_VMX) ?
548 kvm_x86_ops->exit_reasons_str), 645 __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
646 __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
549 __entry->exit_info1, __entry->exit_info2, 647 __entry->exit_info1, __entry->exit_info2,
550 __entry->exit_int_info, __entry->exit_int_info_err) 648 __entry->exit_int_info, __entry->exit_int_info_err)
551); 649);
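
Moving the exit-reason tables into trace.h lets the tracepoints pick the VMX or SVM name list at print time from the recorded ISA, instead of calling back into kvm_x86_ops. A plain-C sketch of that two-table lookup; only a handful of codes are spelled out here as literals:

#include <stdio.h>

struct sym { unsigned code; const char *name; };

static const struct sym vmx_reasons[] = {
	{ 0, "EXCEPTION_NMI" }, { 10, "CPUID" }, { 12, "HLT" }, { 0, NULL }
};
static const struct sym svm_reasons[] = {
	{ 0x072, "cpuid" }, { 0x078, "hlt" }, { 0x400, "npf" }, { 0, NULL }
};

enum { ISA_VMX = 1, ISA_SVM = 2 };

/* Select the per-ISA table, then scan it for the recorded exit code. */
static const char *exit_reason_name(unsigned isa, unsigned code)
{
	const struct sym *tab = (isa == ISA_VMX) ? vmx_reasons : svm_reasons;

	for (; tab->name; tab++)
		if (tab->code == code)
			return tab->name;
	return "UNKNOWN";
}

int main(void)
{
	printf("%s\n", exit_reason_name(ISA_VMX, 12));		/* HLT */
	printf("%s\n", exit_reason_name(ISA_SVM, 0x400));	/* npf */
	return 0;
}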
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e65a158dee64..a0d6bd9ad442 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
71static int __read_mostly yield_on_hlt = 1; 71static int __read_mostly yield_on_hlt = 1;
72module_param(yield_on_hlt, bool, S_IRUGO); 72module_param(yield_on_hlt, bool, S_IRUGO);
73 73
74static int __read_mostly fasteoi = 1;
75module_param(fasteoi, bool, S_IRUGO);
76
74/* 77/*
75 * If nested=1, nested virtualization is supported, i.e., guests may use 78 * If nested=1, nested virtualization is supported, i.e., guests may use
76 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 79 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -1748,6 +1751,21 @@ static u64 guest_read_tsc(void)
1748} 1751}
1749 1752
1750/* 1753/*
1754 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1755 * counter, even if a nested guest (L2) is currently running.
1756 */
1757u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
1758{
1759 u64 host_tsc, tsc_offset;
1760
1761 rdtscll(host_tsc);
1762 tsc_offset = is_guest_mode(vcpu) ?
1763 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1764 vmcs_read64(TSC_OFFSET);
1765 return host_tsc + tsc_offset;
1766}
1767
1768/*
1751 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ 1769 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
1752 * ioctl. In this case the call-back should update internal vmx state to make 1770 * ioctl. In this case the call-back should update internal vmx state to make
1753 * the changes effective. 1771 * the changes effective.
@@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1762 */ 1780 */
1763static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1781static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1764{ 1782{
1765 vmcs_write64(TSC_OFFSET, offset); 1783 if (is_guest_mode(vcpu)) {
1766 if (is_guest_mode(vcpu))
1767 /* 1784 /*
1768 * We're here if L1 chose not to trap the TSC MSR. Since 1785 * We're here if L1 chose not to trap WRMSR to TSC. According
1769 * prepare_vmcs12() does not copy tsc_offset, we need to also 1786 * to the spec, this should set L1's TSC; The offset that L1
1770 * set the vmcs12 field here. 1787 * set for L2 remains unchanged, and still needs to be added
1788 * to the newly set TSC to get L2's TSC.
1771 */ 1789 */
1772 get_vmcs12(vcpu)->tsc_offset = offset - 1790 struct vmcs12 *vmcs12;
1773 to_vmx(vcpu)->nested.vmcs01_tsc_offset; 1791 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
1792 /* recalculate vmcs02.TSC_OFFSET: */
1793 vmcs12 = get_vmcs12(vcpu);
1794 vmcs_write64(TSC_OFFSET, offset +
1795 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
1796 vmcs12->tsc_offset : 0));
1797 } else {
1798 vmcs_write64(TSC_OFFSET, offset);
1799 }
1774} 1800}
1775 1801
1776static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) 1802static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
2736 2762
2737 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 2763 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
2738 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 2764 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
2739 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 2765 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2740 __func__); 2766 __func__);
2741 vmcs_write32(GUEST_TR_AR_BYTES, 2767 vmcs_write32(GUEST_TR_AR_BYTES,
2742 (guest_tr_ar & ~AR_TYPE_MASK) 2768 (guest_tr_ar & ~AR_TYPE_MASK)
2743 | AR_TYPE_BUSY_64_TSS); 2769 | AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4115 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4141 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4116 if (is_page_fault(intr_info)) { 4142 if (is_page_fault(intr_info)) {
4117 /* EPT won't cause page fault directly */ 4143 /* EPT won't cause page fault directly */
4118 if (enable_ept) 4144 BUG_ON(enable_ept);
4119 BUG();
4120 cr2 = vmcs_readl(EXIT_QUALIFICATION); 4145 cr2 = vmcs_readl(EXIT_QUALIFICATION);
4121 trace_kvm_page_fault(cr2, error_code); 4146 trace_kvm_page_fault(cr2, error_code);
4122 4147
@@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
+	if (likely(fasteoi)) {
+		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+		int access_type, offset;
+
+		access_type = exit_qualification & APIC_ACCESS_TYPE;
+		offset = exit_qualification & APIC_ACCESS_OFFSET;
+		/*
+		 * Sane guest uses MOV to write EOI, with written value
+		 * not cared. So make a short-circuit here by avoiding
+		 * heavy instruction emulation.
+		 */
+		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+		    (offset == APIC_EOI)) {
+			kvm_lapic_set_eoi(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+	}
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 }
 
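
For reference, the access this fast path short-circuits is the guest's ordinary xAPIC end-of-interrupt: a plain MMIO store to offset 0xb0 of the APIC page, whose written value is ignored. A hedged, guest-side illustration (bare-metal style, assuming the xAPIC sits at its default 0xfee00000 base; this is not part of the patch):

#include <stdint.h>

#define GUEST_APIC_BASE	0xfee00000UL	/* default xAPIC MMIO base */
#define GUEST_APIC_EOI	0xb0		/* EOI register offset */

/* A "sane" guest signals EOI with a single MOV; with the fasteoi path
 * above, KVM handles this store via kvm_lapic_set_eoi() instead of the
 * full instruction emulator. */
static inline void guest_apic_eoi(void)
{
	*(volatile uint32_t *)(GUEST_APIC_BASE + GUEST_APIC_EOI) = 0;
}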
@@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return 0;
 
 	if (unlikely(vmx->fail)) {
-		printk(KERN_INFO "%s failed vm entry %x\n",
-		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+				    vmcs_read32(VM_INSTRUCTION_ERROR));
 		return 1;
 	}
 
@@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
@@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
 
 	vmx_complete_atomic_exit(vmx);
 	vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 	return ret;
 }
 
-#define _ER(x) { EXIT_REASON_##x, #x }
-
-static const struct trace_print_flags vmx_exit_reasons_str[] = {
-	_ER(EXCEPTION_NMI),
-	_ER(EXTERNAL_INTERRUPT),
-	_ER(TRIPLE_FAULT),
-	_ER(PENDING_INTERRUPT),
-	_ER(NMI_WINDOW),
-	_ER(TASK_SWITCH),
-	_ER(CPUID),
-	_ER(HLT),
-	_ER(INVLPG),
-	_ER(RDPMC),
-	_ER(RDTSC),
-	_ER(VMCALL),
-	_ER(VMCLEAR),
-	_ER(VMLAUNCH),
-	_ER(VMPTRLD),
-	_ER(VMPTRST),
-	_ER(VMREAD),
-	_ER(VMRESUME),
-	_ER(VMWRITE),
-	_ER(VMOFF),
-	_ER(VMON),
-	_ER(CR_ACCESS),
-	_ER(DR_ACCESS),
-	_ER(IO_INSTRUCTION),
-	_ER(MSR_READ),
-	_ER(MSR_WRITE),
-	_ER(MWAIT_INSTRUCTION),
-	_ER(MONITOR_INSTRUCTION),
-	_ER(PAUSE_INSTRUCTION),
-	_ER(MCE_DURING_VMENTRY),
-	_ER(TPR_BELOW_THRESHOLD),
-	_ER(APIC_ACCESS),
-	_ER(EPT_VIOLATION),
-	_ER(EPT_MISCONFIG),
-	_ER(WBINVD),
-	{ -1, NULL }
-};
-
-#undef _ER
-
 static int vmx_get_lpage_level(void)
 {
 	if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	set_cr4_guest_host_mask(vmx);
 
-	vmcs_write64(TSC_OFFSET,
-		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+		vmcs_write64(TSC_OFFSET,
+			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+	else
+		vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
 	if (enable_vpid) {
 		/*
@@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	if (vmcs12->vm_entry_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_load_count > 0 ||
 	    vmcs12->vm_exit_msr_store_count > 0) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING
-			  "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+		pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+				    __func__);
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
@@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
 
 	load_vmcs12_host_state(vcpu, vmcs12);
 
-	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+	/* Update TSC_OFFSET if TSC was changed while L2 ran */
 	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_mt_mask = vmx_get_mt_mask,
 
 	.get_exit_info = vmx_get_exit_info,
-	.exit_reasons_str = vmx_exit_reasons_str,
 
 	.get_lpage_level = vmx_get_lpage_level,
 
@@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset = vmx_adjust_tsc_offset,
 	.compute_tsc_offset = vmx_compute_tsc_offset,
+	.read_l1_tsc = vmx_read_l1_tsc,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea45fa4..cf269096eadf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -83,6 +83,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	vcpu->arch.nmi_pending = 1;
+	atomic_inc(&vcpu->arch.nmi_queued);
+	kvm_make_request(KVM_REQ_NMI, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
@@ -599,6 +600,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 timer_mode_mask;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	if (!best)
@@ -610,6 +613,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
 			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+		best->function == 0x1) {
+		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+		timer_mode_mask = 3 << 17;
+	} else
+		timer_mode_mask = 1 << 17;
+
+	if (apic)
+		apic->lapic_timer.timer_mode_mask = timer_mode_mask;
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
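
The timer_mode_mask above covers the mode field of the local APIC's LVT Timer register: bits 17:18 encode one-shot (00), periodic (01) and, when advertised, TSC-deadline (10) mode, so Intel guests that get the deadline timer use a two-bit mask while everyone else keeps only the periodic bit. A small sketch of decoding that field (the macro names here are illustrative, not the kernel's):

#include <stdint.h>

#define LVT_TIMER_MODE_SHIFT	17
#define LVT_TIMER_ONESHOT	(0u << LVT_TIMER_MODE_SHIFT)
#define LVT_TIMER_PERIODIC	(1u << LVT_TIMER_MODE_SHIFT)
#define LVT_TIMER_TSCDEADLINE	(2u << LVT_TIMER_MODE_SHIFT)

/* With TSC-deadline exposed, both mode bits matter (mask = 3 << 17);
 * otherwise only the periodic bit does (mask = 1 << 17). */
static uint32_t lvt_timer_mode(uint32_t lvt_timer, uint32_t timer_mode_mask)
{
	return lvt_timer & timer_mode_mask;
}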
@@ -825,6 +838,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
+	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
@@ -1000,7 +1014,7 @@ static inline int kvm_tsc_changes_freq(void)
 	return ret;
 }
 
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.virtual_tsc_khz)
 		return vcpu->arch.virtual_tsc_khz;
@@ -1098,7 +1112,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
 	kernel_ns = get_kernel_ns();
 	this_tsc_khz = vcpu_tsc_khz(v);
 	if (unlikely(this_tsc_khz == 0)) {
@@ -1564,6 +1578,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
+	case MSR_IA32_TSCDEADLINE:
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1825,6 +1842,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+	case HV_X64_MSR_APIC_ASSIST_PAGE:
+		data = vcpu->arch.hv_vapic;
+		break;
 	default:
 		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -1839,7 +1859,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 
 	switch (msr) {
 	case MSR_IA32_PLATFORM_ID:
-	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
 	case MSR_IA32_LASTBRANCHFROMIP:
@@ -1860,6 +1879,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_FAM10H_MMIO_CONF_BASE:
 		data = 0;
 		break;
+	case MSR_IA32_UCODE_REV:
+		data = 0x100000000ULL;
+		break;
 	case MSR_MTRRcap:
 		data = 0x500 | KVM_NR_VAR_MTRR;
 		break;
@@ -1888,6 +1910,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_read(vcpu, msr, pdata);
 		break;
+	case MSR_IA32_TSCDEADLINE:
+		data = kvm_get_lapic_tscdeadline_msr(vcpu);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
@@ -2086,6 +2111,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 		break;
 	case KVM_CAP_NR_VCPUS:
+		r = KVM_SOFT_MAX_VCPUS;
+		break;
+	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
 	case KVM_CAP_NR_MEMSLOTS:
@@ -2210,7 +2238,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		s64 tsc_delta;
 		u64 tsc;
 
-		kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+		tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
 			     tsc - vcpu->arch.last_guest_tsc;
 
@@ -2234,7 +2262,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
 static int is_efer_nx(void)
@@ -2819,6 +2847,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
+	process_nmi(vcpu);
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2865,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
-	events->nmi.pending = vcpu->arch.nmi_pending;
+	events->nmi.pending = vcpu->arch.nmi_pending != 0;
 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 	events->nmi.pad = 0;
 
@@ -2856,6 +2885,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
+	process_nmi(vcpu);
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -3556,7 +3586,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (r) {
 			mutex_lock(&kvm->slots_lock);
 			kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-						  &vpic->dev);
+						  &vpic->dev_master);
+			kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+						  &vpic->dev_slave);
+			kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+						  &vpic->dev_eclr);
 			mutex_unlock(&kvm->slots_lock);
 			kfree(vpic);
 			goto create_irqchip_unlock;
@@ -4045,84 +4079,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	return 0;
 }
 
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
-				  unsigned long addr,
-				  void *val,
-				  unsigned int bytes,
-				  struct x86_exception *exception)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+			const void *val, int bytes)
 {
-	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	gpa_t gpa;
-	int handled, ret;
+	int ret;
 
+	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+	if (ret < 0)
+		return 0;
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+	return 1;
+}
+
+struct read_write_emulator_ops {
+	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+				  int bytes);
+	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+				  void *val, int bytes);
+	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       int bytes, void *val);
+	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+				    void *val, int bytes);
+	bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
 			       vcpu->mmio_phys_addr, *(u64 *)val);
 		vcpu->mmio_read_completed = 0;
-		return X86EMUL_CONTINUE;
+		return 1;
 	}
 
-	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
-
-	if (ret < 0)
-		return X86EMUL_PROPAGATE_FAULT;
-
-	if (ret)
-		goto mmio;
-
-	if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
-	    == X86EMUL_CONTINUE)
-		return X86EMUL_CONTINUE;
+	return 0;
+}
 
-mmio:
-	/*
-	 * Is this MMIO handled locally?
-	 */
-	handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+			void *val, int bytes)
+{
+	return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+}
 
-	if (handled == bytes)
-		return X86EMUL_CONTINUE;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+			 void *val, int bytes)
+{
+	return emulator_write_phys(vcpu, gpa, val, bytes);
+}
 
-	gpa += handled;
-	bytes -= handled;
-	val += handled;
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
 
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+			  void *val, int bytes)
+{
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
-
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
-	vcpu->mmio_index = 0;
-
 	return X86EMUL_IO_NEEDED;
 }
 
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			const void *val, int bytes)
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+			   void *val, int bytes)
 {
-	int ret;
-
-	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-	if (ret < 0)
-		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-	return 1;
+	memcpy(vcpu->mmio_data, val, bytes);
+	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	return X86EMUL_CONTINUE;
 }
 
-static int emulator_write_emulated_onepage(unsigned long addr,
-					   const void *val,
-					   unsigned int bytes,
-					   struct x86_exception *exception,
-					   struct kvm_vcpu *vcpu)
+static struct read_write_emulator_ops read_emultor = {
+	.read_write_prepare = read_prepare,
+	.read_write_emulate = read_emulate,
+	.read_write_mmio = vcpu_mmio_read,
+	.read_write_exit_mmio = read_exit_mmio,
+};
+
+static struct read_write_emulator_ops write_emultor = {
+	.read_write_emulate = write_emulate,
+	.read_write_mmio = write_mmio,
+	.read_write_exit_mmio = write_exit_mmio,
+	.write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+				       unsigned int bytes,
+				       struct x86_exception *exception,
+				       struct kvm_vcpu *vcpu,
+				       struct read_write_emulator_ops *ops)
 {
 	gpa_t gpa;
 	int handled, ret;
+	bool write = ops->write;
 
-	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
+	if (ops->read_write_prepare &&
+	    ops->read_write_prepare(vcpu, val, bytes))
+		return X86EMUL_CONTINUE;
+
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
 	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
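
The read_emultor/write_emultor tables introduced above let a single code path serve both directions: the generic onepage routine first asks the ops table whether the access can be satisfied against guest memory (read_write_emulate), then tries the in-kernel MMIO bus (read_write_mmio), and finally defers to userspace (read_write_exit_mmio). A stripped-down sketch of that dispatch shape, using simplified stand-in types rather than the kernel's structures:

/* Illustrative only -- not the kernel's types or return conventions. */
#include <stdbool.h>

struct rw_ops {
	bool (*emulate)(void *ctx, unsigned long gpa, void *val, int bytes);
	int  (*mmio)(void *ctx, unsigned long gpa, int bytes, void *val);
	int  (*exit_mmio)(void *ctx, unsigned long gpa, void *val, int bytes);
	bool write;
};

enum { RW_DONE, RW_NEED_USERSPACE };

static int rw_onepage(void *ctx, unsigned long gpa, void *val, int bytes,
		      const struct rw_ops *ops)
{
	if (ops->emulate(ctx, gpa, val, bytes))		/* ordinary guest memory */
		return RW_DONE;
	if (ops->mmio(ctx, gpa, bytes, val) == bytes)	/* in-kernel device */
		return RW_DONE;
	return ops->exit_mmio(ctx, gpa, val, bytes);	/* punt to userspace */
}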
@@ -4131,15 +4186,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if (ret)
 		goto mmio;
 
-	if (emulator_write_phys(vcpu, gpa, val, bytes))
+	if (ops->read_write_emulate(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
 mmio:
-	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
 	if (handled == bytes)
 		return X86EMUL_CONTINUE;
 
@@ -4148,23 +4202,20 @@ mmio:
 	val += handled;
 
 	vcpu->mmio_needed = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->mmio_size = bytes;
 	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
 	vcpu->mmio_index = 0;
 
-	return X86EMUL_CONTINUE;
+	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
-			    unsigned long addr,
-			    const void *val,
-			    unsigned int bytes,
-			    struct x86_exception *exception)
+int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+			void *val, unsigned int bytes,
+			struct x86_exception *exception,
+			struct read_write_emulator_ops *ops)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
@@ -4173,16 +4224,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, exception,
-						     vcpu);
+		rc = emulator_read_write_onepage(addr, val, now, exception,
+						 vcpu, ops);
+
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, exception,
-					       vcpu);
+
+	return emulator_read_write_onepage(addr, val, bytes, exception,
+					   vcpu, ops);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+				  unsigned long addr,
+				  void *val,
+				  unsigned int bytes,
+				  struct x86_exception *exception)
+{
+	return emulator_read_write(ctxt, addr, val, bytes,
+				   exception, &read_emultor);
+}
+
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+			    unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct x86_exception *exception)
+{
+	return emulator_read_write(ctxt, addr, (void *)val, bytes,
+				   exception, &write_emultor);
 }
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
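
In the hunk above, emulator_read_write() splits an access that straddles a page boundary: now = -addr & ~PAGE_MASK is the number of bytes left in the page containing addr, and the split path is only taken when the range actually crosses into the next page. For example, with 4 KiB pages and addr = 0x1ffe, a 4-byte access is handled as 2 bytes on the first page and 2 on the next. A standalone check of that arithmetic:

#include <assert.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Bytes remaining in the page that contains addr. */
static unsigned long bytes_to_page_end(unsigned long addr)
{
	return -addr & ~PAGE_MASK;
}

int main(void)
{
	assert(bytes_to_page_end(0x1ffe) == 2);		/* 2 bytes now, 2 later */
	assert(bytes_to_page_end(0x2001) == 4095);	/* nearly a whole page */
	assert(bytes_to_page_end(0x2000) == 0);		/* aligned: no split taken */
	return 0;
}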
@@ -4712,7 +4785,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 	kvm_set_rflags(vcpu, ctxt->eflags);
 
 	if (irq == NMI_VECTOR)
-		vcpu->arch.nmi_pending = false;
+		vcpu->arch.nmi_pending = 0;
 	else
 		vcpu->arch.interrupt.pending = false;
 
@@ -4788,7 +4861,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
-		if (r) {
+		if (r != EMULATION_OK) {
 			if (emulation_type & EMULTYPE_TRAP_UD)
 				return EMULATE_FAIL;
 			if (reexecute_instruction(vcpu, cr2))
@@ -5521,7 +5594,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	/* try to inject new event if pending */
 	if (vcpu->arch.nmi_pending) {
 		if (kvm_x86_ops->nmi_allowed(vcpu)) {
-			vcpu->arch.nmi_pending = false;
+			--vcpu->arch.nmi_pending;
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
@@ -5553,10 +5626,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+	unsigned limit = 2;
+
+	/*
+	 * x86 is limited to one NMI running, and one NMI pending after it.
+	 * If an NMI is already in progress, limit further NMIs to just one.
+	 * Otherwise, allow two (and we'll inject the first one immediately).
+	 */
+	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+		limit = 1;
+
+	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
-	bool nmi_pending;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 
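
process_nmi() drains the lock-free nmi_queued counter into nmi_pending and clamps it: at most two NMIs can be outstanding (one being delivered plus one latched), and only one more if an NMI is already in flight, mirroring how hardware collapses back-to-back NMIs. A small sketch of just the clamping behaviour in isolation (plain C, not the kernel's types):

#include <assert.h>

/* 'in_progress' stands in for "NMI currently masked or already injected". */
static unsigned int clamp_nmis(unsigned int pending, unsigned int queued,
			       int in_progress)
{
	unsigned int limit = in_progress ? 1 : 2;

	pending += queued;
	return pending < limit ? pending : limit;
}

int main(void)
{
	assert(clamp_nmis(0, 5, 0) == 2);	/* burst of NMIs: deliver two */
	assert(clamp_nmis(0, 5, 1) == 1);	/* one already running: keep one */
	assert(clamp_nmis(0, 0, 0) == 0);
	return 0;
}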
@@ -5596,6 +5685,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_NMI, vcpu))
+			process_nmi(vcpu);
 
 	}
 
@@ -5603,19 +5694,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (unlikely(r))
 		goto out;
 
-	/*
-	 * An NMI can be injected between local nmi_pending read and
-	 * vcpu->arch.nmi_pending read inside inject_pending_event().
-	 * But in that case, KVM_REQ_EVENT will be set, which makes
-	 * the race described above benign.
-	 */
-	nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 		inject_pending_event(vcpu);
 
 		/* enable NMI/IRQ window open exits if needed */
-		if (nmi_pending)
+		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
 		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
@@ -5678,7 +5761,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -6323,7 +6406,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.nmi_pending = false;
+	atomic_set(&vcpu->arch.nmi_queued, 0);
+	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;
 
 	vcpu->arch.switch_db_regs = 0;
@@ -6598,7 +6682,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 		 !vcpu->arch.apf.halted)
 		|| !list_empty_careful(&vcpu->async_pf.done)
 		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-		|| vcpu->arch.nmi_pending ||
+		|| atomic_read(&vcpu->arch.nmi_queued) ||
 		(kvm_arch_interrupt_allowed(vcpu) &&
 		 kvm_cpu_has_interrupt(vcpu));
 }