aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-04 12:30:33 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-04 12:30:33 -0400
commit ecefbd94b834fa32559d854646d777c56749ef1c (patch)
tree ca8958900ad9e208a8e5fb7704f1b66dc76131b4 /arch/x86
parent ce57e981f2b996aaca2031003b3f866368307766 (diff)
parent 3d11df7abbff013b811d5615320580cd5d9d7d31 (diff)
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity: "Highlights of the changes for this release include support for vfio level triggered interrupts, improved big real mode support on older Intels, a streamlined guest page table walker, guest APIC speedups, PIO optimizations, better overcommit handling, and read-only memory." * tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits) KVM: s390: Fix vcpu_load handling in interrupt code KVM: x86: Fix guest debug across vcpu INIT reset KVM: Add resampling irqfds for level triggered interrupts KVM: optimize apic interrupt delivery KVM: MMU: Eliminate pointless temporary 'ac' KVM: MMU: Avoid access/dirty update loop if all is well KVM: MMU: Eliminate eperm temporary KVM: MMU: Optimize is_last_gpte() KVM: MMU: Simplify walk_addr_generic() loop KVM: MMU: Optimize pte permission checks KVM: MMU: Update accessed and dirty bits after guest pagetable walk KVM: MMU: Move gpte_access() out of paging_tmpl.h KVM: MMU: Optimize gpte_access() slightly KVM: MMU: Push clean gpte write protection out of gpte_access() KVM: clarify kvmclock documentation KVM: make processes waiting on vcpu mutex killable KVM: SVM: Make use of asm.h KVM: VMX: Make use of asm.h KVM: VMX: Make lto-friendly KVM: x86: lapic: Clean up find_highest_vector() and count_vectors() ... Conflicts: arch/s390/include/asm/processor.h arch/x86/kvm/i8259.c
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig21
-rw-r--r--arch/x86/include/asm/kvm.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h48
-rw-r--r--arch/x86/include/asm/kvm_host.h36
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/kvm.c3
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c14
-rw-r--r--arch/x86/kvm/emulate.c538
-rw-r--r--arch/x86/kvm/i8254.c64
-rw-r--r--arch/x86/kvm/i8254.h6
-rw-r--r--arch/x86/kvm/i8259.c70
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_timer.h18
-rw-r--r--arch/x86/kvm/lapic.c484
-rw-r--r--arch/x86/kvm/lapic.h61
-rw-r--r--arch/x86/kvm/mmu.c240
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu_audit.c8
-rw-r--r--arch/x86/kvm/paging_tmpl.h199
-rw-r--r--arch/x86/kvm/pmu.c2
-rw-r--r--arch/x86/kvm/svm.c82
-rw-r--r--arch/x86/kvm/timer.c47
-rw-r--r--arch/x86/kvm/vmx.c233
-rw-r--r--arch/x86/kvm/x86.c384
-rw-r--r--arch/x86/kvm/x86.h1
29 files changed, 1489 insertions, 1113 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7f9a395c5254..b72777ff32a9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -586,23 +586,18 @@ config PARAVIRT_TIME_ACCOUNTING
586 586
587source "arch/x86/xen/Kconfig" 587source "arch/x86/xen/Kconfig"
588 588
589config KVM_CLOCK
590 bool "KVM paravirtualized clock"
591 select PARAVIRT
592 select PARAVIRT_CLOCK
593 ---help---
594 Turning on this option will allow you to run a paravirtualized clock
595 when running over the KVM hypervisor. Instead of relying on a PIT
596 (or probably other) emulation by the underlying device model, the host
597 provides the guest with timing infrastructure such as time of day, and
598 system time
599
600config KVM_GUEST 589config KVM_GUEST
601 bool "KVM Guest support" 590 bool "KVM Guest support (including kvmclock)"
591 select PARAVIRT
602 select PARAVIRT 592 select PARAVIRT
593 select PARAVIRT_CLOCK
594 default y if PARAVIRT_GUEST
603 ---help--- 595 ---help---
604 This option enables various optimizations for running under the KVM 596 This option enables various optimizations for running under the KVM
605 hypervisor. 597 hypervisor. It includes a paravirtualized clock, so that instead
598 of relying on a PIT (or probably other) emulation by the
599 underlying device model, the host provides the guest with
600 timing infrastructure such as time of day, and system time
606 601
607source "arch/x86/lguest/Kconfig" 602source "arch/x86/lguest/Kconfig"
608 603
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 41e08cb6a092..a65ec29e6ffb 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -41,6 +41,7 @@
41#define __KVM_HAVE_DEBUGREGS 41#define __KVM_HAVE_DEBUGREGS
42#define __KVM_HAVE_XSAVE 42#define __KVM_HAVE_XSAVE
43#define __KVM_HAVE_XCRS 43#define __KVM_HAVE_XCRS
44#define __KVM_HAVE_READONLY_MEM
44 45
45/* Architectural interrupt line count. */ 46/* Architectural interrupt line count. */
46#define KVM_NR_INTERRUPTS 256 47#define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c764f43b71c5..15f960c06ff7 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -86,6 +86,19 @@ struct x86_instruction_info {
86 86
87struct x86_emulate_ops { 87struct x86_emulate_ops {
88 /* 88 /*
89 * read_gpr: read a general purpose register (rax - r15)
90 *
91 * @reg: gpr number.
92 */
93 ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
94 /*
95 * write_gpr: write a general purpose register (rax - r15)
96 *
97 * @reg: gpr number.
98 * @val: value to write.
99 */
100 void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
101 /*
89 * read_std: Read bytes of standard (non-emulated/special) memory. 102 * read_std: Read bytes of standard (non-emulated/special) memory.
90 * Used for descriptor reading. 103 * Used for descriptor reading.
91 * @addr: [IN ] Linear address from which to read. 104 * @addr: [IN ] Linear address from which to read.
@@ -200,8 +213,9 @@ typedef u32 __attribute__((vector_size(16))) sse128_t;
200 213
201/* Type, address-of, and value of an instruction's operand. */ 214/* Type, address-of, and value of an instruction's operand. */
202struct operand { 215struct operand {
203 enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; 216 enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
204 unsigned int bytes; 217 unsigned int bytes;
218 unsigned int count;
205 union { 219 union {
206 unsigned long orig_val; 220 unsigned long orig_val;
207 u64 orig_val64; 221 u64 orig_val64;
@@ -221,6 +235,7 @@ struct operand {
221 char valptr[sizeof(unsigned long) + 2]; 235 char valptr[sizeof(unsigned long) + 2];
222 sse128_t vec_val; 236 sse128_t vec_val;
223 u64 mm_val; 237 u64 mm_val;
238 void *data;
224 }; 239 };
225}; 240};
226 241
@@ -236,14 +251,23 @@ struct read_cache {
236 unsigned long end; 251 unsigned long end;
237}; 252};
238 253
254/* Execution mode, passed to the emulator. */
255enum x86emul_mode {
256 X86EMUL_MODE_REAL, /* Real mode. */
257 X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
258 X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
259 X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
260 X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
261};
262
239struct x86_emulate_ctxt { 263struct x86_emulate_ctxt {
240 struct x86_emulate_ops *ops; 264 const struct x86_emulate_ops *ops;
241 265
242 /* Register state before/after emulation. */ 266 /* Register state before/after emulation. */
243 unsigned long eflags; 267 unsigned long eflags;
244 unsigned long eip; /* eip before instruction emulation */ 268 unsigned long eip; /* eip before instruction emulation */
245 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
246 int mode; 270 enum x86emul_mode mode;
247 271
248 /* interruptibility state, as a result of execution of STI or MOV SS */ 272 /* interruptibility state, as a result of execution of STI or MOV SS */
249 int interruptibility; 273 int interruptibility;
@@ -281,8 +305,10 @@ struct x86_emulate_ctxt {
281 bool rip_relative; 305 bool rip_relative;
282 unsigned long _eip; 306 unsigned long _eip;
283 struct operand memop; 307 struct operand memop;
308 u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */
309 u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */
284 /* Fields above regs are cleared together. */ 310 /* Fields above regs are cleared together. */
285 unsigned long regs[NR_VCPU_REGS]; 311 unsigned long _regs[NR_VCPU_REGS];
286 struct operand *memopp; 312 struct operand *memopp;
287 struct fetch_cache fetch; 313 struct fetch_cache fetch;
288 struct read_cache io_read; 314 struct read_cache io_read;
@@ -293,17 +319,6 @@ struct x86_emulate_ctxt {
293#define REPE_PREFIX 0xf3 319#define REPE_PREFIX 0xf3
294#define REPNE_PREFIX 0xf2 320#define REPNE_PREFIX 0xf2
295 321
296/* Execution mode, passed to the emulator. */
297#define X86EMUL_MODE_REAL 0 /* Real mode. */
298#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
299#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
300#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
301#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
302
303/* any protected mode */
304#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
305 X86EMUL_MODE_PROT64)
306
307/* CPUID vendors */ 322/* CPUID vendors */
308#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 323#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
309#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 324#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
@@ -394,4 +409,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
394 u16 tss_selector, int idt_index, int reason, 409 u16 tss_selector, int idt_index, int reason,
395 bool has_error_code, u32 error_code); 410 bool has_error_code, u32 error_code);
396int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); 411int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
412void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
413void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
414
397#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 415#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1eaa6b056670..b2e11f452435 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -271,10 +271,24 @@ struct kvm_mmu {
271 union kvm_mmu_page_role base_role; 271 union kvm_mmu_page_role base_role;
272 bool direct_map; 272 bool direct_map;
273 273
274 /*
275 * Bitmap; bit set = permission fault
276 * Byte index: page fault error code [4:1]
277 * Bit index: pte permissions in ACC_* format
278 */
279 u8 permissions[16];
280
274 u64 *pae_root; 281 u64 *pae_root;
275 u64 *lm_root; 282 u64 *lm_root;
276 u64 rsvd_bits_mask[2][4]; 283 u64 rsvd_bits_mask[2][4];
277 284
285 /*
286 * Bitmap: bit set = last pte in walk
287 * index[0:1]: level (zero-based)
288 * index[2]: pte.ps
289 */
290 u8 last_pte_bitmap;
291
278 bool nx; 292 bool nx;
279 293
280 u64 pdptrs[4]; /* pae */ 294 u64 pdptrs[4]; /* pae */
@@ -398,12 +412,15 @@ struct kvm_vcpu_arch {
398 struct x86_emulate_ctxt emulate_ctxt; 412 struct x86_emulate_ctxt emulate_ctxt;
399 bool emulate_regs_need_sync_to_vcpu; 413 bool emulate_regs_need_sync_to_vcpu;
400 bool emulate_regs_need_sync_from_vcpu; 414 bool emulate_regs_need_sync_from_vcpu;
415 int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
401 416
402 gpa_t time; 417 gpa_t time;
403 struct pvclock_vcpu_time_info hv_clock; 418 struct pvclock_vcpu_time_info hv_clock;
404 unsigned int hw_tsc_khz; 419 unsigned int hw_tsc_khz;
405 unsigned int time_offset; 420 unsigned int time_offset;
406 struct page *time_page; 421 struct page *time_page;
422 /* set guest stopped flag in pvclock flags field */
423 bool pvclock_set_guest_stopped_request;
407 424
408 struct { 425 struct {
409 u64 msr_val; 426 u64 msr_val;
@@ -438,6 +455,7 @@ struct kvm_vcpu_arch {
438 unsigned long dr6; 455 unsigned long dr6;
439 unsigned long dr7; 456 unsigned long dr7;
440 unsigned long eff_db[KVM_NR_DB_REGS]; 457 unsigned long eff_db[KVM_NR_DB_REGS];
458 unsigned long guest_debug_dr7;
441 459
442 u64 mcg_cap; 460 u64 mcg_cap;
443 u64 mcg_status; 461 u64 mcg_status;
@@ -484,14 +502,24 @@ struct kvm_vcpu_arch {
484}; 502};
485 503
486struct kvm_lpage_info { 504struct kvm_lpage_info {
487 unsigned long rmap_pde;
488 int write_count; 505 int write_count;
489}; 506};
490 507
491struct kvm_arch_memory_slot { 508struct kvm_arch_memory_slot {
509 unsigned long *rmap[KVM_NR_PAGE_SIZES];
492 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; 510 struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
493}; 511};
494 512
513struct kvm_apic_map {
514 struct rcu_head rcu;
515 u8 ldr_bits;
516 /* fields below are used to decode ldr values in different modes */
517 u32 cid_shift, cid_mask, lid_mask;
518 struct kvm_lapic *phys_map[256];
519 /* first index is cluster id second is cpu id in a cluster */
520 struct kvm_lapic *logical_map[16][16];
521};
522
495struct kvm_arch { 523struct kvm_arch {
496 unsigned int n_used_mmu_pages; 524 unsigned int n_used_mmu_pages;
497 unsigned int n_requested_mmu_pages; 525 unsigned int n_requested_mmu_pages;
@@ -509,6 +537,8 @@ struct kvm_arch {
509 struct kvm_ioapic *vioapic; 537 struct kvm_ioapic *vioapic;
510 struct kvm_pit *vpit; 538 struct kvm_pit *vpit;
511 int vapics_in_nmi_mode; 539 int vapics_in_nmi_mode;
540 struct mutex apic_map_lock;
541 struct kvm_apic_map *apic_map;
512 542
513 unsigned int tss_addr; 543 unsigned int tss_addr;
514 struct page *apic_access_page; 544 struct page *apic_access_page;
@@ -602,8 +632,7 @@ struct kvm_x86_ops {
602 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 632 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
603 void (*vcpu_put)(struct kvm_vcpu *vcpu); 633 void (*vcpu_put)(struct kvm_vcpu *vcpu);
604 634
605 void (*set_guest_debug)(struct kvm_vcpu *vcpu, 635 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
606 struct kvm_guest_debug *dbg);
607 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 636 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
608 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 637 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
609 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 638 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -941,6 +970,7 @@ extern bool kvm_rebooting;
941 970
942#define KVM_ARCH_WANT_MMU_NOTIFIER 971#define KVM_ARCH_WANT_MMU_NOTIFIER
943int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 972int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
973int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
944int kvm_age_hva(struct kvm *kvm, unsigned long hva); 974int kvm_age_hva(struct kvm *kvm, unsigned long hva);
945int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 975int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
946void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 976void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 2f7712e08b1e..eb3e9d85e1f1 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -102,21 +102,21 @@ struct kvm_vcpu_pv_apf_data {
102extern void kvmclock_init(void); 102extern void kvmclock_init(void);
103extern int kvm_register_clock(char *txt); 103extern int kvm_register_clock(char *txt);
104 104
105#ifdef CONFIG_KVM_CLOCK 105#ifdef CONFIG_KVM_GUEST
106bool kvm_check_and_clear_guest_paused(void); 106bool kvm_check_and_clear_guest_paused(void);
107#else 107#else
108static inline bool kvm_check_and_clear_guest_paused(void) 108static inline bool kvm_check_and_clear_guest_paused(void)
109{ 109{
110 return false; 110 return false;
111} 111}
112#endif /* CONFIG_KVMCLOCK */ 112#endif /* CONFIG_KVM_GUEST */
113 113
114/* This instruction is vmcall. On non-VT architectures, it will generate a 114/* This instruction is vmcall. On non-VT architectures, it will generate a
115 * trap that we will then rewrite to the appropriate instruction. 115 * trap that we will then rewrite to the appropriate instruction.
116 */ 116 */
117#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" 117#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
118 118
119/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun 119/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
120 * instruction. The hypervisor may replace it with something else but only the 120 * instruction. The hypervisor may replace it with something else but only the
121 * instructions are guaranteed to be supported. 121 * instructions are guaranteed to be supported.
122 * 122 *
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8d7a619718b5..a48ea05157d3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o 82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 83
84obj-$(CONFIG_KVM_GUEST) += kvm.o 84obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
86obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
87obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o 86obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
88obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 87obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f1..b3e5e51bc907 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 354 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
355 wrmsrl(MSR_KVM_PV_EOI_EN, 0); 355 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
356 kvm_pv_disable_apf(); 356 kvm_pv_disable_apf();
357 kvm_disable_steal_time();
357} 358}
358 359
359static int kvm_pv_reboot_notify(struct notifier_block *nb, 360static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
396#ifdef CONFIG_SMP 397#ifdef CONFIG_SMP
397static void __init kvm_smp_prepare_boot_cpu(void) 398static void __init kvm_smp_prepare_boot_cpu(void)
398{ 399{
399#ifdef CONFIG_KVM_CLOCK
400 WARN_ON(kvm_register_clock("primary cpu clock")); 400 WARN_ON(kvm_register_clock("primary cpu clock"));
401#endif
402 kvm_guest_cpu_init(); 401 kvm_guest_cpu_init();
403 native_smp_prepare_boot_cpu(); 402 native_smp_prepare_boot_cpu();
404} 403}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4f165479c453..d609be046b57 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -957,7 +957,7 @@ void __init setup_arch(char **cmdline_p)
957 initmem_init(); 957 initmem_init();
958 memblock_find_dma_reserve(); 958 memblock_find_dma_reserve();
959 959
960#ifdef CONFIG_KVM_CLOCK 960#ifdef CONFIG_KVM_GUEST
961 kvmclock_init(); 961 kvmclock_init();
962#endif 962#endif
963 963
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843ea..586f00059805 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -20,6 +20,7 @@ if VIRTUALIZATION
20config KVM 20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 depends on HIGH_RES_TIMERS
23 # for device assignment: 24 # for device assignment:
24 depends on PCI 25 depends on PCI
25 # for TASKSTATS/TASK_DELAY_ACCT: 26 # for TASKSTATS/TASK_DELAY_ACCT:
@@ -37,6 +38,7 @@ config KVM
37 select TASK_DELAY_ACCT 38 select TASK_DELAY_ACCT
38 select PERF_EVENTS 39 select PERF_EVENTS
39 select HAVE_KVM_MSI 40 select HAVE_KVM_MSI
41 select HAVE_KVM_CPU_RELAX_INTERCEPT
40 ---help--- 42 ---help---
41 Support hosting fully virtualized guest machines using hardware 43 Support hosting fully virtualized guest machines using hardware
42 virtualization extensions. You will need a fairly recent 44 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcacf..04d30401c5cb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) 12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
13 13
14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
15 i8254.o timer.o cpuid.o pmu.o 15 i8254.o cpuid.o pmu.o
16kvm-intel-y += vmx.o 16kvm-intel-y += vmx.o
17kvm-amd-y += svm.o 17kvm-amd-y += svm.o
18 18
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c..ec79e773342e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
316 } 316 }
317 case 7: { 317 case 7: {
318 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 318 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
319 /* Mask ebx against host capbability word 9 */ 319 /* Mask ebx against host capability word 9 */
320 if (index == 0) { 320 if (index == 0) {
321 entry->ebx &= kvm_supported_word9_x86_features; 321 entry->ebx &= kvm_supported_word9_x86_features;
322 cpuid_mask(&entry->ebx, 9); 322 cpuid_mask(&entry->ebx, 9);
@@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
397 break; 397 break;
398 } 398 }
399 case KVM_CPUID_SIGNATURE: { 399 case KVM_CPUID_SIGNATURE: {
400 char signature[12] = "KVMKVMKVM\0\0"; 400 static const char signature[12] = "KVMKVMKVM\0\0";
401 u32 *sigptr = (u32 *)signature; 401 const u32 *sigptr = (const u32 *)signature;
402 entry->eax = KVM_CPUID_FEATURES; 402 entry->eax = KVM_CPUID_FEATURES;
403 entry->ebx = sigptr[0]; 403 entry->ebx = sigptr[0];
404 entry->ecx = sigptr[1]; 404 entry->ecx = sigptr[1];
@@ -484,10 +484,10 @@ struct kvm_cpuid_param {
484 u32 func; 484 u32 func;
485 u32 idx; 485 u32 idx;
486 bool has_leaf_count; 486 bool has_leaf_count;
487 bool (*qualifier)(struct kvm_cpuid_param *param); 487 bool (*qualifier)(const struct kvm_cpuid_param *param);
488}; 488};
489 489
490static bool is_centaur_cpu(struct kvm_cpuid_param *param) 490static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
491{ 491{
492 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; 492 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
493} 493}
@@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
498 struct kvm_cpuid_entry2 *cpuid_entries; 498 struct kvm_cpuid_entry2 *cpuid_entries;
499 int limit, nent = 0, r = -E2BIG, i; 499 int limit, nent = 0, r = -E2BIG, i;
500 u32 func; 500 u32 func;
501 static struct kvm_cpuid_param param[] = { 501 static const struct kvm_cpuid_param param[] = {
502 { .func = 0, .has_leaf_count = true }, 502 { .func = 0, .has_leaf_count = true },
503 { .func = 0x80000000, .has_leaf_count = true }, 503 { .func = 0x80000000, .has_leaf_count = true },
504 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, 504 { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
@@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
517 517
518 r = 0; 518 r = 0;
519 for (i = 0; i < ARRAY_SIZE(param); i++) { 519 for (i = 0; i < ARRAY_SIZE(param); i++) {
520 struct kvm_cpuid_param *ent = &param[i]; 520 const struct kvm_cpuid_param *ent = &param[i];
521 521
522 if (ent->qualifier && !ent->qualifier(ent)) 522 if (ent->qualifier && !ent->qualifier(ent))
523 continue; 523 continue;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be88..39171cb307ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,9 +161,9 @@ struct opcode {
161 u64 intercept : 8; 161 u64 intercept : 8;
162 union { 162 union {
163 int (*execute)(struct x86_emulate_ctxt *ctxt); 163 int (*execute)(struct x86_emulate_ctxt *ctxt);
164 struct opcode *group; 164 const struct opcode *group;
165 struct group_dual *gdual; 165 const struct group_dual *gdual;
166 struct gprefix *gprefix; 166 const struct gprefix *gprefix;
167 } u; 167 } u;
168 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 168 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
169}; 169};
@@ -202,6 +202,42 @@ struct gprefix {
202#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a 202#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
203#define EFLG_RESERVED_ONE_MASK 2 203#define EFLG_RESERVED_ONE_MASK 2
204 204
205static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
206{
207 if (!(ctxt->regs_valid & (1 << nr))) {
208 ctxt->regs_valid |= 1 << nr;
209 ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
210 }
211 return ctxt->_regs[nr];
212}
213
214static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
215{
216 ctxt->regs_valid |= 1 << nr;
217 ctxt->regs_dirty |= 1 << nr;
218 return &ctxt->_regs[nr];
219}
220
221static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
222{
223 reg_read(ctxt, nr);
224 return reg_write(ctxt, nr);
225}
226
227static void writeback_registers(struct x86_emulate_ctxt *ctxt)
228{
229 unsigned reg;
230
231 for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
232 ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
233}
234
235static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
236{
237 ctxt->regs_dirty = 0;
238 ctxt->regs_valid = 0;
239}
240
205/* 241/*
206 * Instruction emulation: 242 * Instruction emulation:
207 * Most instructions are emulated directly via a fragment of inline assembly 243 * Most instructions are emulated directly via a fragment of inline assembly
@@ -374,8 +410,8 @@ struct gprefix {
374#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ 410#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
375 do { \ 411 do { \
376 unsigned long _tmp; \ 412 unsigned long _tmp; \
377 ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ 413 ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX); \
378 ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ 414 ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX); \
379 \ 415 \
380 __asm__ __volatile__ ( \ 416 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0", "5", "1") \ 417 _PRE_EFLAGS("0", "5", "1") \
@@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
494 530
495static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) 531static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
496{ 532{
497 masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); 533 masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
498} 534}
499 535
500static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 536static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -632,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
632 668
633 la = seg_base(ctxt, addr.seg) + addr.ea; 669 la = seg_base(ctxt, addr.seg) + addr.ea;
634 switch (ctxt->mode) { 670 switch (ctxt->mode) {
635 case X86EMUL_MODE_REAL:
636 break;
637 case X86EMUL_MODE_PROT64: 671 case X86EMUL_MODE_PROT64:
638 if (((signed long)la << 16) >> 16 != la) 672 if (((signed long)la << 16) >> 16 != la)
639 return emulate_gp(ctxt, 0); 673 return emulate_gp(ctxt, 0);
@@ -655,7 +689,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
655 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) 689 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
656 goto bad; 690 goto bad;
657 } else { 691 } else {
658 /* exapand-down segment */ 692 /* expand-down segment */
659 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) 693 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
660 goto bad; 694 goto bad;
661 lim = desc.d ? 0xffffffff : 0xffff; 695 lim = desc.d ? 0xffffffff : 0xffff;
@@ -663,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
663 goto bad; 697 goto bad;
664 } 698 }
665 cpl = ctxt->ops->cpl(ctxt); 699 cpl = ctxt->ops->cpl(ctxt);
666 rpl = sel & 3; 700 if (ctxt->mode == X86EMUL_MODE_REAL)
701 rpl = 0;
702 else
703 rpl = sel & 3;
667 cpl = max(cpl, rpl); 704 cpl = max(cpl, rpl);
668 if (!(desc.type & 8)) { 705 if (!(desc.type & 8)) {
669 /* data segment */ 706 /* data segment */
@@ -688,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
688 return X86EMUL_CONTINUE; 725 return X86EMUL_CONTINUE;
689bad: 726bad:
690 if (addr.seg == VCPU_SREG_SS) 727 if (addr.seg == VCPU_SREG_SS)
691 return emulate_ss(ctxt, addr.seg); 728 return emulate_ss(ctxt, sel);
692 else 729 else
693 return emulate_gp(ctxt, addr.seg); 730 return emulate_gp(ctxt, sel);
694} 731}
695 732
696static int linearize(struct x86_emulate_ctxt *ctxt, 733static int linearize(struct x86_emulate_ctxt *ctxt,
@@ -786,14 +823,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
786 * pointer into the block that addresses the relevant register. 823 * pointer into the block that addresses the relevant register.
787 * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 824 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
788 */ 825 */
789static void *decode_register(u8 modrm_reg, unsigned long *regs, 826static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
790 int highbyte_regs) 827 int highbyte_regs)
791{ 828{
792 void *p; 829 void *p;
793 830
794 p = &regs[modrm_reg];
795 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) 831 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
796 p = (unsigned char *)&regs[modrm_reg & 3] + 1; 832 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
833 else
834 p = reg_rmw(ctxt, modrm_reg);
797 return p; 835 return p;
798} 836}
799 837
@@ -871,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
871{ 909{
872 ctxt->ops->get_fpu(ctxt); 910 ctxt->ops->get_fpu(ctxt);
873 switch (reg) { 911 switch (reg) {
874 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; 912 case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
875 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; 913 case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
876 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; 914 case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
877 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; 915 case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
878 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; 916 case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
879 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; 917 case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
880 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; 918 case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
881 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; 919 case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
882#ifdef CONFIG_X86_64 920#ifdef CONFIG_X86_64
883 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; 921 case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
884 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; 922 case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
885 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; 923 case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
886 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; 924 case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
887 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; 925 case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
888 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; 926 case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
889 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; 927 case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
890 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; 928 case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
891#endif 929#endif
892 default: BUG(); 930 default: BUG();
893 } 931 }
@@ -899,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
899{ 937{
900 ctxt->ops->get_fpu(ctxt); 938 ctxt->ops->get_fpu(ctxt);
901 switch (reg) { 939 switch (reg) {
902 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; 940 case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
903 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; 941 case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
904 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; 942 case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
905 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; 943 case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
906 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; 944 case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
907 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; 945 case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
908 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; 946 case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
909 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; 947 case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
910#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
911 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; 949 case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
912 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; 950 case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
913 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; 951 case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
914 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; 952 case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
915 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; 953 case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
916 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; 954 case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
917 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; 955 case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
918 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; 956 case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
919#endif 957#endif
920 default: BUG(); 958 default: BUG();
921 } 959 }
@@ -982,10 +1020,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
982 1020
983 op->type = OP_REG; 1021 op->type = OP_REG;
984 if (ctxt->d & ByteOp) { 1022 if (ctxt->d & ByteOp) {
985 op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); 1023 op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
986 op->bytes = 1; 1024 op->bytes = 1;
987 } else { 1025 } else {
988 op->addr.reg = decode_register(reg, ctxt->regs, 0); 1026 op->addr.reg = decode_register(ctxt, reg, 0);
989 op->bytes = ctxt->op_bytes; 1027 op->bytes = ctxt->op_bytes;
990 } 1028 }
991 fetch_register_operand(op); 1029 fetch_register_operand(op);
@@ -1020,8 +1058,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1020 if (ctxt->modrm_mod == 3) { 1058 if (ctxt->modrm_mod == 3) {
1021 op->type = OP_REG; 1059 op->type = OP_REG;
1022 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1060 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
1023 op->addr.reg = decode_register(ctxt->modrm_rm, 1061 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp);
1024 ctxt->regs, ctxt->d & ByteOp);
1025 if (ctxt->d & Sse) { 1062 if (ctxt->d & Sse) {
1026 op->type = OP_XMM; 1063 op->type = OP_XMM;
1027 op->bytes = 16; 1064 op->bytes = 16;
@@ -1042,10 +1079,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1042 op->type = OP_MEM; 1079 op->type = OP_MEM;
1043 1080
1044 if (ctxt->ad_bytes == 2) { 1081 if (ctxt->ad_bytes == 2) {
1045 unsigned bx = ctxt->regs[VCPU_REGS_RBX]; 1082 unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
1046 unsigned bp = ctxt->regs[VCPU_REGS_RBP]; 1083 unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
1047 unsigned si = ctxt->regs[VCPU_REGS_RSI]; 1084 unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
1048 unsigned di = ctxt->regs[VCPU_REGS_RDI]; 1085 unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
1049 1086
1050 /* 16-bit ModR/M decode. */ 1087 /* 16-bit ModR/M decode. */
1051 switch (ctxt->modrm_mod) { 1088 switch (ctxt->modrm_mod) {
@@ -1102,17 +1139,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1102 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) 1139 if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
1103 modrm_ea += insn_fetch(s32, ctxt); 1140 modrm_ea += insn_fetch(s32, ctxt);
1104 else { 1141 else {
1105 modrm_ea += ctxt->regs[base_reg]; 1142 modrm_ea += reg_read(ctxt, base_reg);
1106 adjust_modrm_seg(ctxt, base_reg); 1143 adjust_modrm_seg(ctxt, base_reg);
1107 } 1144 }
1108 if (index_reg != 4) 1145 if (index_reg != 4)
1109 modrm_ea += ctxt->regs[index_reg] << scale; 1146 modrm_ea += reg_read(ctxt, index_reg) << scale;
1110 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { 1147 } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
1111 if (ctxt->mode == X86EMUL_MODE_PROT64) 1148 if (ctxt->mode == X86EMUL_MODE_PROT64)
1112 ctxt->rip_relative = 1; 1149 ctxt->rip_relative = 1;
1113 } else { 1150 } else {
1114 base_reg = ctxt->modrm_rm; 1151 base_reg = ctxt->modrm_rm;
1115 modrm_ea += ctxt->regs[base_reg]; 1152 modrm_ea += reg_read(ctxt, base_reg);
1116 adjust_modrm_seg(ctxt, base_reg); 1153 adjust_modrm_seg(ctxt, base_reg);
1117 } 1154 }
1118 switch (ctxt->modrm_mod) { 1155 switch (ctxt->modrm_mod) {
@@ -1179,24 +1216,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1179 int rc; 1216 int rc;
1180 struct read_cache *mc = &ctxt->mem_read; 1217 struct read_cache *mc = &ctxt->mem_read;
1181 1218
1182 while (size) { 1219 if (mc->pos < mc->end)
1183 int n = min(size, 8u); 1220 goto read_cached;
1184 size -= n;
1185 if (mc->pos < mc->end)
1186 goto read_cached;
1187 1221
1188 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, 1222 WARN_ON((mc->end + size) >= sizeof(mc->data));
1189 &ctxt->exception);
1190 if (rc != X86EMUL_CONTINUE)
1191 return rc;
1192 mc->end += n;
1193 1223
1194 read_cached: 1224 rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
1195 memcpy(dest, mc->data + mc->pos, n); 1225 &ctxt->exception);
1196 mc->pos += n; 1226 if (rc != X86EMUL_CONTINUE)
1197 dest += n; 1227 return rc;
1198 addr += n; 1228
1199 } 1229 mc->end += size;
1230
1231read_cached:
1232 memcpy(dest, mc->data + mc->pos, size);
1233 mc->pos += size;
1200 return X86EMUL_CONTINUE; 1234 return X86EMUL_CONTINUE;
1201} 1235}
1202 1236
@@ -1253,10 +1287,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1253 if (rc->pos == rc->end) { /* refill pio read ahead */ 1287 if (rc->pos == rc->end) { /* refill pio read ahead */
1254 unsigned int in_page, n; 1288 unsigned int in_page, n;
1255 unsigned int count = ctxt->rep_prefix ? 1289 unsigned int count = ctxt->rep_prefix ?
1256 address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; 1290 address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
1257 in_page = (ctxt->eflags & EFLG_DF) ? 1291 in_page = (ctxt->eflags & EFLG_DF) ?
1258 offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : 1292 offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
1259 PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); 1293 PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
1260 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, 1294 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1261 count); 1295 count);
1262 if (n == 0) 1296 if (n == 0)
@@ -1267,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 rc->end = n * size; 1301 rc->end = n * size;
1268 } 1302 }
1269 1303
1270 memcpy(dest, rc->data + rc->pos, size); 1304 if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
1271 rc->pos += size; 1305 ctxt->dst.data = rc->data + rc->pos;
1306 ctxt->dst.type = OP_MEM_STR;
1307 ctxt->dst.count = (rc->end - rc->pos) / size;
1308 rc->pos = rc->end;
1309 } else {
1310 memcpy(dest, rc->data + rc->pos, size);
1311 rc->pos += size;
1312 }
1272 return 1; 1313 return 1;
1273} 1314}
1274 1315
@@ -1291,7 +1332,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
1291static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1332static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1292 u16 selector, struct desc_ptr *dt) 1333 u16 selector, struct desc_ptr *dt)
1293{ 1334{
1294 struct x86_emulate_ops *ops = ctxt->ops; 1335 const struct x86_emulate_ops *ops = ctxt->ops;
1295 1336
1296 if (selector & 1 << 2) { 1337 if (selector & 1 << 2) {
1297 struct desc_struct desc; 1338 struct desc_struct desc;
@@ -1355,19 +1396,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1355 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ 1396 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1356 ulong desc_addr; 1397 ulong desc_addr;
1357 int ret; 1398 int ret;
1399 u16 dummy;
1358 1400
1359 memset(&seg_desc, 0, sizeof seg_desc); 1401 memset(&seg_desc, 0, sizeof seg_desc);
1360 1402
1361 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) 1403 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1362 || ctxt->mode == X86EMUL_MODE_REAL) { 1404 || ctxt->mode == X86EMUL_MODE_REAL) {
1363 /* set real mode segment descriptor */ 1405 /* set real mode segment descriptor */
1406 ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
1364 set_desc_base(&seg_desc, selector << 4); 1407 set_desc_base(&seg_desc, selector << 4);
1365 set_desc_limit(&seg_desc, 0xffff);
1366 seg_desc.type = 3;
1367 seg_desc.p = 1;
1368 seg_desc.s = 1;
1369 if (ctxt->mode == X86EMUL_MODE_VM86)
1370 seg_desc.dpl = 3;
1371 goto load; 1408 goto load;
1372 } 1409 }
1373 1410
@@ -1396,7 +1433,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1396 err_code = selector & 0xfffc; 1433 err_code = selector & 0xfffc;
1397 err_vec = GP_VECTOR; 1434 err_vec = GP_VECTOR;
1398 1435
1399 /* can't load system descriptor into segment selecor */ 1436 /* can't load system descriptor into segment selector */
1400 if (seg <= VCPU_SREG_GS && !seg_desc.s) 1437 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1401 goto exception; 1438 goto exception;
1402 1439
@@ -1516,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1516 if (rc != X86EMUL_CONTINUE) 1553 if (rc != X86EMUL_CONTINUE)
1517 return rc; 1554 return rc;
1518 break; 1555 break;
1556 case OP_MEM_STR:
1557 rc = segmented_write(ctxt,
1558 ctxt->dst.addr.mem,
1559 ctxt->dst.data,
1560 ctxt->dst.bytes * ctxt->dst.count);
1561 if (rc != X86EMUL_CONTINUE)
1562 return rc;
1563 break;
1519 case OP_XMM: 1564 case OP_XMM:
1520 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); 1565 write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
1521 break; 1566 break;
@@ -1536,7 +1581,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
1536 struct segmented_address addr; 1581 struct segmented_address addr;
1537 1582
1538 rsp_increment(ctxt, -bytes); 1583 rsp_increment(ctxt, -bytes);
1539 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); 1584 addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
1540 addr.seg = VCPU_SREG_SS; 1585 addr.seg = VCPU_SREG_SS;
1541 1586
1542 return segmented_write(ctxt, addr, data, bytes); 1587 return segmented_write(ctxt, addr, data, bytes);
@@ -1555,7 +1600,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1555 int rc; 1600 int rc;
1556 struct segmented_address addr; 1601 struct segmented_address addr;
1557 1602
1558 addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); 1603 addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
1559 addr.seg = VCPU_SREG_SS; 1604 addr.seg = VCPU_SREG_SS;
1560 rc = segmented_read(ctxt, addr, dest, len); 1605 rc = segmented_read(ctxt, addr, dest, len);
1561 if (rc != X86EMUL_CONTINUE) 1606 if (rc != X86EMUL_CONTINUE)
@@ -1623,26 +1668,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
1623 int rc; 1668 int rc;
1624 unsigned frame_size = ctxt->src.val; 1669 unsigned frame_size = ctxt->src.val;
1625 unsigned nesting_level = ctxt->src2.val & 31; 1670 unsigned nesting_level = ctxt->src2.val & 31;
1671 ulong rbp;
1626 1672
1627 if (nesting_level) 1673 if (nesting_level)
1628 return X86EMUL_UNHANDLEABLE; 1674 return X86EMUL_UNHANDLEABLE;
1629 1675
1630 rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); 1676 rbp = reg_read(ctxt, VCPU_REGS_RBP);
1677 rc = push(ctxt, &rbp, stack_size(ctxt));
1631 if (rc != X86EMUL_CONTINUE) 1678 if (rc != X86EMUL_CONTINUE)
1632 return rc; 1679 return rc;
1633 assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], 1680 assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
1634 stack_mask(ctxt)); 1681 stack_mask(ctxt));
1635 assign_masked(&ctxt->regs[VCPU_REGS_RSP], 1682 assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
1636 ctxt->regs[VCPU_REGS_RSP] - frame_size, 1683 reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
1637 stack_mask(ctxt)); 1684 stack_mask(ctxt));
1638 return X86EMUL_CONTINUE; 1685 return X86EMUL_CONTINUE;
1639} 1686}
1640 1687
1641static int em_leave(struct x86_emulate_ctxt *ctxt) 1688static int em_leave(struct x86_emulate_ctxt *ctxt)
1642{ 1689{
1643 assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], 1690 assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
1644 stack_mask(ctxt)); 1691 stack_mask(ctxt));
1645 return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); 1692 return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
1646} 1693}
1647 1694
1648static int em_push_sreg(struct x86_emulate_ctxt *ctxt) 1695static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
@@ -1670,13 +1717,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
1670 1717
1671static int em_pusha(struct x86_emulate_ctxt *ctxt) 1718static int em_pusha(struct x86_emulate_ctxt *ctxt)
1672{ 1719{
1673 unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; 1720 unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
1674 int rc = X86EMUL_CONTINUE; 1721 int rc = X86EMUL_CONTINUE;
1675 int reg = VCPU_REGS_RAX; 1722 int reg = VCPU_REGS_RAX;
1676 1723
1677 while (reg <= VCPU_REGS_RDI) { 1724 while (reg <= VCPU_REGS_RDI) {
1678 (reg == VCPU_REGS_RSP) ? 1725 (reg == VCPU_REGS_RSP) ?
1679 (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); 1726 (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
1680 1727
1681 rc = em_push(ctxt); 1728 rc = em_push(ctxt);
1682 if (rc != X86EMUL_CONTINUE) 1729 if (rc != X86EMUL_CONTINUE)
@@ -1705,7 +1752,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1705 --reg; 1752 --reg;
1706 } 1753 }
1707 1754
1708 rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); 1755 rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
1709 if (rc != X86EMUL_CONTINUE) 1756 if (rc != X86EMUL_CONTINUE)
1710 break; 1757 break;
1711 --reg; 1758 --reg;
@@ -1713,9 +1760,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
1713 return rc; 1760 return rc;
1714} 1761}
1715 1762
1716int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) 1763static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1717{ 1764{
1718 struct x86_emulate_ops *ops = ctxt->ops; 1765 const struct x86_emulate_ops *ops = ctxt->ops;
1719 int rc; 1766 int rc;
1720 struct desc_ptr dt; 1767 struct desc_ptr dt;
1721 gva_t cs_addr; 1768 gva_t cs_addr;
@@ -1762,11 +1809,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1762 return rc; 1809 return rc;
1763} 1810}
1764 1811
1812int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
1813{
1814 int rc;
1815
1816 invalidate_registers(ctxt);
1817 rc = __emulate_int_real(ctxt, irq);
1818 if (rc == X86EMUL_CONTINUE)
1819 writeback_registers(ctxt);
1820 return rc;
1821}
1822
1765static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) 1823static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
1766{ 1824{
1767 switch(ctxt->mode) { 1825 switch(ctxt->mode) {
1768 case X86EMUL_MODE_REAL: 1826 case X86EMUL_MODE_REAL:
1769 return emulate_int_real(ctxt, irq); 1827 return __emulate_int_real(ctxt, irq);
1770 case X86EMUL_MODE_VM86: 1828 case X86EMUL_MODE_VM86:
1771 case X86EMUL_MODE_PROT16: 1829 case X86EMUL_MODE_PROT16:
1772 case X86EMUL_MODE_PROT32: 1830 case X86EMUL_MODE_PROT32:
@@ -1973,14 +2031,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
1973{ 2031{
1974 u64 old = ctxt->dst.orig_val64; 2032 u64 old = ctxt->dst.orig_val64;
1975 2033
1976 if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || 2034 if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
1977 ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { 2035 ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
1978 ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 2036 *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
1979 ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 2037 *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
1980 ctxt->eflags &= ~EFLG_ZF; 2038 ctxt->eflags &= ~EFLG_ZF;
1981 } else { 2039 } else {
1982 ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | 2040 ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
1983 (u32) ctxt->regs[VCPU_REGS_RBX]; 2041 (u32) reg_read(ctxt, VCPU_REGS_RBX);
1984 2042
1985 ctxt->eflags |= EFLG_ZF; 2043 ctxt->eflags |= EFLG_ZF;
1986 } 2044 }
@@ -2016,7 +2074,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2016{ 2074{
2017 /* Save real source value, then compare EAX against destination. */ 2075 /* Save real source value, then compare EAX against destination. */
2018 ctxt->src.orig_val = ctxt->src.val; 2076 ctxt->src.orig_val = ctxt->src.val;
2019 ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; 2077 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
2020 emulate_2op_SrcV(ctxt, "cmp"); 2078 emulate_2op_SrcV(ctxt, "cmp");
2021 2079
2022 if (ctxt->eflags & EFLG_ZF) { 2080 if (ctxt->eflags & EFLG_ZF) {
@@ -2025,7 +2083,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2025 } else { 2083 } else {
2026 /* Failure: write the value we saw to EAX. */ 2084 /* Failure: write the value we saw to EAX. */
2027 ctxt->dst.type = OP_REG; 2085 ctxt->dst.type = OP_REG;
2028 ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; 2086 ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
2029 } 2087 }
2030 return X86EMUL_CONTINUE; 2088 return X86EMUL_CONTINUE;
2031} 2089}
@@ -2050,12 +2108,6 @@ static void
2050setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 2108setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2051 struct desc_struct *cs, struct desc_struct *ss) 2109 struct desc_struct *cs, struct desc_struct *ss)
2052{ 2110{
2053 u16 selector;
2054
2055 memset(cs, 0, sizeof(struct desc_struct));
2056 ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
2057 memset(ss, 0, sizeof(struct desc_struct));
2058
2059 cs->l = 0; /* will be adjusted later */ 2111 cs->l = 0; /* will be adjusted later */
2060 set_desc_base(cs, 0); /* flat segment */ 2112 set_desc_base(cs, 0); /* flat segment */
2061 cs->g = 1; /* 4kb granularity */ 2113 cs->g = 1; /* 4kb granularity */
@@ -2065,6 +2117,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2065 cs->dpl = 0; /* will be adjusted later */ 2117 cs->dpl = 0; /* will be adjusted later */
2066 cs->p = 1; 2118 cs->p = 1;
2067 cs->d = 1; 2119 cs->d = 1;
2120 cs->avl = 0;
2068 2121
2069 set_desc_base(ss, 0); /* flat segment */ 2122 set_desc_base(ss, 0); /* flat segment */
2070 set_desc_limit(ss, 0xfffff); /* 4GB limit */ 2123 set_desc_limit(ss, 0xfffff); /* 4GB limit */
@@ -2074,6 +2127,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
2074 ss->d = 1; /* 32bit stack segment */ 2127 ss->d = 1; /* 32bit stack segment */
2075 ss->dpl = 0; 2128 ss->dpl = 0;
2076 ss->p = 1; 2129 ss->p = 1;
2130 ss->l = 0;
2131 ss->avl = 0;
2077} 2132}
2078 2133
2079static bool vendor_intel(struct x86_emulate_ctxt *ctxt) 2134static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2089,7 +2144,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
2089 2144
2090static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) 2145static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2091{ 2146{
2092 struct x86_emulate_ops *ops = ctxt->ops; 2147 const struct x86_emulate_ops *ops = ctxt->ops;
2093 u32 eax, ebx, ecx, edx; 2148 u32 eax, ebx, ecx, edx;
2094 2149
2095 /* 2150 /*
@@ -2133,7 +2188,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
2133 2188
2134static int em_syscall(struct x86_emulate_ctxt *ctxt) 2189static int em_syscall(struct x86_emulate_ctxt *ctxt)
2135{ 2190{
2136 struct x86_emulate_ops *ops = ctxt->ops; 2191 const struct x86_emulate_ops *ops = ctxt->ops;
2137 struct desc_struct cs, ss; 2192 struct desc_struct cs, ss;
2138 u64 msr_data; 2193 u64 msr_data;
2139 u16 cs_sel, ss_sel; 2194 u16 cs_sel, ss_sel;
@@ -2165,10 +2220,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2165 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2220 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2166 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2221 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2167 2222
2168 ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; 2223 *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
2169 if (efer & EFER_LMA) { 2224 if (efer & EFER_LMA) {
2170#ifdef CONFIG_X86_64 2225#ifdef CONFIG_X86_64
2171 ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 2226 *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
2172 2227
2173 ops->get_msr(ctxt, 2228 ops->get_msr(ctxt,
2174 ctxt->mode == X86EMUL_MODE_PROT64 ? 2229 ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2191,7 +2246,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
2191 2246
2192static int em_sysenter(struct x86_emulate_ctxt *ctxt) 2247static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2193{ 2248{
2194 struct x86_emulate_ops *ops = ctxt->ops; 2249 const struct x86_emulate_ops *ops = ctxt->ops;
2195 struct desc_struct cs, ss; 2250 struct desc_struct cs, ss;
2196 u64 msr_data; 2251 u64 msr_data;
2197 u16 cs_sel, ss_sel; 2252 u16 cs_sel, ss_sel;
@@ -2228,6 +2283,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2228 if (msr_data == 0x0) 2283 if (msr_data == 0x0)
2229 return emulate_gp(ctxt, 0); 2284 return emulate_gp(ctxt, 0);
2230 break; 2285 break;
2286 default:
2287 break;
2231 } 2288 }
2232 2289
2233 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2290 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -2247,14 +2304,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
2247 ctxt->_eip = msr_data; 2304 ctxt->_eip = msr_data;
2248 2305
2249 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); 2306 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2250 ctxt->regs[VCPU_REGS_RSP] = msr_data; 2307 *reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
2251 2308
2252 return X86EMUL_CONTINUE; 2309 return X86EMUL_CONTINUE;
2253} 2310}
2254 2311
2255static int em_sysexit(struct x86_emulate_ctxt *ctxt) 2312static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2256{ 2313{
2257 struct x86_emulate_ops *ops = ctxt->ops; 2314 const struct x86_emulate_ops *ops = ctxt->ops;
2258 struct desc_struct cs, ss; 2315 struct desc_struct cs, ss;
2259 u64 msr_data; 2316 u64 msr_data;
2260 int usermode; 2317 int usermode;
@@ -2297,8 +2354,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
2297 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2354 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2298 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2355 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2299 2356
2300 ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; 2357 ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
2301 ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; 2358 *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
2302 2359
2303 return X86EMUL_CONTINUE; 2360 return X86EMUL_CONTINUE;
2304} 2361}
@@ -2317,7 +2374,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
2317static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2374static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2318 u16 port, u16 len) 2375 u16 port, u16 len)
2319{ 2376{
2320 struct x86_emulate_ops *ops = ctxt->ops; 2377 const struct x86_emulate_ops *ops = ctxt->ops;
2321 struct desc_struct tr_seg; 2378 struct desc_struct tr_seg;
2322 u32 base3; 2379 u32 base3;
2323 int r; 2380 int r;
@@ -2367,14 +2424,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2367{ 2424{
2368 tss->ip = ctxt->_eip; 2425 tss->ip = ctxt->_eip;
2369 tss->flag = ctxt->eflags; 2426 tss->flag = ctxt->eflags;
2370 tss->ax = ctxt->regs[VCPU_REGS_RAX]; 2427 tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
2371 tss->cx = ctxt->regs[VCPU_REGS_RCX]; 2428 tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
2372 tss->dx = ctxt->regs[VCPU_REGS_RDX]; 2429 tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
2373 tss->bx = ctxt->regs[VCPU_REGS_RBX]; 2430 tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
2374 tss->sp = ctxt->regs[VCPU_REGS_RSP]; 2431 tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
2375 tss->bp = ctxt->regs[VCPU_REGS_RBP]; 2432 tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
2376 tss->si = ctxt->regs[VCPU_REGS_RSI]; 2433 tss->si = reg_read(ctxt, VCPU_REGS_RSI);
2377 tss->di = ctxt->regs[VCPU_REGS_RDI]; 2434 tss->di = reg_read(ctxt, VCPU_REGS_RDI);
2378 2435
2379 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2436 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2380 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2437 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2390,14 +2447,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2390 2447
2391 ctxt->_eip = tss->ip; 2448 ctxt->_eip = tss->ip;
2392 ctxt->eflags = tss->flag | 2; 2449 ctxt->eflags = tss->flag | 2;
2393 ctxt->regs[VCPU_REGS_RAX] = tss->ax; 2450 *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
2394 ctxt->regs[VCPU_REGS_RCX] = tss->cx; 2451 *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
2395 ctxt->regs[VCPU_REGS_RDX] = tss->dx; 2452 *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
2396 ctxt->regs[VCPU_REGS_RBX] = tss->bx; 2453 *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
2397 ctxt->regs[VCPU_REGS_RSP] = tss->sp; 2454 *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
2398 ctxt->regs[VCPU_REGS_RBP] = tss->bp; 2455 *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
2399 ctxt->regs[VCPU_REGS_RSI] = tss->si; 2456 *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
2400 ctxt->regs[VCPU_REGS_RDI] = tss->di; 2457 *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
2401 2458
2402 /* 2459 /*
2403 * SDM says that segment selectors are loaded before segment 2460 * SDM says that segment selectors are loaded before segment
@@ -2410,7 +2467,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2410 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); 2467 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2411 2468
2412 /* 2469 /*
2413 * Now load segment descriptors. If fault happenes at this stage 2470 * Now load segment descriptors. If fault happens at this stage
2414 * it is handled in a context of new task 2471 * it is handled in a context of new task
2415 */ 2472 */
2416 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); 2473 ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2436,7 +2493,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2436 u16 tss_selector, u16 old_tss_sel, 2493 u16 tss_selector, u16 old_tss_sel,
2437 ulong old_tss_base, struct desc_struct *new_desc) 2494 ulong old_tss_base, struct desc_struct *new_desc)
2438{ 2495{
2439 struct x86_emulate_ops *ops = ctxt->ops; 2496 const struct x86_emulate_ops *ops = ctxt->ops;
2440 struct tss_segment_16 tss_seg; 2497 struct tss_segment_16 tss_seg;
2441 int ret; 2498 int ret;
2442 u32 new_tss_base = get_desc_base(new_desc); 2499 u32 new_tss_base = get_desc_base(new_desc);
@@ -2482,14 +2539,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2482 tss->cr3 = ctxt->ops->get_cr(ctxt, 3); 2539 tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
2483 tss->eip = ctxt->_eip; 2540 tss->eip = ctxt->_eip;
2484 tss->eflags = ctxt->eflags; 2541 tss->eflags = ctxt->eflags;
2485 tss->eax = ctxt->regs[VCPU_REGS_RAX]; 2542 tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
2486 tss->ecx = ctxt->regs[VCPU_REGS_RCX]; 2543 tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
2487 tss->edx = ctxt->regs[VCPU_REGS_RDX]; 2544 tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
2488 tss->ebx = ctxt->regs[VCPU_REGS_RBX]; 2545 tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
2489 tss->esp = ctxt->regs[VCPU_REGS_RSP]; 2546 tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
2490 tss->ebp = ctxt->regs[VCPU_REGS_RBP]; 2547 tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
2491 tss->esi = ctxt->regs[VCPU_REGS_RSI]; 2548 tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
2492 tss->edi = ctxt->regs[VCPU_REGS_RDI]; 2549 tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
2493 2550
2494 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); 2551 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2495 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2552 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2511,14 +2568,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2511 ctxt->eflags = tss->eflags | 2; 2568 ctxt->eflags = tss->eflags | 2;
2512 2569
2513 /* General purpose registers */ 2570 /* General purpose registers */
2514 ctxt->regs[VCPU_REGS_RAX] = tss->eax; 2571 *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
2515 ctxt->regs[VCPU_REGS_RCX] = tss->ecx; 2572 *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
2516 ctxt->regs[VCPU_REGS_RDX] = tss->edx; 2573 *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
2517 ctxt->regs[VCPU_REGS_RBX] = tss->ebx; 2574 *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
2518 ctxt->regs[VCPU_REGS_RSP] = tss->esp; 2575 *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
2519 ctxt->regs[VCPU_REGS_RBP] = tss->ebp; 2576 *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
2520 ctxt->regs[VCPU_REGS_RSI] = tss->esi; 2577 *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
2521 ctxt->regs[VCPU_REGS_RDI] = tss->edi; 2578 *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
2522 2579
2523 /* 2580 /*
2524 * SDM says that segment selectors are loaded before segment 2581 * SDM says that segment selectors are loaded before segment
@@ -2583,7 +2640,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2583 u16 tss_selector, u16 old_tss_sel, 2640 u16 tss_selector, u16 old_tss_sel,
2584 ulong old_tss_base, struct desc_struct *new_desc) 2641 ulong old_tss_base, struct desc_struct *new_desc)
2585{ 2642{
2586 struct x86_emulate_ops *ops = ctxt->ops; 2643 const struct x86_emulate_ops *ops = ctxt->ops;
2587 struct tss_segment_32 tss_seg; 2644 struct tss_segment_32 tss_seg;
2588 int ret; 2645 int ret;
2589 u32 new_tss_base = get_desc_base(new_desc); 2646 u32 new_tss_base = get_desc_base(new_desc);
@@ -2627,7 +2684,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2627 u16 tss_selector, int idt_index, int reason, 2684 u16 tss_selector, int idt_index, int reason,
2628 bool has_error_code, u32 error_code) 2685 bool has_error_code, u32 error_code)
2629{ 2686{
2630 struct x86_emulate_ops *ops = ctxt->ops; 2687 const struct x86_emulate_ops *ops = ctxt->ops;
2631 struct desc_struct curr_tss_desc, next_tss_desc; 2688 struct desc_struct curr_tss_desc, next_tss_desc;
2632 int ret; 2689 int ret;
2633 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); 2690 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2652,7 +2709,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2652 * 2709 *
2653 * 1. jmp/call/int to task gate: Check against DPL of the task gate 2710 * 1. jmp/call/int to task gate: Check against DPL of the task gate
2654 * 2. Exception/IRQ/iret: No check is performed 2711 * 2. Exception/IRQ/iret: No check is performed
2655 * 3. jmp/call to TSS: Check agains DPL of the TSS 2712 * 3. jmp/call to TSS: Check against DPL of the TSS
2656 */ 2713 */
2657 if (reason == TASK_SWITCH_GATE) { 2714 if (reason == TASK_SWITCH_GATE) {
2658 if (idt_index != -1) { 2715 if (idt_index != -1) {
@@ -2693,7 +2750,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2693 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; 2750 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2694 2751
2695 /* set back link to prev task only if NT bit is set in eflags 2752 /* set back link to prev task only if NT bit is set in eflags
2696 note that old_tss_sel is not used afetr this point */ 2753 note that old_tss_sel is not used after this point */
2697 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 2754 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2698 old_tss_sel = 0xffff; 2755 old_tss_sel = 0xffff;
2699 2756
@@ -2733,26 +2790,28 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2733{ 2790{
2734 int rc; 2791 int rc;
2735 2792
2793 invalidate_registers(ctxt);
2736 ctxt->_eip = ctxt->eip; 2794 ctxt->_eip = ctxt->eip;
2737 ctxt->dst.type = OP_NONE; 2795 ctxt->dst.type = OP_NONE;
2738 2796
2739 rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, 2797 rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
2740 has_error_code, error_code); 2798 has_error_code, error_code);
2741 2799
2742 if (rc == X86EMUL_CONTINUE) 2800 if (rc == X86EMUL_CONTINUE) {
2743 ctxt->eip = ctxt->_eip; 2801 ctxt->eip = ctxt->_eip;
2802 writeback_registers(ctxt);
2803 }
2744 2804
2745 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 2805 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2746} 2806}
2747 2807
2748static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2808static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
2749 int reg, struct operand *op) 2809 struct operand *op)
2750{ 2810{
2751 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2811 int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
2752 2812
2753 register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); 2813 register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
2754 op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); 2814 op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
2755 op->addr.mem.seg = seg;
2756} 2815}
2757 2816
2758static int em_das(struct x86_emulate_ctxt *ctxt) 2817static int em_das(struct x86_emulate_ctxt *ctxt)
@@ -2927,7 +2986,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
2927{ 2986{
2928 ctxt->dst.type = OP_REG; 2987 ctxt->dst.type = OP_REG;
2929 ctxt->dst.bytes = ctxt->src.bytes; 2988 ctxt->dst.bytes = ctxt->src.bytes;
2930 ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; 2989 ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
2931 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); 2990 ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
2932 2991
2933 return X86EMUL_CONTINUE; 2992 return X86EMUL_CONTINUE;
@@ -2938,8 +2997,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2938 u64 tsc = 0; 2997 u64 tsc = 0;
2939 2998
2940 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); 2999 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2941 ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; 3000 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
2942 ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; 3001 *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
2943 return X86EMUL_CONTINUE; 3002 return X86EMUL_CONTINUE;
2944} 3003}
2945 3004
@@ -2947,10 +3006,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
2947{ 3006{
2948 u64 pmc; 3007 u64 pmc;
2949 3008
2950 if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) 3009 if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
2951 return emulate_gp(ctxt, 0); 3010 return emulate_gp(ctxt, 0);
2952 ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; 3011 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
2953 ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; 3012 *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
2954 return X86EMUL_CONTINUE; 3013 return X86EMUL_CONTINUE;
2955} 3014}
2956 3015
@@ -2992,9 +3051,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
2992{ 3051{
2993 u64 msr_data; 3052 u64 msr_data;
2994 3053
2995 msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] 3054 msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
2996 | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); 3055 | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
2997 if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) 3056 if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
2998 return emulate_gp(ctxt, 0); 3057 return emulate_gp(ctxt, 0);
2999 3058
3000 return X86EMUL_CONTINUE; 3059 return X86EMUL_CONTINUE;
@@ -3004,11 +3063,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
3004{ 3063{
3005 u64 msr_data; 3064 u64 msr_data;
3006 3065
3007 if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) 3066 if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
3008 return emulate_gp(ctxt, 0); 3067 return emulate_gp(ctxt, 0);
3009 3068
3010 ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; 3069 *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
3011 ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; 3070 *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
3012 return X86EMUL_CONTINUE; 3071 return X86EMUL_CONTINUE;
3013} 3072}
3014 3073
@@ -3188,8 +3247,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
3188 3247
3189static int em_loop(struct x86_emulate_ctxt *ctxt) 3248static int em_loop(struct x86_emulate_ctxt *ctxt)
3190{ 3249{
3191 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); 3250 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
3192 if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && 3251 if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
3193 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) 3252 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
3194 jmp_rel(ctxt, ctxt->src.val); 3253 jmp_rel(ctxt, ctxt->src.val);
3195 3254
@@ -3198,7 +3257,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
3198 3257
3199static int em_jcxz(struct x86_emulate_ctxt *ctxt) 3258static int em_jcxz(struct x86_emulate_ctxt *ctxt)
3200{ 3259{
3201 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) 3260 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
3202 jmp_rel(ctxt, ctxt->src.val); 3261 jmp_rel(ctxt, ctxt->src.val);
3203 3262
3204 return X86EMUL_CONTINUE; 3263 return X86EMUL_CONTINUE;
@@ -3286,20 +3345,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3286{ 3345{
3287 u32 eax, ebx, ecx, edx; 3346 u32 eax, ebx, ecx, edx;
3288 3347
3289 eax = ctxt->regs[VCPU_REGS_RAX]; 3348 eax = reg_read(ctxt, VCPU_REGS_RAX);
3290 ecx = ctxt->regs[VCPU_REGS_RCX]; 3349 ecx = reg_read(ctxt, VCPU_REGS_RCX);
3291 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); 3350 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
3292 ctxt->regs[VCPU_REGS_RAX] = eax; 3351 *reg_write(ctxt, VCPU_REGS_RAX) = eax;
3293 ctxt->regs[VCPU_REGS_RBX] = ebx; 3352 *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
3294 ctxt->regs[VCPU_REGS_RCX] = ecx; 3353 *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
3295 ctxt->regs[VCPU_REGS_RDX] = edx; 3354 *reg_write(ctxt, VCPU_REGS_RDX) = edx;
3296 return X86EMUL_CONTINUE; 3355 return X86EMUL_CONTINUE;
3297} 3356}
3298 3357
3299static int em_lahf(struct x86_emulate_ctxt *ctxt) 3358static int em_lahf(struct x86_emulate_ctxt *ctxt)
3300{ 3359{
3301 ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; 3360 *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
3302 ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; 3361 *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
3303 return X86EMUL_CONTINUE; 3362 return X86EMUL_CONTINUE;
3304} 3363}
3305 3364
@@ -3456,7 +3515,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
3456 3515
3457static int check_svme_pa(struct x86_emulate_ctxt *ctxt) 3516static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
3458{ 3517{
3459 u64 rax = ctxt->regs[VCPU_REGS_RAX]; 3518 u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
3460 3519
3461 /* Valid physical address? */ 3520 /* Valid physical address? */
3462 if (rax & 0xffff000000000000ULL) 3521 if (rax & 0xffff000000000000ULL)
@@ -3478,7 +3537,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
3478static int check_rdpmc(struct x86_emulate_ctxt *ctxt) 3537static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
3479{ 3538{
3480 u64 cr4 = ctxt->ops->get_cr(ctxt, 4); 3539 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
3481 u64 rcx = ctxt->regs[VCPU_REGS_RCX]; 3540 u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
3482 3541
3483 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || 3542 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
3484 (rcx > 3)) 3543 (rcx > 3))
@@ -3531,13 +3590,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3531 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3590 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3532 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3591 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3533 3592
3534static struct opcode group7_rm1[] = { 3593static const struct opcode group7_rm1[] = {
3535 DI(SrcNone | Priv, monitor), 3594 DI(SrcNone | Priv, monitor),
3536 DI(SrcNone | Priv, mwait), 3595 DI(SrcNone | Priv, mwait),
3537 N, N, N, N, N, N, 3596 N, N, N, N, N, N,
3538}; 3597};
3539 3598
3540static struct opcode group7_rm3[] = { 3599static const struct opcode group7_rm3[] = {
3541 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 3600 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3542 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), 3601 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
3543 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), 3602 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
@@ -3548,13 +3607,13 @@ static struct opcode group7_rm3[] = {
3548 DIP(SrcNone | Prot | Priv, invlpga, check_svme), 3607 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3549}; 3608};
3550 3609
3551static struct opcode group7_rm7[] = { 3610static const struct opcode group7_rm7[] = {
3552 N, 3611 N,
3553 DIP(SrcNone, rdtscp, check_rdtsc), 3612 DIP(SrcNone, rdtscp, check_rdtsc),
3554 N, N, N, N, N, N, 3613 N, N, N, N, N, N,
3555}; 3614};
3556 3615
3557static struct opcode group1[] = { 3616static const struct opcode group1[] = {
3558 I(Lock, em_add), 3617 I(Lock, em_add),
3559 I(Lock | PageTable, em_or), 3618 I(Lock | PageTable, em_or),
3560 I(Lock, em_adc), 3619 I(Lock, em_adc),
@@ -3565,11 +3624,11 @@ static struct opcode group1[] = {
3565 I(0, em_cmp), 3624 I(0, em_cmp),
3566}; 3625};
3567 3626
3568static struct opcode group1A[] = { 3627static const struct opcode group1A[] = {
3569 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3570}; 3629};
3571 3630
3572static struct opcode group3[] = { 3631static const struct opcode group3[] = {
3573 I(DstMem | SrcImm, em_test), 3632 I(DstMem | SrcImm, em_test),
3574 I(DstMem | SrcImm, em_test), 3633 I(DstMem | SrcImm, em_test),
3575 I(DstMem | SrcNone | Lock, em_not), 3634 I(DstMem | SrcNone | Lock, em_not),
@@ -3580,13 +3639,13 @@ static struct opcode group3[] = {
3580 I(SrcMem, em_idiv_ex), 3639 I(SrcMem, em_idiv_ex),
3581}; 3640};
3582 3641
3583static struct opcode group4[] = { 3642static const struct opcode group4[] = {
3584 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3585 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3586 N, N, N, N, N, N, 3645 N, N, N, N, N, N,
3587}; 3646};
3588 3647
3589static struct opcode group5[] = { 3648static const struct opcode group5[] = {
3590 I(DstMem | SrcNone | Lock, em_grp45), 3649 I(DstMem | SrcNone | Lock, em_grp45),
3591 I(DstMem | SrcNone | Lock, em_grp45), 3650 I(DstMem | SrcNone | Lock, em_grp45),
3592 I(SrcMem | Stack, em_grp45), 3651 I(SrcMem | Stack, em_grp45),
@@ -3596,7 +3655,7 @@ static struct opcode group5[] = {
3596 I(SrcMem | Stack, em_grp45), N, 3655 I(SrcMem | Stack, em_grp45), N,
3597}; 3656};
3598 3657
3599static struct opcode group6[] = { 3658static const struct opcode group6[] = {
3600 DI(Prot, sldt), 3659 DI(Prot, sldt),
3601 DI(Prot, str), 3660 DI(Prot, str),
3602 II(Prot | Priv | SrcMem16, em_lldt, lldt), 3661 II(Prot | Priv | SrcMem16, em_lldt, lldt),
@@ -3604,7 +3663,7 @@ static struct opcode group6[] = {
3604 N, N, N, N, 3663 N, N, N, N,
3605}; 3664};
3606 3665
3607static struct group_dual group7 = { { 3666static const struct group_dual group7 = { {
3608 II(Mov | DstMem | Priv, em_sgdt, sgdt), 3667 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3609 II(Mov | DstMem | Priv, em_sidt, sidt), 3668 II(Mov | DstMem | Priv, em_sidt, sidt),
3610 II(SrcMem | Priv, em_lgdt, lgdt), 3669 II(SrcMem | Priv, em_lgdt, lgdt),
@@ -3621,7 +3680,7 @@ static struct group_dual group7 = { {
3621 EXT(0, group7_rm7), 3680 EXT(0, group7_rm7),
3622} }; 3681} };
3623 3682
3624static struct opcode group8[] = { 3683static const struct opcode group8[] = {
3625 N, N, N, N, 3684 N, N, N, N,
3626 I(DstMem | SrcImmByte, em_bt), 3685 I(DstMem | SrcImmByte, em_bt),
3627 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
@@ -3629,26 +3688,26 @@ static struct opcode group8[] = {
3629 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3630}; 3689};
3631 3690
3632static struct group_dual group9 = { { 3691static const struct group_dual group9 = { {
3633 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3692 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3634}, { 3693}, {
3635 N, N, N, N, N, N, N, N, 3694 N, N, N, N, N, N, N, N,
3636} }; 3695} };
3637 3696
3638static struct opcode group11[] = { 3697static const struct opcode group11[] = {
3639 I(DstMem | SrcImm | Mov | PageTable, em_mov), 3698 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3640 X7(D(Undefined)), 3699 X7(D(Undefined)),
3641}; 3700};
3642 3701
3643static struct gprefix pfx_0f_6f_0f_7f = { 3702static const struct gprefix pfx_0f_6f_0f_7f = {
3644 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 3703 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3645}; 3704};
3646 3705
3647static struct gprefix pfx_vmovntpx = { 3706static const struct gprefix pfx_vmovntpx = {
3648 I(0, em_mov), N, N, N, 3707 I(0, em_mov), N, N, N,
3649}; 3708};
3650 3709
3651static struct opcode opcode_table[256] = { 3710static const struct opcode opcode_table[256] = {
3652 /* 0x00 - 0x07 */ 3711 /* 0x00 - 0x07 */
3653 I6ALU(Lock, em_add), 3712 I6ALU(Lock, em_add),
3654 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
@@ -3689,7 +3748,7 @@ static struct opcode opcode_table[256] = {
3689 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3748 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3690 I(SrcImmByte | Mov | Stack, em_push), 3749 I(SrcImmByte | Mov | Stack, em_push),
3691 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3750 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3692 I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ 3751 I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
3693 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ 3752 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
3694 /* 0x70 - 0x7F */ 3753 /* 0x70 - 0x7F */
3695 X16(D(SrcImmByte)), 3754 X16(D(SrcImmByte)),
@@ -3765,7 +3824,7 @@ static struct opcode opcode_table[256] = {
3765 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3824 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3766}; 3825};
3767 3826
3768static struct opcode twobyte_table[256] = { 3827static const struct opcode twobyte_table[256] = {
3769 /* 0x00 - 0x0F */ 3828 /* 0x00 - 0x0F */
3770 G(0, group6), GD(0, &group7), N, N, 3829 G(0, group6), GD(0, &group7), N, N,
3771 N, I(ImplicitOps | VendorSpecific, em_syscall), 3830 N, I(ImplicitOps | VendorSpecific, em_syscall),
@@ -3936,7 +3995,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3936 case OpAcc: 3995 case OpAcc:
3937 op->type = OP_REG; 3996 op->type = OP_REG;
3938 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 3997 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3939 op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; 3998 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
3940 fetch_register_operand(op); 3999 fetch_register_operand(op);
3941 op->orig_val = op->val; 4000 op->orig_val = op->val;
3942 break; 4001 break;
@@ -3944,19 +4003,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3944 op->type = OP_MEM; 4003 op->type = OP_MEM;
3945 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4004 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3946 op->addr.mem.ea = 4005 op->addr.mem.ea =
3947 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); 4006 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
3948 op->addr.mem.seg = VCPU_SREG_ES; 4007 op->addr.mem.seg = VCPU_SREG_ES;
3949 op->val = 0; 4008 op->val = 0;
4009 op->count = 1;
3950 break; 4010 break;
3951 case OpDX: 4011 case OpDX:
3952 op->type = OP_REG; 4012 op->type = OP_REG;
3953 op->bytes = 2; 4013 op->bytes = 2;
3954 op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; 4014 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
3955 fetch_register_operand(op); 4015 fetch_register_operand(op);
3956 break; 4016 break;
3957 case OpCL: 4017 case OpCL:
3958 op->bytes = 1; 4018 op->bytes = 1;
3959 op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; 4019 op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
3960 break; 4020 break;
3961 case OpImmByte: 4021 case OpImmByte:
3962 rc = decode_imm(ctxt, op, 1, true); 4022 rc = decode_imm(ctxt, op, 1, true);
@@ -3987,9 +4047,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3987 op->type = OP_MEM; 4047 op->type = OP_MEM;
3988 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4048 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3989 op->addr.mem.ea = 4049 op->addr.mem.ea =
3990 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); 4050 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
3991 op->addr.mem.seg = seg_override(ctxt); 4051 op->addr.mem.seg = seg_override(ctxt);
3992 op->val = 0; 4052 op->val = 0;
4053 op->count = 1;
3993 break; 4054 break;
3994 case OpImmFAddr: 4055 case OpImmFAddr:
3995 op->type = OP_IMM; 4056 op->type = OP_IMM;
@@ -4293,9 +4354,10 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4293 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4294} 4355}
4295 4356
4357
4296int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4297{ 4359{
4298 struct x86_emulate_ops *ops = ctxt->ops; 4360 const struct x86_emulate_ops *ops = ctxt->ops;
4299 int rc = X86EMUL_CONTINUE; 4361 int rc = X86EMUL_CONTINUE;
4300 int saved_dst_type = ctxt->dst.type; 4362 int saved_dst_type = ctxt->dst.type;
4301 4363
@@ -4356,7 +4418,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4356 } 4418 }
4357 4419
4358 /* Instruction can only be executed in protected mode */ 4420 /* Instruction can only be executed in protected mode */
4359 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 4421 if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
4360 rc = emulate_ud(ctxt); 4422 rc = emulate_ud(ctxt);
4361 goto done; 4423 goto done;
4362 } 4424 }
@@ -4377,7 +4439,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4377 4439
4378 if (ctxt->rep_prefix && (ctxt->d & String)) { 4440 if (ctxt->rep_prefix && (ctxt->d & String)) {
4379 /* All REP prefixes have the same first termination condition */ 4441 /* All REP prefixes have the same first termination condition */
4380 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { 4442 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
4381 ctxt->eip = ctxt->_eip; 4443 ctxt->eip = ctxt->_eip;
4382 goto done; 4444 goto done;
4383 } 4445 }
@@ -4450,7 +4512,7 @@ special_insn:
4450 ctxt->dst.val = ctxt->src.addr.mem.ea; 4512 ctxt->dst.val = ctxt->src.addr.mem.ea;
4451 break; 4513 break;
4452 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 4514 case 0x90 ... 0x97: /* nop / xchg reg, rax */
4453 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) 4515 if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
4454 break; 4516 break;
4455 rc = em_xchg(ctxt); 4517 rc = em_xchg(ctxt);
4456 break; 4518 break;
@@ -4478,7 +4540,7 @@ special_insn:
4478 rc = em_grp2(ctxt); 4540 rc = em_grp2(ctxt);
4479 break; 4541 break;
4480 case 0xd2 ... 0xd3: /* Grp2 */ 4542 case 0xd2 ... 0xd3: /* Grp2 */
4481 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; 4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4482 rc = em_grp2(ctxt); 4544 rc = em_grp2(ctxt);
4483 break; 4545 break;
4484 case 0xe9: /* jmp rel */ 4546 case 0xe9: /* jmp rel */
@@ -4524,23 +4586,27 @@ writeback:
4524 ctxt->dst.type = saved_dst_type; 4586 ctxt->dst.type = saved_dst_type;
4525 4587
4526 if ((ctxt->d & SrcMask) == SrcSI) 4588 if ((ctxt->d & SrcMask) == SrcSI)
4527 string_addr_inc(ctxt, seg_override(ctxt), 4589 string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
4528 VCPU_REGS_RSI, &ctxt->src);
4529 4590
4530 if ((ctxt->d & DstMask) == DstDI) 4591 if ((ctxt->d & DstMask) == DstDI)
4531 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4592 string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
4532 &ctxt->dst);
4533 4593
4534 if (ctxt->rep_prefix && (ctxt->d & String)) { 4594 if (ctxt->rep_prefix && (ctxt->d & String)) {
4595 unsigned int count;
4535 struct read_cache *r = &ctxt->io_read; 4596 struct read_cache *r = &ctxt->io_read;
4536 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); 4597 if ((ctxt->d & SrcMask) == SrcSI)
4598 count = ctxt->src.count;
4599 else
4600 count = ctxt->dst.count;
4601 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
4602 -count);
4537 4603
4538 if (!string_insn_completed(ctxt)) { 4604 if (!string_insn_completed(ctxt)) {
4539 /* 4605 /*
4540 * Re-enter guest when pio read ahead buffer is empty 4606 * Re-enter guest when pio read ahead buffer is empty
4541 * or, if it is not used, after each 1024 iteration. 4607 * or, if it is not used, after each 1024 iteration.
4542 */ 4608 */
4543 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && 4609 if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
4544 (r->end == 0 || r->end != r->pos)) { 4610 (r->end == 0 || r->end != r->pos)) {
4545 /* 4611 /*
4546 * Reset read cache. Usually happens before 4612 * Reset read cache. Usually happens before
@@ -4548,6 +4614,7 @@ writeback:
4548 * we have to do it here. 4614 * we have to do it here.
4549 */ 4615 */
4550 ctxt->mem_read.end = 0; 4616 ctxt->mem_read.end = 0;
4617 writeback_registers(ctxt);
4551 return EMULATION_RESTART; 4618 return EMULATION_RESTART;
4552 } 4619 }
4553 goto done; /* skip rip writeback */ 4620 goto done; /* skip rip writeback */
@@ -4562,6 +4629,9 @@ done:
4562 if (rc == X86EMUL_INTERCEPTED) 4629 if (rc == X86EMUL_INTERCEPTED)
4563 return EMULATION_INTERCEPTED; 4630 return EMULATION_INTERCEPTED;
4564 4631
4632 if (rc == X86EMUL_CONTINUE)
4633 writeback_registers(ctxt);
4634
4565 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4635 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
4566 4636
4567twobyte_insn: 4637twobyte_insn:
@@ -4634,3 +4704,13 @@ twobyte_insn:
4634cannot_emulate: 4704cannot_emulate:
4635 return EMULATION_FAILED; 4705 return EMULATION_FAILED;
4636} 4706}
4707
4708void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
4709{
4710 invalidate_registers(ctxt);
4711}
4712
4713void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
4714{
4715 writeback_registers(ctxt);
4716}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index adba28f88d1a..11300d2fa714 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
108 ktime_t remaining; 108 ktime_t remaining;
109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
110 110
111 if (!ps->pit_timer.period) 111 if (!ps->period)
112 return 0; 112 return 0;
113 113
114 /* 114 /*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
120 * itself with the initial count and continues counting 120 * itself with the initial count and continues counting
121 * from there. 121 * from there.
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->pit_timer.timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->pit_timer.period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->pit_timer.period); 125 elapsed = mod_64(elapsed, ps->period);
126 126
127 return elapsed; 127 return elapsed;
128} 128}
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
238 int value; 238 int value;
239 239
240 spin_lock(&ps->inject_lock); 240 spin_lock(&ps->inject_lock);
241 value = atomic_dec_return(&ps->pit_timer.pending); 241 value = atomic_dec_return(&ps->pending);
242 if (value < 0) 242 if (value < 0)
243 /* spurious acks can be generated if, for example, the 243 /* spurious acks can be generated if, for example, the
244 * PIC is being reset. Handle it gracefully here 244 * PIC is being reset. Handle it gracefully here
245 */ 245 */
246 atomic_inc(&ps->pit_timer.pending); 246 atomic_inc(&ps->pending);
247 else if (value > 0) 247 else if (value > 0)
248 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
249 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
261 if (!kvm_vcpu_is_bsp(vcpu) || !pit) 261 if (!kvm_vcpu_is_bsp(vcpu) || !pit)
262 return; 262 return;
263 263
264 timer = &pit->pit_state.pit_timer.timer; 264 timer = &pit->pit_state.timer;
265 if (hrtimer_cancel(timer)) 265 if (hrtimer_cancel(timer))
266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
267} 267}
268 268
269static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
270{ 270{
271 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.timer);
272 flush_kthread_work(&pit->expired); 272 flush_kthread_work(&pit->expired);
273} 273}
274 274
275static bool kpit_is_periodic(struct kvm_timer *ktimer)
276{
277 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
278 pit_timer);
279 return ps->is_periodic;
280}
281
282static struct kvm_timer_ops kpit_ops = {
283 .is_periodic = kpit_is_periodic,
284};
285
286static void pit_do_work(struct kthread_work *work) 275static void pit_do_work(struct kthread_work *work)
287{ 276{
288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 277 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
322 311
323static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 312static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
324{ 313{
325 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 314 struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
326 struct kvm_pit *pt = ktimer->kvm->arch.vpit; 315 struct kvm_pit *pt = ps->kvm->arch.vpit;
327 316
328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 317 if (ps->reinject || !atomic_read(&ps->pending)) {
329 atomic_inc(&ktimer->pending); 318 atomic_inc(&ps->pending);
330 queue_kthread_work(&pt->worker, &pt->expired); 319 queue_kthread_work(&pt->worker, &pt->expired);
331 } 320 }
332 321
333 if (ktimer->t_ops->is_periodic(ktimer)) { 322 if (ps->is_periodic) {
334 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 323 hrtimer_add_expires_ns(&ps->timer, ps->period);
335 return HRTIMER_RESTART; 324 return HRTIMER_RESTART;
336 } else 325 } else
337 return HRTIMER_NORESTART; 326 return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
340static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) 329static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
341{ 330{
342 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 331 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
343 struct kvm_timer *pt = &ps->pit_timer;
344 s64 interval; 332 s64 interval;
345 333
346 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) 334 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
351 pr_debug("create pit timer, interval is %llu nsec\n", interval); 339 pr_debug("create pit timer, interval is %llu nsec\n", interval);
352 340
353 /* TODO The new value only affected after the retriggered */ 341 /* TODO The new value only affected after the retriggered */
354 hrtimer_cancel(&pt->timer); 342 hrtimer_cancel(&ps->timer);
355 flush_kthread_work(&ps->pit->expired); 343 flush_kthread_work(&ps->pit->expired);
356 pt->period = interval; 344 ps->period = interval;
357 ps->is_periodic = is_period; 345 ps->is_periodic = is_period;
358 346
359 pt->timer.function = pit_timer_fn; 347 ps->timer.function = pit_timer_fn;
360 pt->t_ops = &kpit_ops; 348 ps->kvm = ps->pit->kvm;
361 pt->kvm = ps->pit->kvm;
362 349
363 atomic_set(&pt->pending, 0); 350 atomic_set(&ps->pending, 0);
364 ps->irq_ack = 1; 351 ps->irq_ack = 1;
365 352
366 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 353 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
367 HRTIMER_MODE_ABS); 354 HRTIMER_MODE_ABS);
368} 355}
369 356
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
639 } 626 }
640 mutex_unlock(&pit->pit_state.lock); 627 mutex_unlock(&pit->pit_state.lock);
641 628
642 atomic_set(&pit->pit_state.pit_timer.pending, 0); 629 atomic_set(&pit->pit_state.pending, 0);
643 pit->pit_state.irq_ack = 1; 630 pit->pit_state.irq_ack = 1;
644} 631}
645 632
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
648 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); 635 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
649 636
650 if (!mask) { 637 if (!mask) {
651 atomic_set(&pit->pit_state.pit_timer.pending, 0); 638 atomic_set(&pit->pit_state.pending, 0);
652 pit->pit_state.irq_ack = 1; 639 pit->pit_state.irq_ack = 1;
653 } 640 }
654} 641}
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
706 693
707 pit_state = &pit->pit_state; 694 pit_state = &pit->pit_state;
708 pit_state->pit = pit; 695 pit_state->pit = pit;
709 hrtimer_init(&pit_state->pit_timer.timer, 696 hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
710 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
711 pit_state->irq_ack_notifier.gsi = 0; 697 pit_state->irq_ack_notifier.gsi = 0;
712 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; 698 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
713 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 699 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
714 pit_state->pit_timer.reinject = true; 700 pit_state->reinject = true;
715 mutex_unlock(&pit->pit_state.lock); 701 mutex_unlock(&pit->pit_state.lock);
716 702
717 kvm_pit_reset(pit); 703 kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
761 kvm_unregister_irq_ack_notifier(kvm, 747 kvm_unregister_irq_ack_notifier(kvm,
762 &kvm->arch.vpit->pit_state.irq_ack_notifier); 748 &kvm->arch.vpit->pit_state.irq_ack_notifier);
763 mutex_lock(&kvm->arch.vpit->pit_state.lock); 749 mutex_lock(&kvm->arch.vpit->pit_state.lock);
764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 750 timer = &kvm->arch.vpit->pit_state.timer;
765 hrtimer_cancel(timer); 751 hrtimer_cancel(timer);
766 flush_kthread_work(&kvm->arch.vpit->expired); 752 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task); 753 kthread_stop(kvm->arch.vpit->worker_task);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fdf40425ea1d..dd1b16b611b0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
24struct kvm_kpit_state { 24struct kvm_kpit_state {
25 struct kvm_kpit_channel_state channels[3]; 25 struct kvm_kpit_channel_state channels[3];
26 u32 flags; 26 u32 flags;
27 struct kvm_timer pit_timer;
28 bool is_periodic; 27 bool is_periodic;
28 s64 period; /* unit: ns */
29 struct hrtimer timer;
30 atomic_t pending; /* accumulated triggered timers */
31 bool reinject;
32 struct kvm *kvm;
29 u32 speaker_data_on; 33 u32 speaker_data_on;
30 struct mutex lock; 34 struct mutex lock;
31 struct kvm_pit *pit; 35 struct kvm_pit *pit;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9fc9aa7ac703..848206df0967 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
190 190
191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 int ret = -1; 193 int ret, irq_level;
194
195 BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
194 196
195 pic_lock(s); 197 pic_lock(s);
196 if (irq >= 0 && irq < PIC_NUM_PINS) { 198 irq_level = __kvm_irq_line_state(&s->irq_states[irq],
197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq], 199 irq_source_id, level);
198 irq_source_id, level); 200 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); 201 pic_update_irq(s);
200 pic_update_irq(s); 202 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 203 s->pics[irq >> 3].imr, ret == 0);
202 s->pics[irq >> 3].imr, ret == 0);
203 }
204 pic_unlock(s); 204 pic_unlock(s);
205 205
206 return ret; 206 return ret;
@@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
275{ 275{
276 int irq, i; 276 int irq, i;
277 struct kvm_vcpu *vcpu; 277 struct kvm_vcpu *vcpu;
278 u8 irr = s->irr, isr = s->imr; 278 u8 edge_irr = s->irr & ~s->elcr;
279 bool found = false; 279 bool found = false;
280 280
281 s->last_irr = 0; 281 s->last_irr = 0;
282 s->irr = 0; 282 s->irr &= s->elcr;
283 s->imr = 0; 283 s->imr = 0;
284 s->isr = 0;
285 s->priority_add = 0; 284 s->priority_add = 0;
286 s->irq_base = 0;
287 s->read_reg_select = 0;
288 s->poll = 0;
289 s->special_mask = 0; 285 s->special_mask = 0;
290 s->init_state = 0; 286 s->read_reg_select = 0;
291 s->auto_eoi = 0; 287 if (!s->init4) {
292 s->rotate_on_auto_eoi = 0; 288 s->special_fully_nested_mode = 0;
293 s->special_fully_nested_mode = 0; 289 s->auto_eoi = 0;
294 s->init4 = 0; 290 }
291 s->init_state = 1;
295 292
296 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) 293 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
297 if (kvm_apic_accept_pic_intr(vcpu)) { 294 if (kvm_apic_accept_pic_intr(vcpu)) {
@@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
304 return; 301 return;
305 302
306 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) 303 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
307 if (irr & (1 << irq) || isr & (1 << irq)) 304 if (edge_irr & (1 << irq))
308 pic_clear_isr(s, irq); 305 pic_clear_isr(s, irq);
309} 306}
310 307
@@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
316 addr &= 1; 313 addr &= 1;
317 if (addr == 0) { 314 if (addr == 0) {
318 if (val & 0x10) { 315 if (val & 0x10) {
319 u8 edge_irr = s->irr & ~s->elcr;
320 int i;
321 bool found = false;
322 struct kvm_vcpu *vcpu;
323
324 s->init4 = val & 1; 316 s->init4 = val & 1;
325 s->last_irr = 0;
326 s->irr &= s->elcr;
327 s->imr = 0;
328 s->priority_add = 0;
329 s->special_mask = 0;
330 s->read_reg_select = 0;
331 if (!s->init4) {
332 s->special_fully_nested_mode = 0;
333 s->auto_eoi = 0;
334 }
335 s->init_state = 1;
336 if (val & 0x02) 317 if (val & 0x02)
337 pr_pic_unimpl("single mode not supported"); 318 pr_pic_unimpl("single mode not supported");
338 if (val & 0x08) 319 if (val & 0x08)
339 pr_pic_unimpl( 320 pr_pic_unimpl(
340 "level sensitive irq not supported"); 321 "level sensitive irq not supported");
341 322 kvm_pic_reset(s);
342 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
343 if (kvm_apic_accept_pic_intr(vcpu)) {
344 found = true;
345 break;
346 }
347
348
349 if (found)
350 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
351 if (edge_irr & (1 << irq))
352 pic_clear_isr(s, irq);
353 } else if (val & 0x08) { 323 } else if (val & 0x08) {
354 if (val & 0x04) 324 if (val & 0x04)
355 s->poll = 1; 325 s->poll = 1;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba33..2d03568e9498 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
70 struct kvm_io_device dev_slave; 70 struct kvm_io_device dev_slave;
71 struct kvm_io_device dev_eclr; 71 struct kvm_io_device dev_eclr;
72 void (*ack_notifier)(void *opaque, int irq); 72 void (*ack_notifier)(void *opaque, int irq);
73 unsigned long irq_states[16]; 73 unsigned long irq_states[PIC_NUM_PINS];
74}; 74};
75 75
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 76struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa366d4..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 u32 timer_mode_mask;
6 u64 tscdeadline;
7 atomic_t pending; /* accumulated triggered timers */
8 bool reinject;
9 struct kvm_timer_ops *t_ops;
10 struct kvm *kvm;
11 struct kvm_vcpu *vcpu;
12};
13
14struct kvm_timer_ops {
15 bool (*is_periodic)(struct kvm_timer *);
16};
17
18enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39f..c6e6b721b6ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
34#include <asm/current.h> 34#include <asm/current.h>
35#include <asm/apicdef.h> 35#include <asm/apicdef.h>
36#include <linux/atomic.h> 36#include <linux/atomic.h>
37#include <linux/jump_label.h>
37#include "kvm_cache_regs.h" 38#include "kvm_cache_regs.h"
38#include "irq.h" 39#include "irq.h"
39#include "trace.h" 40#include "trace.h"
@@ -65,6 +66,7 @@
65#define APIC_DEST_NOSHORT 0x0 66#define APIC_DEST_NOSHORT 0x0
66#define APIC_DEST_MASK 0x800 67#define APIC_DEST_MASK 0x800
67#define MAX_APIC_VECTOR 256 68#define MAX_APIC_VECTOR 256
69#define APIC_VECTORS_PER_REG 32
68 70
69#define VEC_POS(v) ((v) & (32 - 1)) 71#define VEC_POS(v) ((v) & (32 - 1))
70#define REG_POS(v) (((v) >> 5) << 4) 72#define REG_POS(v) (((v) >> 5) << 4)
@@ -72,11 +74,6 @@
72static unsigned int min_timer_period_us = 500; 74static unsigned int min_timer_period_us = 500;
73module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); 75module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
74 76
75static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
76{
77 return *((u32 *) (apic->regs + reg_off));
78}
79
80static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) 77static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
81{ 78{
82 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +114,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 114 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118} 115}
119 116
120static inline int apic_hw_enabled(struct kvm_lapic *apic) 117struct static_key_deferred apic_hw_disabled __read_mostly;
121{ 118struct static_key_deferred apic_sw_disabled __read_mostly;
122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
123}
124 119
125static inline int apic_sw_enabled(struct kvm_lapic *apic) 120static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
126{ 121{
127 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; 122 if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
123 if (val & APIC_SPIV_APIC_ENABLED)
124 static_key_slow_dec_deferred(&apic_sw_disabled);
125 else
126 static_key_slow_inc(&apic_sw_disabled.key);
127 }
128 apic_set_reg(apic, APIC_SPIV, val);
128} 129}
129 130
130static inline int apic_enabled(struct kvm_lapic *apic) 131static inline int apic_enabled(struct kvm_lapic *apic)
131{ 132{
132 return apic_sw_enabled(apic) && apic_hw_enabled(apic); 133 return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
133} 134}
134 135
135#define LVT_MASK \ 136#define LVT_MASK \
@@ -139,36 +140,135 @@ static inline int apic_enabled(struct kvm_lapic *apic)
139 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
140 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
141 142
143static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144{
145 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
146}
147
142static inline int kvm_apic_id(struct kvm_lapic *apic) 148static inline int kvm_apic_id(struct kvm_lapic *apic)
143{ 149{
144 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 150 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
151}
152
153static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
154{
155 u16 cid;
156 ldr >>= 32 - map->ldr_bits;
157 cid = (ldr >> map->cid_shift) & map->cid_mask;
158
159 BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
160
161 return cid;
162}
163
164static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
165{
166 ldr >>= (32 - map->ldr_bits);
167 return ldr & map->lid_mask;
168}
169
170static void recalculate_apic_map(struct kvm *kvm)
171{
172 struct kvm_apic_map *new, *old = NULL;
173 struct kvm_vcpu *vcpu;
174 int i;
175
176 new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
177
178 mutex_lock(&kvm->arch.apic_map_lock);
179
180 if (!new)
181 goto out;
182
183 new->ldr_bits = 8;
184 /* flat mode is default */
185 new->cid_shift = 8;
186 new->cid_mask = 0;
187 new->lid_mask = 0xff;
188
189 kvm_for_each_vcpu(i, vcpu, kvm) {
190 struct kvm_lapic *apic = vcpu->arch.apic;
191 u16 cid, lid;
192 u32 ldr;
193
194 if (!kvm_apic_present(vcpu))
195 continue;
196
197 /*
198 * All APICs have to be configured in the same mode by an OS.
199 * We take advatage of this while building logical id loockup
200 * table. After reset APICs are in xapic/flat mode, so if we
201 * find apic with different setting we assume this is the mode
202 * OS wants all apics to be in; build lookup table accordingly.
203 */
204 if (apic_x2apic_mode(apic)) {
205 new->ldr_bits = 32;
206 new->cid_shift = 16;
207 new->cid_mask = new->lid_mask = 0xffff;
208 } else if (kvm_apic_sw_enabled(apic) &&
209 !new->cid_mask /* flat mode */ &&
210 kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
211 new->cid_shift = 4;
212 new->cid_mask = 0xf;
213 new->lid_mask = 0xf;
214 }
215
216 new->phys_map[kvm_apic_id(apic)] = apic;
217
218 ldr = kvm_apic_get_reg(apic, APIC_LDR);
219 cid = apic_cluster_id(new, ldr);
220 lid = apic_logical_id(new, ldr);
221
222 if (lid)
223 new->logical_map[cid][ffs(lid) - 1] = apic;
224 }
225out:
226 old = rcu_dereference_protected(kvm->arch.apic_map,
227 lockdep_is_held(&kvm->arch.apic_map_lock));
228 rcu_assign_pointer(kvm->arch.apic_map, new);
229 mutex_unlock(&kvm->arch.apic_map_lock);
230
231 if (old)
232 kfree_rcu(old, rcu);
233}
234
235static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
236{
237 apic_set_reg(apic, APIC_ID, id << 24);
238 recalculate_apic_map(apic->vcpu->kvm);
239}
240
241static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
242{
243 apic_set_reg(apic, APIC_LDR, id);
244 recalculate_apic_map(apic->vcpu->kvm);
145} 245}
146 246
147static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) 247static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
148{ 248{
149 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); 249 return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
150} 250}
151 251
152static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) 252static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
153{ 253{
154 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; 254 return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
155} 255}
156 256
157static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) 257static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
158{ 258{
159 return ((apic_get_reg(apic, APIC_LVTT) & 259 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
160 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); 260 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
161} 261}
162 262
163static inline int apic_lvtt_period(struct kvm_lapic *apic) 263static inline int apic_lvtt_period(struct kvm_lapic *apic)
164{ 264{
165 return ((apic_get_reg(apic, APIC_LVTT) & 265 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
166 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); 266 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
167} 267}
168 268
169static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) 269static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
170{ 270{
171 return ((apic_get_reg(apic, APIC_LVTT) & 271 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
172 apic->lapic_timer.timer_mode_mask) == 272 apic->lapic_timer.timer_mode_mask) ==
173 APIC_LVT_TIMER_TSCDEADLINE); 273 APIC_LVT_TIMER_TSCDEADLINE);
174} 274}
@@ -184,7 +284,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
184 struct kvm_cpuid_entry2 *feat; 284 struct kvm_cpuid_entry2 *feat;
185 u32 v = APIC_VERSION; 285 u32 v = APIC_VERSION;
186 286
187 if (!irqchip_in_kernel(vcpu->kvm)) 287 if (!kvm_vcpu_has_lapic(vcpu))
188 return; 288 return;
189 289
190 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); 290 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -193,12 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
193 apic_set_reg(apic, APIC_LVR, v); 293 apic_set_reg(apic, APIC_LVR, v);
194} 294}
195 295
196static inline int apic_x2apic_mode(struct kvm_lapic *apic) 296static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
197{
198 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
199}
200
201static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
202 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ 297 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
203 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 298 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
204 LVT_MASK | APIC_MODE_MASK, /* LVTPC */ 299 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
@@ -208,25 +303,30 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
208 303
209static int find_highest_vector(void *bitmap) 304static int find_highest_vector(void *bitmap)
210{ 305{
211 u32 *word = bitmap; 306 int vec;
212 int word_offset = MAX_APIC_VECTOR >> 5; 307 u32 *reg;
213 308
214 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) 309 for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
215 continue; 310 vec >= 0; vec -= APIC_VECTORS_PER_REG) {
311 reg = bitmap + REG_POS(vec);
312 if (*reg)
313 return fls(*reg) - 1 + vec;
314 }
216 315
217 if (likely(!word_offset && !word[0])) 316 return -1;
218 return -1;
219 else
220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
221} 317}
222 318
223static u8 count_vectors(void *bitmap) 319static u8 count_vectors(void *bitmap)
224{ 320{
225 u32 *word = bitmap; 321 int vec;
226 int word_offset; 322 u32 *reg;
227 u8 count = 0; 323 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) 324
229 count += hweight32(word[word_offset << 2]); 325 for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
326 reg = bitmap + REG_POS(vec);
327 count += hweight32(*reg);
328 }
329
230 return count; 330 return count;
231} 331}
232 332
@@ -285,7 +385,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
285 385
286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 386int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
287{ 387{
288 struct kvm_lapic *apic = vcpu->arch.apic;
289 int highest_irr; 388 int highest_irr;
290 389
291 /* This may race with setting of irr in __apic_accept_irq() and 390 /* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +392,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
293 * will cause vmexit immediately and the value will be recalculated 392 * will cause vmexit immediately and the value will be recalculated
294 * on the next vmentry. 393 * on the next vmentry.
295 */ 394 */
296 if (!apic) 395 if (!kvm_vcpu_has_lapic(vcpu))
297 return 0; 396 return 0;
298 highest_irr = apic_find_highest_irr(apic); 397 highest_irr = apic_find_highest_irr(vcpu->arch.apic);
299 398
300 return highest_irr; 399 return highest_irr;
301} 400}
@@ -378,8 +477,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
378 u32 tpr, isrv, ppr, old_ppr; 477 u32 tpr, isrv, ppr, old_ppr;
379 int isr; 478 int isr;
380 479
381 old_ppr = apic_get_reg(apic, APIC_PROCPRI); 480 old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
382 tpr = apic_get_reg(apic, APIC_TASKPRI); 481 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
383 isr = apic_find_highest_isr(apic); 482 isr = apic_find_highest_isr(apic);
384 isrv = (isr != -1) ? isr : 0; 483 isrv = (isr != -1) ? isr : 0;
385 484
@@ -415,13 +514,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
415 u32 logical_id; 514 u32 logical_id;
416 515
417 if (apic_x2apic_mode(apic)) { 516 if (apic_x2apic_mode(apic)) {
418 logical_id = apic_get_reg(apic, APIC_LDR); 517 logical_id = kvm_apic_get_reg(apic, APIC_LDR);
419 return logical_id & mda; 518 return logical_id & mda;
420 } 519 }
421 520
422 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); 521 logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
423 522
424 switch (apic_get_reg(apic, APIC_DFR)) { 523 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
425 case APIC_DFR_FLAT: 524 case APIC_DFR_FLAT:
426 if (logical_id & mda) 525 if (logical_id & mda)
427 result = 1; 526 result = 1;
@@ -433,7 +532,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
433 break; 532 break;
434 default: 533 default:
435 apic_debug("Bad DFR vcpu %d: %08x\n", 534 apic_debug("Bad DFR vcpu %d: %08x\n",
436 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); 535 apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
437 break; 536 break;
438 } 537 }
439 538
@@ -478,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
478 return result; 577 return result;
479} 578}
480 579
580bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
581 struct kvm_lapic_irq *irq, int *r)
582{
583 struct kvm_apic_map *map;
584 unsigned long bitmap = 1;
585 struct kvm_lapic **dst;
586 int i;
587 bool ret = false;
588
589 *r = -1;
590
591 if (irq->shorthand == APIC_DEST_SELF) {
592 *r = kvm_apic_set_irq(src->vcpu, irq);
593 return true;
594 }
595
596 if (irq->shorthand)
597 return false;
598
599 rcu_read_lock();
600 map = rcu_dereference(kvm->arch.apic_map);
601
602 if (!map)
603 goto out;
604
605 if (irq->dest_mode == 0) { /* physical mode */
606 if (irq->delivery_mode == APIC_DM_LOWEST ||
607 irq->dest_id == 0xff)
608 goto out;
609 dst = &map->phys_map[irq->dest_id & 0xff];
610 } else {
611 u32 mda = irq->dest_id << (32 - map->ldr_bits);
612
613 dst = map->logical_map[apic_cluster_id(map, mda)];
614
615 bitmap = apic_logical_id(map, mda);
616
617 if (irq->delivery_mode == APIC_DM_LOWEST) {
618 int l = -1;
619 for_each_set_bit(i, &bitmap, 16) {
620 if (!dst[i])
621 continue;
622 if (l < 0)
623 l = i;
624 else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
625 l = i;
626 }
627
628 bitmap = (l >= 0) ? 1 << l : 0;
629 }
630 }
631
632 for_each_set_bit(i, &bitmap, 16) {
633 if (!dst[i])
634 continue;
635 if (*r < 0)
636 *r = 0;
637 *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
638 }
639
640 ret = true;
641out:
642 rcu_read_unlock();
643 return ret;
644}
645
481/* 646/*
482 * Add a pending IRQ into lapic. 647 * Add a pending IRQ into lapic.
483 * Return 1 if successfully added and 0 if discarded. 648 * Return 1 if successfully added and 0 if discarded.
@@ -591,7 +756,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
591 apic_clear_isr(vector, apic); 756 apic_clear_isr(vector, apic);
592 apic_update_ppr(apic); 757 apic_update_ppr(apic);
593 758
594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 759 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
595 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 760 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
596 int trigger_mode; 761 int trigger_mode;
597 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 762 if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +771,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
606 771
607static void apic_send_ipi(struct kvm_lapic *apic) 772static void apic_send_ipi(struct kvm_lapic *apic)
608{ 773{
609 u32 icr_low = apic_get_reg(apic, APIC_ICR); 774 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
610 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 775 u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
611 struct kvm_lapic_irq irq; 776 struct kvm_lapic_irq irq;
612 777
613 irq.vector = icr_low & APIC_VECTOR_MASK; 778 irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +807,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
642 ASSERT(apic != NULL); 807 ASSERT(apic != NULL);
643 808
644 /* if initial count is 0, current count should also be 0 */ 809 /* if initial count is 0, current count should also be 0 */
645 if (apic_get_reg(apic, APIC_TMICT) == 0) 810 if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
646 return 0; 811 return 0;
647 812
648 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); 813 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +861,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
696 861
697 val = apic_get_tmcct(apic); 862 val = apic_get_tmcct(apic);
698 break; 863 break;
699 864 case APIC_PROCPRI:
865 apic_update_ppr(apic);
866 val = kvm_apic_get_reg(apic, offset);
867 break;
700 case APIC_TASKPRI: 868 case APIC_TASKPRI:
701 report_tpr_access(apic, false); 869 report_tpr_access(apic, false);
702 /* fall thru */ 870 /* fall thru */
703 default: 871 default:
704 apic_update_ppr(apic); 872 val = kvm_apic_get_reg(apic, offset);
705 val = apic_get_reg(apic, offset);
706 break; 873 break;
707 } 874 }
708 875
@@ -719,7 +886,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
719{ 886{
720 unsigned char alignment = offset & 0xf; 887 unsigned char alignment = offset & 0xf;
721 u32 result; 888 u32 result;
722 /* this bitmask has a bit cleared for each reserver register */ 889 /* this bitmask has a bit cleared for each reserved register */
723 static const u64 rmask = 0x43ff01ffffffe70cULL; 890 static const u64 rmask = 0x43ff01ffffffe70cULL;
724 891
725 if ((alignment + len) > 4) { 892 if ((alignment + len) > 4) {
@@ -754,7 +921,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
754 921
755static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) 922static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
756{ 923{
757 return apic_hw_enabled(apic) && 924 return kvm_apic_hw_enabled(apic) &&
758 addr >= apic->base_address && 925 addr >= apic->base_address &&
759 addr < apic->base_address + LAPIC_MMIO_LENGTH; 926 addr < apic->base_address + LAPIC_MMIO_LENGTH;
760} 927}
@@ -777,7 +944,7 @@ static void update_divide_count(struct kvm_lapic *apic)
777{ 944{
778 u32 tmp1, tmp2, tdcr; 945 u32 tmp1, tmp2, tdcr;
779 946
780 tdcr = apic_get_reg(apic, APIC_TDCR); 947 tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
781 tmp1 = tdcr & 0xf; 948 tmp1 = tdcr & 0xf;
782 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 949 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
783 apic->divide_count = 0x1 << (tmp2 & 0x7); 950 apic->divide_count = 0x1 << (tmp2 & 0x7);
@@ -792,9 +959,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
792 atomic_set(&apic->lapic_timer.pending, 0); 959 atomic_set(&apic->lapic_timer.pending, 0);
793 960
794 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { 961 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
795 /* lapic timer in oneshot or peroidic mode */ 962 /* lapic timer in oneshot or periodic mode */
796 now = apic->lapic_timer.timer.base->get_time(); 963 now = apic->lapic_timer.timer.base->get_time();
797 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) 964 apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
798 * APIC_BUS_CYCLE_NS * apic->divide_count; 965 * APIC_BUS_CYCLE_NS * apic->divide_count;
799 966
800 if (!apic->lapic_timer.period) 967 if (!apic->lapic_timer.period)
@@ -826,7 +993,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
826 "timer initial count 0x%x, period %lldns, " 993 "timer initial count 0x%x, period %lldns, "
827 "expire @ 0x%016" PRIx64 ".\n", __func__, 994 "expire @ 0x%016" PRIx64 ".\n", __func__,
828 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 995 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
829 apic_get_reg(apic, APIC_TMICT), 996 kvm_apic_get_reg(apic, APIC_TMICT),
830 apic->lapic_timer.period, 997 apic->lapic_timer.period,
831 ktime_to_ns(ktime_add_ns(now, 998 ktime_to_ns(ktime_add_ns(now,
832 apic->lapic_timer.period))); 999 apic->lapic_timer.period)));
@@ -858,7 +1025,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
858 1025
859static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 1026static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
860{ 1027{
861 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); 1028 int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
862 1029
863 if (apic_lvt_nmi_mode(lvt0_val)) { 1030 if (apic_lvt_nmi_mode(lvt0_val)) {
864 if (!nmi_wd_enabled) { 1031 if (!nmi_wd_enabled) {
@@ -879,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
879 switch (reg) { 1046 switch (reg) {
880 case APIC_ID: /* Local APIC ID */ 1047 case APIC_ID: /* Local APIC ID */
881 if (!apic_x2apic_mode(apic)) 1048 if (!apic_x2apic_mode(apic))
882 apic_set_reg(apic, APIC_ID, val); 1049 kvm_apic_set_id(apic, val >> 24);
883 else 1050 else
884 ret = 1; 1051 ret = 1;
885 break; 1052 break;
@@ -895,29 +1062,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
895 1062
896 case APIC_LDR: 1063 case APIC_LDR:
897 if (!apic_x2apic_mode(apic)) 1064 if (!apic_x2apic_mode(apic))
898 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); 1065 kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
899 else 1066 else
900 ret = 1; 1067 ret = 1;
901 break; 1068 break;
902 1069
903 case APIC_DFR: 1070 case APIC_DFR:
904 if (!apic_x2apic_mode(apic)) 1071 if (!apic_x2apic_mode(apic)) {
905 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); 1072 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
906 else 1073 recalculate_apic_map(apic->vcpu->kvm);
1074 } else
907 ret = 1; 1075 ret = 1;
908 break; 1076 break;
909 1077
910 case APIC_SPIV: { 1078 case APIC_SPIV: {
911 u32 mask = 0x3ff; 1079 u32 mask = 0x3ff;
912 if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) 1080 if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
913 mask |= APIC_SPIV_DIRECTED_EOI; 1081 mask |= APIC_SPIV_DIRECTED_EOI;
914 apic_set_reg(apic, APIC_SPIV, val & mask); 1082 apic_set_spiv(apic, val & mask);
915 if (!(val & APIC_SPIV_APIC_ENABLED)) { 1083 if (!(val & APIC_SPIV_APIC_ENABLED)) {
916 int i; 1084 int i;
917 u32 lvt_val; 1085 u32 lvt_val;
918 1086
919 for (i = 0; i < APIC_LVT_NUM; i++) { 1087 for (i = 0; i < APIC_LVT_NUM; i++) {
920 lvt_val = apic_get_reg(apic, 1088 lvt_val = kvm_apic_get_reg(apic,
921 APIC_LVTT + 0x10 * i); 1089 APIC_LVTT + 0x10 * i);
922 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 1090 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
923 lvt_val | APIC_LVT_MASKED); 1091 lvt_val | APIC_LVT_MASKED);
@@ -946,7 +1114,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
946 case APIC_LVT1: 1114 case APIC_LVT1:
947 case APIC_LVTERR: 1115 case APIC_LVTERR:
948 /* TODO: Check vector */ 1116 /* TODO: Check vector */
949 if (!apic_sw_enabled(apic)) 1117 if (!kvm_apic_sw_enabled(apic))
950 val |= APIC_LVT_MASKED; 1118 val |= APIC_LVT_MASKED;
951 1119
952 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; 1120 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +1123,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
955 break; 1123 break;
956 1124
957 case APIC_LVTT: 1125 case APIC_LVTT:
958 if ((apic_get_reg(apic, APIC_LVTT) & 1126 if ((kvm_apic_get_reg(apic, APIC_LVTT) &
959 apic->lapic_timer.timer_mode_mask) != 1127 apic->lapic_timer.timer_mode_mask) !=
960 (val & apic->lapic_timer.timer_mode_mask)) 1128 (val & apic->lapic_timer.timer_mode_mask))
961 hrtimer_cancel(&apic->lapic_timer.timer); 1129 hrtimer_cancel(&apic->lapic_timer.timer);
962 1130
963 if (!apic_sw_enabled(apic)) 1131 if (!kvm_apic_sw_enabled(apic))
964 val |= APIC_LVT_MASKED; 1132 val |= APIC_LVT_MASKED;
965 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); 1133 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
966 apic_set_reg(apic, APIC_LVTT, val); 1134 apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1207,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
1039 1207
1040void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) 1208void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
1041{ 1209{
1042 struct kvm_lapic *apic = vcpu->arch.apic; 1210 if (kvm_vcpu_has_lapic(vcpu))
1043
1044 if (apic)
1045 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); 1211 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
1046} 1212}
1047EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1213EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
1048 1214
1049void kvm_free_lapic(struct kvm_vcpu *vcpu) 1215void kvm_free_lapic(struct kvm_vcpu *vcpu)
1050{ 1216{
1217 struct kvm_lapic *apic = vcpu->arch.apic;
1218
1051 if (!vcpu->arch.apic) 1219 if (!vcpu->arch.apic)
1052 return; 1220 return;
1053 1221
1054 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 1222 hrtimer_cancel(&apic->lapic_timer.timer);
1223
1224 if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
1225 static_key_slow_dec_deferred(&apic_hw_disabled);
1226
1227 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
1228 static_key_slow_dec_deferred(&apic_sw_disabled);
1055 1229
1056 if (vcpu->arch.apic->regs) 1230 if (apic->regs)
1057 free_page((unsigned long)vcpu->arch.apic->regs); 1231 free_page((unsigned long)apic->regs);
1058 1232
1059 kfree(vcpu->arch.apic); 1233 kfree(apic);
1060} 1234}
1061 1235
1062/* 1236/*
@@ -1068,10 +1242,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
1068u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) 1242u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1069{ 1243{
1070 struct kvm_lapic *apic = vcpu->arch.apic; 1244 struct kvm_lapic *apic = vcpu->arch.apic;
1071 if (!apic)
1072 return 0;
1073 1245
1074 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1246 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1247 apic_lvtt_period(apic))
1075 return 0; 1248 return 0;
1076 1249
1077 return apic->lapic_timer.tscdeadline; 1250 return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1253,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1080void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) 1253void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
1081{ 1254{
1082 struct kvm_lapic *apic = vcpu->arch.apic; 1255 struct kvm_lapic *apic = vcpu->arch.apic;
1083 if (!apic)
1084 return;
1085 1256
1086 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1257 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1258 apic_lvtt_period(apic))
1087 return; 1259 return;
1088 1260
1089 hrtimer_cancel(&apic->lapic_timer.timer); 1261 hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1267,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
1095{ 1267{
1096 struct kvm_lapic *apic = vcpu->arch.apic; 1268 struct kvm_lapic *apic = vcpu->arch.apic;
1097 1269
1098 if (!apic) 1270 if (!kvm_vcpu_has_lapic(vcpu))
1099 return; 1271 return;
1272
1100 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 1273 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
1101 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 1274 | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
1102} 1275}
1103 1276
1104u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 1277u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
1105{ 1278{
1106 struct kvm_lapic *apic = vcpu->arch.apic;
1107 u64 tpr; 1279 u64 tpr;
1108 1280
1109 if (!apic) 1281 if (!kvm_vcpu_has_lapic(vcpu))
1110 return 0; 1282 return 0;
1111 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); 1283
1284 tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
1112 1285
1113 return (tpr & 0xf0) >> 4; 1286 return (tpr & 0xf0) >> 4;
1114} 1287}
@@ -1123,6 +1296,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1123 return; 1296 return;
1124 } 1297 }
1125 1298
1299 /* update jump label if enable bit changes */
1300 if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
1301 if (value & MSR_IA32_APICBASE_ENABLE)
1302 static_key_slow_dec_deferred(&apic_hw_disabled);
1303 else
1304 static_key_slow_inc(&apic_hw_disabled.key);
1305 recalculate_apic_map(vcpu->kvm);
1306 }
1307
1126 if (!kvm_vcpu_is_bsp(apic->vcpu)) 1308 if (!kvm_vcpu_is_bsp(apic->vcpu))
1127 value &= ~MSR_IA32_APICBASE_BSP; 1309 value &= ~MSR_IA32_APICBASE_BSP;
1128 1310
@@ -1130,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1130 if (apic_x2apic_mode(apic)) { 1312 if (apic_x2apic_mode(apic)) {
1131 u32 id = kvm_apic_id(apic); 1313 u32 id = kvm_apic_id(apic);
1132 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); 1314 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
1133 apic_set_reg(apic, APIC_LDR, ldr); 1315 kvm_apic_set_ldr(apic, ldr);
1134 } 1316 }
1135 apic->base_address = apic->vcpu->arch.apic_base & 1317 apic->base_address = apic->vcpu->arch.apic_base &
1136 MSR_IA32_APICBASE_BASE; 1318 MSR_IA32_APICBASE_BASE;
@@ -1155,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1155 /* Stop the timer in case it's a reset to an active apic */ 1337 /* Stop the timer in case it's a reset to an active apic */
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1338 hrtimer_cancel(&apic->lapic_timer.timer);
1157 1339
1158 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 1340 kvm_apic_set_id(apic, vcpu->vcpu_id);
1159 kvm_apic_set_version(apic->vcpu); 1341 kvm_apic_set_version(apic->vcpu);
1160 1342
1161 for (i = 0; i < APIC_LVT_NUM; i++) 1343 for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1164,9 +1346,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1164 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); 1346 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
1165 1347
1166 apic_set_reg(apic, APIC_DFR, 0xffffffffU); 1348 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
1167 apic_set_reg(apic, APIC_SPIV, 0xff); 1349 apic_set_spiv(apic, 0xff);
1168 apic_set_reg(apic, APIC_TASKPRI, 0); 1350 apic_set_reg(apic, APIC_TASKPRI, 0);
1169 apic_set_reg(apic, APIC_LDR, 0); 1351 kvm_apic_set_ldr(apic, 0);
1170 apic_set_reg(apic, APIC_ESR, 0); 1352 apic_set_reg(apic, APIC_ESR, 0);
1171 apic_set_reg(apic, APIC_ICR, 0); 1353 apic_set_reg(apic, APIC_ICR, 0);
1172 apic_set_reg(apic, APIC_ICR2, 0); 1354 apic_set_reg(apic, APIC_ICR2, 0);
@@ -1183,7 +1365,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1183 update_divide_count(apic); 1365 update_divide_count(apic);
1184 atomic_set(&apic->lapic_timer.pending, 0); 1366 atomic_set(&apic->lapic_timer.pending, 0);
1185 if (kvm_vcpu_is_bsp(vcpu)) 1367 if (kvm_vcpu_is_bsp(vcpu))
1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1368 kvm_lapic_set_base(vcpu,
1369 vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
1187 vcpu->arch.pv_eoi.msr_val = 0; 1370 vcpu->arch.pv_eoi.msr_val = 0;
1188 apic_update_ppr(apic); 1371 apic_update_ppr(apic);
1189 1372
@@ -1196,45 +1379,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1196 vcpu->arch.apic_base, apic->base_address); 1379 vcpu->arch.apic_base, apic->base_address);
1197} 1380}
1198 1381
1199bool kvm_apic_present(struct kvm_vcpu *vcpu)
1200{
1201 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
1202}
1203
1204int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
1205{
1206 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
1207}
1208
1209/* 1382/*
1210 *---------------------------------------------------------------------- 1383 *----------------------------------------------------------------------
1211 * timer interface 1384 * timer interface
1212 *---------------------------------------------------------------------- 1385 *----------------------------------------------------------------------
1213 */ 1386 */
1214 1387
1215static bool lapic_is_periodic(struct kvm_timer *ktimer) 1388static bool lapic_is_periodic(struct kvm_lapic *apic)
1216{ 1389{
1217 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
1218 lapic_timer);
1219 return apic_lvtt_period(apic); 1390 return apic_lvtt_period(apic);
1220} 1391}
1221 1392
1222int apic_has_pending_timer(struct kvm_vcpu *vcpu) 1393int apic_has_pending_timer(struct kvm_vcpu *vcpu)
1223{ 1394{
1224 struct kvm_lapic *lapic = vcpu->arch.apic; 1395 struct kvm_lapic *apic = vcpu->arch.apic;
1225 1396
1226 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 1397 if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
1227 return atomic_read(&lapic->lapic_timer.pending); 1398 apic_lvt_enabled(apic, APIC_LVTT))
1399 return atomic_read(&apic->lapic_timer.pending);
1228 1400
1229 return 0; 1401 return 0;
1230} 1402}
1231 1403
1232int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) 1404int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1233{ 1405{
1234 u32 reg = apic_get_reg(apic, lvt_type); 1406 u32 reg = kvm_apic_get_reg(apic, lvt_type);
1235 int vector, mode, trig_mode; 1407 int vector, mode, trig_mode;
1236 1408
1237 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { 1409 if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
1238 vector = reg & APIC_VECTOR_MASK; 1410 vector = reg & APIC_VECTOR_MASK;
1239 mode = reg & APIC_MODE_MASK; 1411 mode = reg & APIC_MODE_MASK;
1240 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1412 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1423,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
1251 kvm_apic_local_deliver(apic, APIC_LVT0); 1423 kvm_apic_local_deliver(apic, APIC_LVT0);
1252} 1424}
1253 1425
1254static struct kvm_timer_ops lapic_timer_ops = {
1255 .is_periodic = lapic_is_periodic,
1256};
1257
1258static const struct kvm_io_device_ops apic_mmio_ops = { 1426static const struct kvm_io_device_ops apic_mmio_ops = {
1259 .read = apic_mmio_read, 1427 .read = apic_mmio_read,
1260 .write = apic_mmio_write, 1428 .write = apic_mmio_write,
1261}; 1429};
1262 1430
1431static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
1432{
1433 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
1434 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
1435 struct kvm_vcpu *vcpu = apic->vcpu;
1436 wait_queue_head_t *q = &vcpu->wq;
1437
1438 /*
1439 * There is a race window between reading and incrementing, but we do
1440 * not care about potentially losing timer events in the !reinject
1441 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
1442 * in vcpu_enter_guest.
1443 */
1444 if (!atomic_read(&ktimer->pending)) {
1445 atomic_inc(&ktimer->pending);
1446 /* FIXME: this code should not know anything about vcpus */
1447 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1448 }
1449
1450 if (waitqueue_active(q))
1451 wake_up_interruptible(q);
1452
1453 if (lapic_is_periodic(apic)) {
1454 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
1455 return HRTIMER_RESTART;
1456 } else
1457 return HRTIMER_NORESTART;
1458}
1459
1263int kvm_create_lapic(struct kvm_vcpu *vcpu) 1460int kvm_create_lapic(struct kvm_vcpu *vcpu)
1264{ 1461{
1265 struct kvm_lapic *apic; 1462 struct kvm_lapic *apic;
@@ -1283,14 +1480,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1283 1480
1284 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1481 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1285 HRTIMER_MODE_ABS); 1482 HRTIMER_MODE_ABS);
1286 apic->lapic_timer.timer.function = kvm_timer_fn; 1483 apic->lapic_timer.timer.function = apic_timer_fn;
1287 apic->lapic_timer.t_ops = &lapic_timer_ops;
1288 apic->lapic_timer.kvm = vcpu->kvm;
1289 apic->lapic_timer.vcpu = vcpu;
1290 1484
1291 apic->base_address = APIC_DEFAULT_PHYS_BASE; 1485 /*
1292 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 1486 * APIC is created enabled. This will prevent kvm_lapic_set_base from
1487 * thinking that APIC satet has changed.
1488 */
1489 vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
1490 kvm_lapic_set_base(vcpu,
1491 APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
1293 1492
1493 static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
1294 kvm_lapic_reset(vcpu); 1494 kvm_lapic_reset(vcpu);
1295 kvm_iodevice_init(&apic->dev, &apic_mmio_ops); 1495 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
1296 1496
@@ -1306,23 +1506,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1306 struct kvm_lapic *apic = vcpu->arch.apic; 1506 struct kvm_lapic *apic = vcpu->arch.apic;
1307 int highest_irr; 1507 int highest_irr;
1308 1508
1309 if (!apic || !apic_enabled(apic)) 1509 if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
1310 return -1; 1510 return -1;
1311 1511
1312 apic_update_ppr(apic); 1512 apic_update_ppr(apic);
1313 highest_irr = apic_find_highest_irr(apic); 1513 highest_irr = apic_find_highest_irr(apic);
1314 if ((highest_irr == -1) || 1514 if ((highest_irr == -1) ||
1315 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) 1515 ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
1316 return -1; 1516 return -1;
1317 return highest_irr; 1517 return highest_irr;
1318} 1518}
1319 1519
1320int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1520int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1321{ 1521{
1322 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1522 u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1323 int r = 0; 1523 int r = 0;
1324 1524
1325 if (!apic_hw_enabled(vcpu->arch.apic)) 1525 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
1326 r = 1; 1526 r = 1;
1327 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1527 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1328 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1528 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1534,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1334{ 1534{
1335 struct kvm_lapic *apic = vcpu->arch.apic; 1535 struct kvm_lapic *apic = vcpu->arch.apic;
1336 1536
1337 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { 1537 if (!kvm_vcpu_has_lapic(vcpu))
1538 return;
1539
1540 if (atomic_read(&apic->lapic_timer.pending) > 0) {
1338 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1541 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1339 atomic_dec(&apic->lapic_timer.pending); 1542 atomic_dec(&apic->lapic_timer.pending);
1340 } 1543 }
@@ -1354,12 +1557,17 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1354 return vector; 1557 return vector;
1355} 1558}
1356 1559
1357void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1560void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1561 struct kvm_lapic_state *s)
1358{ 1562{
1359 struct kvm_lapic *apic = vcpu->arch.apic; 1563 struct kvm_lapic *apic = vcpu->arch.apic;
1360 1564
1361 apic->base_address = vcpu->arch.apic_base & 1565 kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
1362 MSR_IA32_APICBASE_BASE; 1566 /* set SPIV separately to get count of SW disabled APICs right */
1567 apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
1568 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1569 /* call kvm_apic_set_id() to put apic into apic_map */
1570 kvm_apic_set_id(apic, kvm_apic_id(apic));
1363 kvm_apic_set_version(vcpu); 1571 kvm_apic_set_version(vcpu);
1364 1572
1365 apic_update_ppr(apic); 1573 apic_update_ppr(apic);
@@ -1374,13 +1582,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1374 1582
1375void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1583void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1376{ 1584{
1377 struct kvm_lapic *apic = vcpu->arch.apic;
1378 struct hrtimer *timer; 1585 struct hrtimer *timer;
1379 1586
1380 if (!apic) 1587 if (!kvm_vcpu_has_lapic(vcpu))
1381 return; 1588 return;
1382 1589
1383 timer = &apic->lapic_timer.timer; 1590 timer = &vcpu->arch.apic->lapic_timer.timer;
1384 if (hrtimer_cancel(timer)) 1591 if (hrtimer_cancel(timer))
1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1592 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1386} 1593}
@@ -1478,7 +1685,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1685 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1479 return; 1686 return;
1480 1687
1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1688 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1482 max_irr = apic_find_highest_irr(apic); 1689 max_irr = apic_find_highest_irr(apic);
1483 if (max_irr < 0) 1690 if (max_irr < 0)
1484 max_irr = 0; 1691 max_irr = 0;
@@ -1537,7 +1744,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1537{ 1744{
1538 struct kvm_lapic *apic = vcpu->arch.apic; 1745 struct kvm_lapic *apic = vcpu->arch.apic;
1539 1746
1540 if (!irqchip_in_kernel(vcpu->kvm)) 1747 if (!kvm_vcpu_has_lapic(vcpu))
1541 return 1; 1748 return 1;
1542 1749
1543 /* if this is ICR write vector before command */ 1750 /* if this is ICR write vector before command */
@@ -1551,7 +1758,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1551 struct kvm_lapic *apic = vcpu->arch.apic; 1758 struct kvm_lapic *apic = vcpu->arch.apic;
1552 u32 low, high = 0; 1759 u32 low, high = 0;
1553 1760
1554 if (!irqchip_in_kernel(vcpu->kvm)) 1761 if (!kvm_vcpu_has_lapic(vcpu))
1555 return 1; 1762 return 1;
1556 1763
1557 if (apic_reg_read(apic, reg, 4, &low)) 1764 if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1783,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 1783 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr); 1784 addr);
1578} 1785}
1786
1787void kvm_lapic_init(void)
1788{
1789 /* do not patch jump label more than once per second */
1790 jump_label_rate_limit(&apic_hw_disabled, HZ);
1791 jump_label_rate_limit(&apic_sw_disabled, HZ);
1792}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4af5405ae1e2..e5ebf9f3571f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,10 +2,17 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
6 5
7#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
8 7
8struct kvm_timer {
9 struct hrtimer timer;
10 s64 period; /* unit: ns */
11 u32 timer_mode_mask;
12 u64 tscdeadline;
13 atomic_t pending; /* accumulated triggered timers */
14};
15
9struct kvm_lapic { 16struct kvm_lapic {
10 unsigned long base_address; 17 unsigned long base_address;
11 struct kvm_io_device dev; 18 struct kvm_io_device dev;
@@ -45,11 +52,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
45int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 52int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
46int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 53int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
47 54
55bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
56 struct kvm_lapic_irq *irq, int *r);
57
48u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 58u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
49void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 59void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
50void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 60void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
51int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 61 struct kvm_lapic_state *s);
52bool kvm_apic_present(struct kvm_vcpu *vcpu);
53int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 62int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
54 63
55u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +80,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
71} 80}
72 81
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); 82int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
83void kvm_lapic_init(void);
84
85static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
86{
87 return *((u32 *) (apic->regs + reg_off));
88}
89
90extern struct static_key kvm_no_apic_vcpu;
91
92static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
93{
94 if (static_key_false(&kvm_no_apic_vcpu))
95 return vcpu->arch.apic;
96 return true;
97}
98
99extern struct static_key_deferred apic_hw_disabled;
100
101static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
102{
103 if (static_key_false(&apic_hw_disabled.key))
104 return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
105 return MSR_IA32_APICBASE_ENABLE;
106}
107
108extern struct static_key_deferred apic_sw_disabled;
109
110static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
111{
112 if (static_key_false(&apic_sw_disabled.key))
113 return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
114 return APIC_SPIV_APIC_ENABLED;
115}
116
117static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
118{
119 return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
120}
121
122static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
123{
124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
125}
126
74#endif 127#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d273ea8..d289fee1ffb8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
556 return 0; 556 return 0;
557 557
558 pfn = spte_to_pfn(old_spte); 558 pfn = spte_to_pfn(old_spte);
559
560 /*
561 * KVM does not hold the refcount of the page used by
562 * kvm mmu, before reclaiming the page, we should
563 * unmap it from mmu first.
564 */
565 WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
566
559 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 567 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
560 kvm_set_pfn_accessed(pfn); 568 kvm_set_pfn_accessed(pfn);
561 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 569 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
960static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, 968static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
961 struct kvm_memory_slot *slot) 969 struct kvm_memory_slot *slot)
962{ 970{
963 struct kvm_lpage_info *linfo; 971 unsigned long idx;
964
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967 972
968 linfo = lpage_info_slot(gfn, slot, level); 973 idx = gfn_to_index(gfn, slot->base_gfn, level);
969 return &linfo->rmap_pde; 974 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
970} 975}
971 976
972/* 977/*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1173 unsigned long *rmapp; 1178 unsigned long *rmapp;
1174 1179
1175 while (mask) { 1180 while (mask) {
1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182 PT_PAGE_TABLE_LEVEL, slot);
1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1183 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1178 1184
1179 /* clear the first set bit */ 1185 /* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1200} 1206}
1201 1207
1202static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1208static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1203 unsigned long data) 1209 struct kvm_memory_slot *slot, unsigned long data)
1204{ 1210{
1205 u64 *sptep; 1211 u64 *sptep;
1206 struct rmap_iterator iter; 1212 struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1218} 1224}
1219 1225
1220static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1226static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1221 unsigned long data) 1227 struct kvm_memory_slot *slot, unsigned long data)
1222{ 1228{
1223 u64 *sptep; 1229 u64 *sptep;
1224 struct rmap_iterator iter; 1230 struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1259 return 0; 1265 return 0;
1260} 1266}
1261 1267
1262static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 1268static int kvm_handle_hva_range(struct kvm *kvm,
1263 unsigned long data, 1269 unsigned long start,
1264 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 1270 unsigned long end,
1265 unsigned long data)) 1271 unsigned long data,
1272 int (*handler)(struct kvm *kvm,
1273 unsigned long *rmapp,
1274 struct kvm_memory_slot *slot,
1275 unsigned long data))
1266{ 1276{
1267 int j; 1277 int j;
1268 int ret; 1278 int ret = 0;
1269 int retval = 0;
1270 struct kvm_memslots *slots; 1279 struct kvm_memslots *slots;
1271 struct kvm_memory_slot *memslot; 1280 struct kvm_memory_slot *memslot;
1272 1281
1273 slots = kvm_memslots(kvm); 1282 slots = kvm_memslots(kvm);
1274 1283
1275 kvm_for_each_memslot(memslot, slots) { 1284 kvm_for_each_memslot(memslot, slots) {
1276 unsigned long start = memslot->userspace_addr; 1285 unsigned long hva_start, hva_end;
1277 unsigned long end; 1286 gfn_t gfn_start, gfn_end;
1278 1287
1279 end = start + (memslot->npages << PAGE_SHIFT); 1288 hva_start = max(start, memslot->userspace_addr);
1280 if (hva >= start && hva < end) { 1289 hva_end = min(end, memslot->userspace_addr +
1281 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 1290 (memslot->npages << PAGE_SHIFT));
1282 gfn_t gfn = memslot->base_gfn + gfn_offset; 1291 if (hva_start >= hva_end)
1292 continue;
1293 /*
1294 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1295 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1296 */
1297 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1298 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1283 1299
1284 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 1300 for (j = PT_PAGE_TABLE_LEVEL;
1301 j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
1302 unsigned long idx, idx_end;
1303 unsigned long *rmapp;
1285 1304
1286 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 1305 /*
1287 struct kvm_lpage_info *linfo; 1306 * {idx(page_j) | page_j intersects with
1307 * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
1308 */
1309 idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
1310 idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
1288 1311
1289 linfo = lpage_info_slot(gfn, memslot, 1312 rmapp = __gfn_to_rmap(gfn_start, j, memslot);
1290 PT_DIRECTORY_LEVEL + j); 1313
1291 ret |= handler(kvm, &linfo->rmap_pde, data); 1314 for (; idx <= idx_end; ++idx)
1292 } 1315 ret |= handler(kvm, rmapp++, memslot, data);
1293 trace_kvm_age_page(hva, memslot, ret);
1294 retval |= ret;
1295 } 1316 }
1296 } 1317 }
1297 1318
1298 return retval; 1319 return ret;
1320}
1321
1322static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1323 unsigned long data,
1324 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
1325 struct kvm_memory_slot *slot,
1326 unsigned long data))
1327{
1328 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1299} 1329}
1300 1330
1301int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1331int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1303 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 1333 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
1304} 1334}
1305 1335
1336int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1337{
1338 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1339}
1340
1306void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1341void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1307{ 1342{
1308 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 1343 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1309} 1344}
1310 1345
1311static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1346static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1312 unsigned long data) 1347 struct kvm_memory_slot *slot, unsigned long data)
1313{ 1348{
1314 u64 *sptep; 1349 u64 *sptep;
1315 struct rmap_iterator uninitialized_var(iter); 1350 struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1323 * This has some overhead, but not as much as the cost of swapping 1358 * This has some overhead, but not as much as the cost of swapping
1324 * out actively used pages or breaking up actively used hugepages. 1359 * out actively used pages or breaking up actively used hugepages.
1325 */ 1360 */
1326 if (!shadow_accessed_mask) 1361 if (!shadow_accessed_mask) {
1327 return kvm_unmap_rmapp(kvm, rmapp, data); 1362 young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
1363 goto out;
1364 }
1328 1365
1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1366 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1330 sptep = rmap_get_next(&iter)) { 1367 sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1336 (unsigned long *)sptep); 1373 (unsigned long *)sptep);
1337 } 1374 }
1338 } 1375 }
1339 1376out:
1377 /* @data has hva passed to kvm_age_hva(). */
1378 trace_kvm_age_page(data, slot, young);
1340 return young; 1379 return young;
1341} 1380}
1342 1381
1343static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1382static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1344 unsigned long data) 1383 struct kvm_memory_slot *slot, unsigned long data)
1345{ 1384{
1346 u64 *sptep; 1385 u64 *sptep;
1347 struct rmap_iterator iter; 1386 struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1379 1418
1380 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 1419 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1381 1420
1382 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 1421 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
1383 kvm_flush_remote_tlbs(vcpu->kvm); 1422 kvm_flush_remote_tlbs(vcpu->kvm);
1384} 1423}
1385 1424
1386int kvm_age_hva(struct kvm *kvm, unsigned long hva) 1425int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1387{ 1426{
1388 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 1427 return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
1389} 1428}
1390 1429
1391int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1430int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2457 rmap_recycle(vcpu, sptep, gfn); 2496 rmap_recycle(vcpu, sptep, gfn);
2458 } 2497 }
2459 } 2498 }
2460 kvm_release_pfn_clean(pfn); 2499
2500 if (!is_error_pfn(pfn))
2501 kvm_release_pfn_clean(pfn);
2461} 2502}
2462 2503
2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2504static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2469 bool no_dirty_log) 2510 bool no_dirty_log)
2470{ 2511{
2471 struct kvm_memory_slot *slot; 2512 struct kvm_memory_slot *slot;
2472 unsigned long hva;
2473 2513
2474 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2514 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2475 if (!slot) { 2515 if (!slot)
2476 get_page(fault_page); 2516 return KVM_PFN_ERR_FAULT;
2477 return page_to_pfn(fault_page);
2478 }
2479 2517
2480 hva = gfn_to_hva_memslot(slot, gfn); 2518 return gfn_to_pfn_memslot_atomic(slot, gfn);
2481
2482 return hva_to_pfn_atomic(vcpu->kvm, hva);
2483} 2519}
2484 2520
2485static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2521static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2580,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2580 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2616 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2581 iterator.level - 1, 2617 iterator.level - 1,
2582 1, ACC_ALL, iterator.sptep); 2618 1, ACC_ALL, iterator.sptep);
2583 if (!sp) {
2584 pgprintk("nonpaging_map: ENOMEM\n");
2585 kvm_release_pfn_clean(pfn);
2586 return -ENOMEM;
2587 }
2588 2619
2589 mmu_spte_set(iterator.sptep, 2620 mmu_spte_set(iterator.sptep,
2590 __pa(sp->spt) 2621 __pa(sp->spt)
@@ -2611,8 +2642,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2611 2642
2612static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2643static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2613{ 2644{
2614 kvm_release_pfn_clean(pfn); 2645 /*
2615 if (is_hwpoison_pfn(pfn)) { 2646 * Do not cache the mmio info caused by writing the readonly gfn
2647 * into the spte, otherwise read access on readonly gfn can also
2648 * cause an mmio page fault and be treated as mmio access.
2649 * Return 1 to tell kvm to emulate it.
2650 */
2651 if (pfn == KVM_PFN_ERR_RO_FAULT)
2652 return 1;
2653
2654 if (pfn == KVM_PFN_ERR_HWPOISON) {
2616 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); 2655 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2617 return 0; 2656 return 0;
2618 } 2657 }
@@ -3236,8 +3275,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3236 if (!async) 3275 if (!async)
3237 return false; /* *pfn has correct page already */ 3276 return false; /* *pfn has correct page already */
3238 3277
3239 put_page(pfn_to_page(*pfn));
3240
3241 if (!prefault && can_do_async_pf(vcpu)) { 3278 if (!prefault && can_do_async_pf(vcpu)) {
3242 trace_kvm_try_async_get_page(gva, gfn); 3279 trace_kvm_try_async_get_page(gva, gfn);
3243 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3280 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
@@ -3371,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3371 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3408 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3372} 3409}
3373 3410
3411static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3412{
3413 unsigned mask;
3414
3415 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3416
3417 mask = (unsigned)~ACC_WRITE_MASK;
3418 /* Allow write access to dirty gptes */
3419 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3420 *access &= mask;
3421}
3422
3374static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, 3423static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3375 int *nr_present) 3424 int *nr_present)
3376{ 3425{
@@ -3388,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3388 return false; 3437 return false;
3389} 3438}
3390 3439
3440static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3441{
3442 unsigned access;
3443
3444 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3445 access &= ~(gpte >> PT64_NX_SHIFT);
3446
3447 return access;
3448}
3449
3450static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3451{
3452 unsigned index;
3453
3454 index = level - 1;
3455 index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
3456 return mmu->last_pte_bitmap & (1 << index);
3457}
3458
3391#define PTTYPE 64 3459#define PTTYPE 64
3392#include "paging_tmpl.h" 3460#include "paging_tmpl.h"
3393#undef PTTYPE 3461#undef PTTYPE
@@ -3457,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3457 } 3525 }
3458} 3526}
3459 3527
3528static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3529{
3530 unsigned bit, byte, pfec;
3531 u8 map;
3532 bool fault, x, w, u, wf, uf, ff, smep;
3533
3534 smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3535 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
3536 pfec = byte << 1;
3537 map = 0;
3538 wf = pfec & PFERR_WRITE_MASK;
3539 uf = pfec & PFERR_USER_MASK;
3540 ff = pfec & PFERR_FETCH_MASK;
3541 for (bit = 0; bit < 8; ++bit) {
3542 x = bit & ACC_EXEC_MASK;
3543 w = bit & ACC_WRITE_MASK;
3544 u = bit & ACC_USER_MASK;
3545
3546 /* Not really needed: !nx will cause pte.nx to fault */
3547 x |= !mmu->nx;
3548 /* Allow supervisor writes if !cr0.wp */
3549 w |= !is_write_protection(vcpu) && !uf;
3550 /* Disallow supervisor fetches of user code if cr4.smep */
3551 x &= !(smep && u && !uf);
3552
3553 fault = (ff && !x) || (uf && !u) || (wf && !w);
3554 map |= fault << bit;
3555 }
3556 mmu->permissions[byte] = map;
3557 }
3558}
3559
3560static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3561{
3562 u8 map;
3563 unsigned level, root_level = mmu->root_level;
3564 const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
3565
3566 if (root_level == PT32E_ROOT_LEVEL)
3567 --root_level;
3568 /* PT_PAGE_TABLE_LEVEL always terminates */
3569 map = 1 | (1 << ps_set_index);
3570 for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
3571 if (level <= PT_PDPE_LEVEL
3572 && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
3573 map |= 1 << (ps_set_index | (level - 1));
3574 }
3575 mmu->last_pte_bitmap = map;
3576}
3577
3460static int paging64_init_context_common(struct kvm_vcpu *vcpu, 3578static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3461 struct kvm_mmu *context, 3579 struct kvm_mmu *context,
3462 int level) 3580 int level)
@@ -3465,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3465 context->root_level = level; 3583 context->root_level = level;
3466 3584
3467 reset_rsvds_bits_mask(vcpu, context); 3585 reset_rsvds_bits_mask(vcpu, context);
3586 update_permission_bitmask(vcpu, context);
3587 update_last_pte_bitmap(vcpu, context);
3468 3588
3469 ASSERT(is_pae(vcpu)); 3589 ASSERT(is_pae(vcpu));
3470 context->new_cr3 = paging_new_cr3; 3590 context->new_cr3 = paging_new_cr3;
@@ -3493,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3493 context->root_level = PT32_ROOT_LEVEL; 3613 context->root_level = PT32_ROOT_LEVEL;
3494 3614
3495 reset_rsvds_bits_mask(vcpu, context); 3615 reset_rsvds_bits_mask(vcpu, context);
3616 update_permission_bitmask(vcpu, context);
3617 update_last_pte_bitmap(vcpu, context);
3496 3618
3497 context->new_cr3 = paging_new_cr3; 3619 context->new_cr3 = paging_new_cr3;
3498 context->page_fault = paging32_page_fault; 3620 context->page_fault = paging32_page_fault;
@@ -3553,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3553 context->gva_to_gpa = paging32_gva_to_gpa; 3675 context->gva_to_gpa = paging32_gva_to_gpa;
3554 } 3676 }
3555 3677
3678 update_permission_bitmask(vcpu, context);
3679 update_last_pte_bitmap(vcpu, context);
3680
3556 return 0; 3681 return 0;
3557} 3682}
3558 3683
@@ -3628,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3628 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3753 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3629 } 3754 }
3630 3755
3756 update_permission_bitmask(vcpu, g_context);
3757 update_last_pte_bitmap(vcpu, g_context);
3758
3631 return 0; 3759 return 0;
3632} 3760}
3633 3761
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e374db9af021..69871080e866 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,8 +18,10 @@
18#define PT_PCD_MASK (1ULL << 4) 18#define PT_PCD_MASK (1ULL << 4)
19#define PT_ACCESSED_SHIFT 5 19#define PT_ACCESSED_SHIFT 5
20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) 20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
21#define PT_DIRTY_MASK (1ULL << 6) 21#define PT_DIRTY_SHIFT 6
22#define PT_PAGE_SIZE_MASK (1ULL << 7) 22#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
23#define PT_PAGE_SIZE_SHIFT 7
24#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
23#define PT_PAT_MASK (1ULL << 7) 25#define PT_PAT_MASK (1ULL << 7)
24#define PT_GLOBAL_MASK (1ULL << 8) 26#define PT_GLOBAL_MASK (1ULL << 8)
25#define PT64_NX_SHIFT 63 27#define PT64_NX_SHIFT 63
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 90 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89} 91}
90 92
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu, 93/*
92 bool write_fault, bool user_fault, 94 * Will a fault with a given page-fault error code (pfec) cause a permission
93 unsigned long pte) 95 * fault with the given access (in ACC_* format)?
96 */
97static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
98 unsigned pfec)
94{ 99{
95 if (unlikely(write_fault && !is_writable_pte(pte) 100 return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103} 101}
102
104#endif 103#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 7d7d0b9e23eb..daff69e21150 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); 117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
118 118
119 if (is_error_pfn(pfn)) { 119 if (is_error_pfn(pfn))
120 kvm_release_pfn_clean(pfn);
121 return; 120 return;
122 }
123 121
124 hpa = pfn << PAGE_SHIFT; 122 hpa = pfn << PAGE_SHIFT;
125 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 123 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
190 188
191static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) 189static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 190{
193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 191 unsigned long *rmapp;
195 u64 *sptep; 192 u64 *sptep;
196 struct rmap_iterator iter; 193 struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
198 if (sp->role.direct || sp->unsync || sp->role.invalid) 195 if (sp->role.direct || sp->unsync || sp->role.invalid)
199 return; 196 return;
200 197
201 slot = gfn_to_memslot(kvm, sp->gfn); 198 rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
203 199
204 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 200 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
205 sptep = rmap_get_next(&iter)) { 201 sptep = rmap_get_next(&iter)) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae76..714e2c01a6fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -63,10 +63,12 @@
63 */ 63 */
64struct guest_walker { 64struct guest_walker {
65 int level; 65 int level;
66 unsigned max_level;
66 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
67 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
68 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; 69 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
70 unsigned pt_access; 72 unsigned pt_access;
71 unsigned pte_access; 73 unsigned pte_access;
72 gfn_t gfn; 74 gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 103 return (ret != orig_pte);
102} 104}
103 105
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, 106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
105 bool last) 107 struct kvm_mmu *mmu,
108 struct guest_walker *walker,
109 int write_fault)
106{ 110{
107 unsigned access; 111 unsigned level, index;
108 112 pt_element_t pte, orig_pte;
109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 113 pt_element_t __user *ptep_user;
110 if (last && !is_dirty_gpte(gpte)) 114 gfn_t table_gfn;
111 access &= ~ACC_WRITE_MASK; 115 int ret;
112 116
113#if PTTYPE == 64 117 for (level = walker->max_level; level >= walker->level; --level) {
114 if (vcpu->arch.mmu.nx) 118 pte = orig_pte = walker->ptes[level - 1];
115 access &= ~(gpte >> PT64_NX_SHIFT); 119 table_gfn = walker->table_gfn[level - 1];
116#endif 120 ptep_user = walker->ptep_user[level - 1];
117 return access; 121 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
118} 122 if (!(pte & PT_ACCESSED_MASK)) {
119 123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
120static bool FNAME(is_last_gpte)(struct guest_walker *walker, 124 pte |= PT_ACCESSED_MASK;
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 125 }
122 pt_element_t gpte) 126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
123{ 127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
124 if (walker->level == PT_PAGE_TABLE_LEVEL) 128 pte |= PT_DIRTY_MASK;
125 return true; 129 }
126 130 if (pte == orig_pte)
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && 131 continue;
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130 132
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && 133 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
132 (mmu->root_level == PT64_ROOT_LEVEL)) 134 if (ret)
133 return true; 135 return ret;
134 136
135 return false; 137 mark_page_dirty(vcpu->kvm, table_gfn);
138 walker->ptes[level] = pte;
139 }
140 return 0;
136} 141}
137 142
138/* 143/*
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
142 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
143 gva_t addr, u32 access) 148 gva_t addr, u32 access)
144{ 149{
150 int ret;
145 pt_element_t pte; 151 pt_element_t pte;
146 pt_element_t __user *uninitialized_var(ptep_user); 152 pt_element_t __user *uninitialized_var(ptep_user);
147 gfn_t table_gfn; 153 gfn_t table_gfn;
148 unsigned index, pt_access, uninitialized_var(pte_access); 154 unsigned index, pt_access, pte_access, accessed_dirty, shift;
149 gpa_t pte_gpa; 155 gpa_t pte_gpa;
150 bool eperm, last_gpte;
151 int offset; 156 int offset;
152 const int write_fault = access & PFERR_WRITE_MASK; 157 const int write_fault = access & PFERR_WRITE_MASK;
153 const int user_fault = access & PFERR_USER_MASK; 158 const int user_fault = access & PFERR_USER_MASK;
154 const int fetch_fault = access & PFERR_FETCH_MASK; 159 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 160 u16 errcode = 0;
161 gpa_t real_gpa;
162 gfn_t gfn;
156 163
157 trace_kvm_mmu_pagetable_walk(addr, access); 164 trace_kvm_mmu_pagetable_walk(addr, access);
158retry_walk: 165retry_walk:
159 eperm = false;
160 walker->level = mmu->root_level; 166 walker->level = mmu->root_level;
161 pte = mmu->get_cr3(vcpu); 167 pte = mmu->get_cr3(vcpu);
162 168
@@ -169,15 +175,21 @@ retry_walk:
169 --walker->level; 175 --walker->level;
170 } 176 }
171#endif 177#endif
178 walker->max_level = walker->level;
172 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
173 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
174 181
175 pt_access = ACC_ALL; 182 accessed_dirty = PT_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL;
184 ++walker->level;
176 185
177 for (;;) { 186 do {
178 gfn_t real_gfn; 187 gfn_t real_gfn;
179 unsigned long host_addr; 188 unsigned long host_addr;
180 189
190 pt_access &= pte_access;
191 --walker->level;
192
181 index = PT_INDEX(addr, walker->level); 193 index = PT_INDEX(addr, walker->level);
182 194
183 table_gfn = gpte_to_gfn(pte); 195 table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
199 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 211 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
200 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) 212 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
201 goto error; 213 goto error;
214 walker->ptep_user[walker->level - 1] = ptep_user;
202 215
203 trace_kvm_mmu_paging_element(pte, walker->level); 216 trace_kvm_mmu_paging_element(pte, walker->level);
204 217
@@ -211,92 +224,48 @@ retry_walk:
211 goto error; 224 goto error;
212 } 225 }
213 226
214 if (!check_write_user_access(vcpu, write_fault, user_fault, 227 accessed_dirty &= pte;
215 pte)) 228 pte_access = pt_access & gpte_access(vcpu, pte);
216 eperm = true;
217
218#if PTTYPE == 64
219 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
220 eperm = true;
221#endif
222
223 last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
224 if (last_gpte) {
225 pte_access = pt_access &
226 FNAME(gpte_access)(vcpu, pte, true);
227 /* check if the kernel is fetching from user page */
228 if (unlikely(pte_access & PT_USER_MASK) &&
229 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
230 if (fetch_fault && !user_fault)
231 eperm = true;
232 }
233
234 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
235 int ret;
236 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
237 sizeof(pte));
238 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
239 pte, pte|PT_ACCESSED_MASK);
240 if (unlikely(ret < 0))
241 goto error;
242 else if (ret)
243 goto retry_walk;
244
245 mark_page_dirty(vcpu->kvm, table_gfn);
246 pte |= PT_ACCESSED_MASK;
247 }
248 229
249 walker->ptes[walker->level - 1] = pte; 230 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte));
250 232
251 if (last_gpte) { 233 if (unlikely(permission_fault(mmu, pte_access, access))) {
252 int lvl = walker->level; 234 errcode |= PFERR_PRESENT_MASK;
253 gpa_t real_gpa; 235 goto error;
254 gfn_t gfn; 236 }
255 u32 ac;
256
257 gfn = gpte_to_gfn_lvl(pte, lvl);
258 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
259
260 if (PTTYPE == 32 &&
261 walker->level == PT_DIRECTORY_LEVEL &&
262 is_cpuid_PSE36())
263 gfn += pse36_gfn_delta(pte);
264
265 ac = write_fault | fetch_fault | user_fault;
266 237
267 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), 238 gfn = gpte_to_gfn_lvl(pte, walker->level);
268 ac); 239 gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
269 if (real_gpa == UNMAPPED_GVA)
270 return 0;
271 240
272 walker->gfn = real_gpa >> PAGE_SHIFT; 241 if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
242 gfn += pse36_gfn_delta(pte);
273 243
274 break; 244 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
275 } 245 if (real_gpa == UNMAPPED_GVA)
246 return 0;
276 247
277 pt_access &= FNAME(gpte_access)(vcpu, pte, false); 248 walker->gfn = real_gpa >> PAGE_SHIFT;
278 --walker->level;
279 }
280 249
281 if (unlikely(eperm)) { 250 if (!write_fault)
282 errcode |= PFERR_PRESENT_MASK; 251 protect_clean_gpte(&pte_access, pte);
283 goto error;
284 }
285 252
286 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 253 /*
287 int ret; 254 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
255 * place right.
256 *
257 * On a read fault, do nothing.
258 */
259 shift = write_fault >> ilog2(PFERR_WRITE_MASK);
260 shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
261 accessed_dirty &= pte >> shift;
288 262
289 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 263 if (unlikely(!accessed_dirty)) {
290 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 264 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
291 pte, pte|PT_DIRTY_MASK);
292 if (unlikely(ret < 0)) 265 if (unlikely(ret < 0))
293 goto error; 266 goto error;
294 else if (ret) 267 else if (ret)
295 goto retry_walk; 268 goto retry_walk;
296
297 mark_page_dirty(vcpu->kvm, table_gfn);
298 pte |= PT_DIRTY_MASK;
299 walker->ptes[walker->level - 1] = pte;
300 } 269 }
301 270
302 walker->pt_access = pt_access; 271 walker->pt_access = pt_access;
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
368 return; 337 return;
369 338
370 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 339 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
371 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); 340 pte_access = sp->role.access & gpte_access(vcpu, gpte);
341 protect_clean_gpte(&pte_access, gpte);
372 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 342 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
373 if (mmu_invalid_pfn(pfn)) { 343 if (mmu_invalid_pfn(pfn))
374 kvm_release_pfn_clean(pfn);
375 return; 344 return;
376 }
377 345
378 /* 346 /*
379 * we call mmu_set_spte() with host_writable = true because that 347 * we call mmu_set_spte() with host_writable = true because that
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
443 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 411 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
444 continue; 412 continue;
445 413
446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, 414 pte_access = sp->role.access & gpte_access(vcpu, gpte);
447 true); 415 protect_clean_gpte(&pte_access, gpte);
448 gfn = gpte_to_gfn(gpte); 416 gfn = gpte_to_gfn(gpte);
449 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 417 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
450 pte_access & ACC_WRITE_MASK); 418 pte_access & ACC_WRITE_MASK);
451 if (mmu_invalid_pfn(pfn)) { 419 if (mmu_invalid_pfn(pfn))
452 kvm_release_pfn_clean(pfn);
453 break; 420 break;
454 }
455 421
456 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 422 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
457 NULL, PT_PAGE_TABLE_LEVEL, gfn, 423 NULL, PT_PAGE_TABLE_LEVEL, gfn,
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
798 764
799 gfn = gpte_to_gfn(gpte); 765 gfn = gpte_to_gfn(gpte);
800 pte_access = sp->role.access; 766 pte_access = sp->role.access;
801 pte_access &= FNAME(gpte_access)(vcpu, gpte, true); 767 pte_access &= gpte_access(vcpu, gpte);
768 protect_clean_gpte(&pte_access, gpte);
802 769
803 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) 770 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
804 continue; 771 continue;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab0..cfc258a6bf97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Kernel-based Virtual Machine -- Performane Monitoring Unit support 2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 * 3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates. 4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 * 5 *
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c8..d017df3899ef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
163 163
164#define MSR_INVALID 0xffffffffU 164#define MSR_INVALID 0xffffffffU
165 165
166static struct svm_direct_access_msrs { 166static const struct svm_direct_access_msrs {
167 u32 index; /* Index of the MSR */ 167 u32 index; /* Index of the MSR */
168 bool always; /* True if intercept is always on */ 168 bool always; /* True if intercept is always on */
169} direct_access_msrs[] = { 169} direct_access_msrs[] = {
@@ -400,7 +400,7 @@ struct svm_init_data {
400 int r; 400 int r;
401}; 401};
402 402
403static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 403static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
404 404
405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
406#define MSRS_RANGE_SIZE 2048 406#define MSRS_RANGE_SIZE 2048
@@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1146 1146
1147 svm_set_efer(&svm->vcpu, 0); 1147 svm_set_efer(&svm->vcpu, 0);
1148 save->dr6 = 0xffff0ff0; 1148 save->dr6 = 0xffff0ff0;
1149 save->dr7 = 0x400;
1150 kvm_set_rflags(&svm->vcpu, 2); 1149 kvm_set_rflags(&svm->vcpu, 2);
1151 save->rip = 0x0000fff0; 1150 save->rip = 0x0000fff0;
1152 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1151 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1643 mark_dirty(svm->vmcb, VMCB_SEG); 1642 mark_dirty(svm->vmcb, VMCB_SEG);
1644} 1643}
1645 1644
1646static void update_db_intercept(struct kvm_vcpu *vcpu) 1645static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
1647{ 1646{
1648 struct vcpu_svm *svm = to_svm(vcpu); 1647 struct vcpu_svm *svm = to_svm(vcpu);
1649 1648
@@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1663 vcpu->guest_debug = 0; 1662 vcpu->guest_debug = 0;
1664} 1663}
1665 1664
1666static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1667{
1668 struct vcpu_svm *svm = to_svm(vcpu);
1669
1670 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1671 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1672 else
1673 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1674
1675 mark_dirty(svm->vmcb, VMCB_DR);
1676
1677 update_db_intercept(vcpu);
1678}
1679
1680static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1665static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1681{ 1666{
1682 if (sd->next_asid > sd->max_asid) { 1667 if (sd->next_asid > sd->max_asid) {
@@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm)
1748 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1733 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1749 svm->vmcb->save.rflags &= 1734 svm->vmcb->save.rflags &=
1750 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1735 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1751 update_db_intercept(&svm->vcpu); 1736 update_db_bp_intercept(&svm->vcpu);
1752 } 1737 }
1753 1738
1754 if (svm->vcpu.guest_debug & 1739 if (svm->vcpu.guest_debug &
@@ -2063,7 +2048,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
2063 if (svm->nested.intercept & 1ULL) { 2048 if (svm->nested.intercept & 1ULL) {
2064 /* 2049 /*
2065 * The #vmexit can't be emulated here directly because this 2050 * The #vmexit can't be emulated here directly because this
2066 * code path runs with irqs and preemtion disabled. A 2051 * code path runs with irqs and preemption disabled. A
2067 * #vmexit emulation might sleep. Only signal request for 2052 * #vmexit emulation might sleep. Only signal request for
2068 * the #vmexit here. 2053 * the #vmexit here.
2069 */ 2054 */
@@ -2105,7 +2090,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2105 return kmap(page); 2090 return kmap(page);
2106 2091
2107error: 2092error:
2108 kvm_release_page_clean(page);
2109 kvm_inject_gp(&svm->vcpu, 0); 2093 kvm_inject_gp(&svm->vcpu, 0);
2110 2094
2111 return NULL; 2095 return NULL;
@@ -2409,7 +2393,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2409{ 2393{
2410 /* 2394 /*
2411 * This function merges the msr permission bitmaps of kvm and the 2395 * This function merges the msr permission bitmaps of kvm and the
2412 * nested vmcb. It is omptimized in that it only merges the parts where 2396 * nested vmcb. It is optimized in that it only merges the parts where
2413 * the kvm msr permission bitmap may contain zero bits 2397 * the kvm msr permission bitmap may contain zero bits
2414 */ 2398 */
2415 int i; 2399 int i;
@@ -3268,7 +3252,7 @@ static int pause_interception(struct vcpu_svm *svm)
3268 return 1; 3252 return 1;
3269} 3253}
3270 3254
3271static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3255static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3272 [SVM_EXIT_READ_CR0] = cr_interception, 3256 [SVM_EXIT_READ_CR0] = cr_interception,
3273 [SVM_EXIT_READ_CR3] = cr_interception, 3257 [SVM_EXIT_READ_CR3] = cr_interception,
3274 [SVM_EXIT_READ_CR4] = cr_interception, 3258 [SVM_EXIT_READ_CR4] = cr_interception,
@@ -3660,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3660 */ 3644 */
3661 svm->nmi_singlestep = true; 3645 svm->nmi_singlestep = true;
3662 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3646 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3663 update_db_intercept(vcpu); 3647 update_db_bp_intercept(vcpu);
3664} 3648}
3665 3649
3666static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3650static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3783,12 +3767,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3783 svm_complete_interrupts(svm); 3767 svm_complete_interrupts(svm);
3784} 3768}
3785 3769
3786#ifdef CONFIG_X86_64
3787#define R "r"
3788#else
3789#define R "e"
3790#endif
3791
3792static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3770static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3793{ 3771{
3794 struct vcpu_svm *svm = to_svm(vcpu); 3772 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3815,13 +3793,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3815 local_irq_enable(); 3793 local_irq_enable();
3816 3794
3817 asm volatile ( 3795 asm volatile (
3818 "push %%"R"bp; \n\t" 3796 "push %%" _ASM_BP "; \n\t"
3819 "mov %c[rbx](%[svm]), %%"R"bx \n\t" 3797 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
3820 "mov %c[rcx](%[svm]), %%"R"cx \n\t" 3798 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
3821 "mov %c[rdx](%[svm]), %%"R"dx \n\t" 3799 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
3822 "mov %c[rsi](%[svm]), %%"R"si \n\t" 3800 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
3823 "mov %c[rdi](%[svm]), %%"R"di \n\t" 3801 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
3824 "mov %c[rbp](%[svm]), %%"R"bp \n\t" 3802 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
3825#ifdef CONFIG_X86_64 3803#ifdef CONFIG_X86_64
3826 "mov %c[r8](%[svm]), %%r8 \n\t" 3804 "mov %c[r8](%[svm]), %%r8 \n\t"
3827 "mov %c[r9](%[svm]), %%r9 \n\t" 3805 "mov %c[r9](%[svm]), %%r9 \n\t"
@@ -3834,20 +3812,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3834#endif 3812#endif
3835 3813
3836 /* Enter guest mode */ 3814 /* Enter guest mode */
3837 "push %%"R"ax \n\t" 3815 "push %%" _ASM_AX " \n\t"
3838 "mov %c[vmcb](%[svm]), %%"R"ax \n\t" 3816 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
3839 __ex(SVM_VMLOAD) "\n\t" 3817 __ex(SVM_VMLOAD) "\n\t"
3840 __ex(SVM_VMRUN) "\n\t" 3818 __ex(SVM_VMRUN) "\n\t"
3841 __ex(SVM_VMSAVE) "\n\t" 3819 __ex(SVM_VMSAVE) "\n\t"
3842 "pop %%"R"ax \n\t" 3820 "pop %%" _ASM_AX " \n\t"
3843 3821
3844 /* Save guest registers, load host registers */ 3822 /* Save guest registers, load host registers */
3845 "mov %%"R"bx, %c[rbx](%[svm]) \n\t" 3823 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
3846 "mov %%"R"cx, %c[rcx](%[svm]) \n\t" 3824 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
3847 "mov %%"R"dx, %c[rdx](%[svm]) \n\t" 3825 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
3848 "mov %%"R"si, %c[rsi](%[svm]) \n\t" 3826 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
3849 "mov %%"R"di, %c[rdi](%[svm]) \n\t" 3827 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
3850 "mov %%"R"bp, %c[rbp](%[svm]) \n\t" 3828 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
3851#ifdef CONFIG_X86_64 3829#ifdef CONFIG_X86_64
3852 "mov %%r8, %c[r8](%[svm]) \n\t" 3830 "mov %%r8, %c[r8](%[svm]) \n\t"
3853 "mov %%r9, %c[r9](%[svm]) \n\t" 3831 "mov %%r9, %c[r9](%[svm]) \n\t"
@@ -3858,7 +3836,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3858 "mov %%r14, %c[r14](%[svm]) \n\t" 3836 "mov %%r14, %c[r14](%[svm]) \n\t"
3859 "mov %%r15, %c[r15](%[svm]) \n\t" 3837 "mov %%r15, %c[r15](%[svm]) \n\t"
3860#endif 3838#endif
3861 "pop %%"R"bp" 3839 "pop %%" _ASM_BP
3862 : 3840 :
3863 : [svm]"a"(svm), 3841 : [svm]"a"(svm),
3864 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 3842 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -3879,9 +3857,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3879 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 3857 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3880#endif 3858#endif
3881 : "cc", "memory" 3859 : "cc", "memory"
3882 , R"bx", R"cx", R"dx", R"si", R"di"
3883#ifdef CONFIG_X86_64 3860#ifdef CONFIG_X86_64
3861 , "rbx", "rcx", "rdx", "rsi", "rdi"
3884 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 3862 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3863#else
3864 , "ebx", "ecx", "edx", "esi", "edi"
3885#endif 3865#endif
3886 ); 3866 );
3887 3867
@@ -3941,8 +3921,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3941 mark_all_clean(svm->vmcb); 3921 mark_all_clean(svm->vmcb);
3942} 3922}
3943 3923
3944#undef R
3945
3946static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3924static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3947{ 3925{
3948 struct vcpu_svm *svm = to_svm(vcpu); 3926 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4069,7 +4047,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4069#define POST_MEM(exit) { .exit_code = (exit), \ 4047#define POST_MEM(exit) { .exit_code = (exit), \
4070 .stage = X86_ICPT_POST_MEMACCESS, } 4048 .stage = X86_ICPT_POST_MEMACCESS, }
4071 4049
4072static struct __x86_intercept { 4050static const struct __x86_intercept {
4073 u32 exit_code; 4051 u32 exit_code;
4074 enum x86_intercept_stage stage; 4052 enum x86_intercept_stage stage;
4075} x86_intercept_map[] = { 4053} x86_intercept_map[] = {
@@ -4260,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4260 .vcpu_load = svm_vcpu_load, 4238 .vcpu_load = svm_vcpu_load,
4261 .vcpu_put = svm_vcpu_put, 4239 .vcpu_put = svm_vcpu_put,
4262 4240
4263 .set_guest_debug = svm_guest_debug, 4241 .update_db_bp_intercept = update_db_bp_intercept,
4264 .get_msr = svm_get_msr, 4242 .get_msr = svm_get_msr,
4265 .set_msr = svm_set_msr, 4243 .set_msr = svm_set_msr,
4266 .get_segment_base = svm_get_segment_base, 4244 .get_segment_base = svm_get_segment_base,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc647f34..000000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
15#include <linux/kvm_host.h>
16#include <linux/kvm.h>
17#include <linux/hrtimer.h>
18#include <linux/atomic.h>
19#include "kvm_timer.h"
20
21enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
22{
23 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
24 struct kvm_vcpu *vcpu = ktimer->vcpu;
25 wait_queue_head_t *q = &vcpu->wq;
26
27 /*
28 * There is a race window between reading and incrementing, but we do
29 * not care about potentially losing timer events in the !reinject
30 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
31 * in vcpu_enter_guest.
32 */
33 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
34 atomic_inc(&ktimer->pending);
35 /* FIXME: this code should not know anything about vcpus */
36 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
37 }
38
39 if (waitqueue_active(q))
40 wake_up_interruptible(q);
41
42 if (ktimer->t_ops->is_periodic(ktimer)) {
43 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
44 return HRTIMER_RESTART;
45 } else
46 return HRTIMER_NORESTART;
47}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 851aa7c3b890..ad6b1dd06f8b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO);
127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
128module_param(ple_window, int, S_IRUGO); 128module_param(ple_window, int, S_IRUGO);
129 129
130extern const ulong vmx_return;
131
130#define NR_AUTOLOAD_MSRS 8 132#define NR_AUTOLOAD_MSRS 8
131#define VMCS02_POOL_SIZE 1 133#define VMCS02_POOL_SIZE 1
132 134
@@ -405,16 +407,16 @@ struct vcpu_vmx {
405 struct { 407 struct {
406 int vm86_active; 408 int vm86_active;
407 ulong save_rflags; 409 ulong save_rflags;
410 struct kvm_segment segs[8];
411 } rmode;
412 struct {
413 u32 bitmask; /* 4 bits per segment (1 bit per field) */
408 struct kvm_save_segment { 414 struct kvm_save_segment {
409 u16 selector; 415 u16 selector;
410 unsigned long base; 416 unsigned long base;
411 u32 limit; 417 u32 limit;
412 u32 ar; 418 u32 ar;
413 } tr, es, ds, fs, gs; 419 } seg[8];
414 } rmode;
415 struct {
416 u32 bitmask; /* 4 bits per segment (1 bit per field) */
417 struct kvm_save_segment seg[8];
418 } segment_cache; 420 } segment_cache;
419 int vpid; 421 int vpid;
420 bool emulation_required; 422 bool emulation_required;
@@ -450,7 +452,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
450#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 452#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
451 [number##_HIGH] = VMCS12_OFFSET(name)+4 453 [number##_HIGH] = VMCS12_OFFSET(name)+4
452 454
453static unsigned short vmcs_field_to_offset_table[] = { 455static const unsigned short vmcs_field_to_offset_table[] = {
454 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 456 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
455 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 457 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
456 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 458 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
@@ -596,10 +598,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
596static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 598static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
597{ 599{
598 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); 600 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
599 if (is_error_page(page)) { 601 if (is_error_page(page))
600 kvm_release_page_clean(page);
601 return NULL; 602 return NULL;
602 } 603
603 return page; 604 return page;
604} 605}
605 606
@@ -667,7 +668,7 @@ static struct vmx_capability {
667 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 668 .ar_bytes = GUEST_##seg##_AR_BYTES, \
668 } 669 }
669 670
670static struct kvm_vmx_segment_field { 671static const struct kvm_vmx_segment_field {
671 unsigned selector; 672 unsigned selector;
672 unsigned base; 673 unsigned base;
673 unsigned limit; 674 unsigned limit;
@@ -1343,7 +1344,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1343 guest_efer = vmx->vcpu.arch.efer; 1344 guest_efer = vmx->vcpu.arch.efer;
1344 1345
1345 /* 1346 /*
1346 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 1347 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1347 * outside long mode 1348 * outside long mode
1348 */ 1349 */
1349 ignore_bits = EFER_NX | EFER_SCE; 1350 ignore_bits = EFER_NX | EFER_SCE;
@@ -1995,7 +1996,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
1995#endif 1996#endif
1996 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 1997 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1997 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 1998 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1998 CPU_BASED_RDPMC_EXITING | 1999 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
1999 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2000 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2000 /* 2001 /*
2001 * We can allow some features even when not supported by the 2002 * We can allow some features even when not supported by the
@@ -2291,16 +2292,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2291 } 2292 }
2292} 2293}
2293 2294
2294static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
2295{
2296 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
2297 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
2298 else
2299 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2300
2301 update_exception_bitmap(vcpu);
2302}
2303
2304static __init int cpu_has_kvm_support(void) 2295static __init int cpu_has_kvm_support(void)
2305{ 2296{
2306 return cpu_has_vmx(); 2297 return cpu_has_vmx();
@@ -2698,20 +2689,17 @@ static __exit void hardware_unsetup(void)
2698 free_kvm_area(); 2689 free_kvm_area();
2699} 2690}
2700 2691
2701static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) 2692static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
2702{ 2693{
2703 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2694 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2695 struct kvm_segment tmp = *save;
2704 2696
2705 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { 2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2706 vmcs_write16(sf->selector, save->selector); 2698 tmp.base = vmcs_readl(sf->base);
2707 vmcs_writel(sf->base, save->base); 2699 tmp.selector = vmcs_read16(sf->selector);
2708 vmcs_write32(sf->limit, save->limit); 2700 tmp.s = 1;
2709 vmcs_write32(sf->ar_bytes, save->ar);
2710 } else {
2711 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
2712 << AR_DPL_SHIFT;
2713 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
2714 } 2701 }
2702 vmx_set_segment(vcpu, &tmp, seg);
2715} 2703}
2716 2704
2717static void enter_pmode(struct kvm_vcpu *vcpu) 2705static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2724,10 +2712,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2724 2712
2725 vmx_segment_cache_clear(vmx); 2713 vmx_segment_cache_clear(vmx);
2726 2714
2727 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); 2715 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2728 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
2729 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
2730 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
2731 2716
2732 flags = vmcs_readl(GUEST_RFLAGS); 2717 flags = vmcs_readl(GUEST_RFLAGS);
2733 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2718 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2742,10 +2727,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2742 if (emulate_invalid_guest_state) 2727 if (emulate_invalid_guest_state)
2743 return; 2728 return;
2744 2729
2745 fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); 2730 fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2746 fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); 2731 fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2747 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 2732 fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2748 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 2733 fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2749 2734
2750 vmx_segment_cache_clear(vmx); 2735 vmx_segment_cache_clear(vmx);
2751 2736
@@ -2773,14 +2758,10 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2773 return kvm->arch.tss_addr; 2758 return kvm->arch.tss_addr;
2774} 2759}
2775 2760
2776static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 2761static void fix_rmode_seg(int seg, struct kvm_segment *save)
2777{ 2762{
2778 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2763 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2779 2764
2780 save->selector = vmcs_read16(sf->selector);
2781 save->base = vmcs_readl(sf->base);
2782 save->limit = vmcs_read32(sf->limit);
2783 save->ar = vmcs_read32(sf->ar_bytes);
2784 vmcs_write16(sf->selector, save->base >> 4); 2765 vmcs_write16(sf->selector, save->base >> 4);
2785 vmcs_write32(sf->base, save->base & 0xffff0); 2766 vmcs_write32(sf->base, save->base & 0xffff0);
2786 vmcs_write32(sf->limit, 0xffff); 2767 vmcs_write32(sf->limit, 0xffff);
@@ -2800,9 +2781,16 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2800 if (enable_unrestricted_guest) 2781 if (enable_unrestricted_guest)
2801 return; 2782 return;
2802 2783
2784 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2785 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2786 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2787 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2788 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2789
2803 vmx->emulation_required = 1; 2790 vmx->emulation_required = 1;
2804 vmx->rmode.vm86_active = 1; 2791 vmx->rmode.vm86_active = 1;
2805 2792
2793
2806 /* 2794 /*
2807 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2795 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2808 * vcpu. Call it here with phys address pointing 16M below 4G. 2796 * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2817,14 +2805,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2817 2805
2818 vmx_segment_cache_clear(vmx); 2806 vmx_segment_cache_clear(vmx);
2819 2807
2820 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
2821 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
2822 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 2808 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
2823
2824 vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
2825 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 2809 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2826
2827 vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
2828 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 2810 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2829 2811
2830 flags = vmcs_readl(GUEST_RFLAGS); 2812 flags = vmcs_readl(GUEST_RFLAGS);
@@ -3117,35 +3099,24 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3117 struct kvm_segment *var, int seg) 3099 struct kvm_segment *var, int seg)
3118{ 3100{
3119 struct vcpu_vmx *vmx = to_vmx(vcpu); 3101 struct vcpu_vmx *vmx = to_vmx(vcpu);
3120 struct kvm_save_segment *save;
3121 u32 ar; 3102 u32 ar;
3122 3103
3123 if (vmx->rmode.vm86_active 3104 if (vmx->rmode.vm86_active
3124 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES 3105 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3125 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS 3106 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3126 || seg == VCPU_SREG_GS) 3107 || seg == VCPU_SREG_GS)) {
3127 && !emulate_invalid_guest_state) { 3108 *var = vmx->rmode.segs[seg];
3128 switch (seg) {
3129 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
3130 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
3131 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
3132 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
3133 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
3134 default: BUG();
3135 }
3136 var->selector = save->selector;
3137 var->base = save->base;
3138 var->limit = save->limit;
3139 ar = save->ar;
3140 if (seg == VCPU_SREG_TR 3109 if (seg == VCPU_SREG_TR
3141 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3110 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3142 goto use_saved_rmode_seg; 3111 return;
3112 var->base = vmx_read_guest_seg_base(vmx, seg);
3113 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3114 return;
3143 } 3115 }
3144 var->base = vmx_read_guest_seg_base(vmx, seg); 3116 var->base = vmx_read_guest_seg_base(vmx, seg);
3145 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3117 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3146 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3118 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3147 ar = vmx_read_guest_seg_ar(vmx, seg); 3119 ar = vmx_read_guest_seg_ar(vmx, seg);
3148use_saved_rmode_seg:
3149 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 3120 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3150 ar = 0; 3121 ar = 0;
3151 var->type = ar & 15; 3122 var->type = ar & 15;
@@ -3227,23 +3198,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3227 struct kvm_segment *var, int seg) 3198 struct kvm_segment *var, int seg)
3228{ 3199{
3229 struct vcpu_vmx *vmx = to_vmx(vcpu); 3200 struct vcpu_vmx *vmx = to_vmx(vcpu);
3230 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3201 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3231 u32 ar; 3202 u32 ar;
3232 3203
3233 vmx_segment_cache_clear(vmx); 3204 vmx_segment_cache_clear(vmx);
3234 3205
3235 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3206 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
3236 vmcs_write16(sf->selector, var->selector); 3207 vmcs_write16(sf->selector, var->selector);
3237 vmx->rmode.tr.selector = var->selector; 3208 vmx->rmode.segs[VCPU_SREG_TR] = *var;
3238 vmx->rmode.tr.base = var->base;
3239 vmx->rmode.tr.limit = var->limit;
3240 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
3241 return; 3209 return;
3242 } 3210 }
3243 vmcs_writel(sf->base, var->base); 3211 vmcs_writel(sf->base, var->base);
3244 vmcs_write32(sf->limit, var->limit); 3212 vmcs_write32(sf->limit, var->limit);
3245 vmcs_write16(sf->selector, var->selector); 3213 vmcs_write16(sf->selector, var->selector);
3246 if (vmx->rmode.vm86_active && var->s) { 3214 if (vmx->rmode.vm86_active && var->s) {
3215 vmx->rmode.segs[seg] = *var;
3247 /* 3216 /*
3248 * Hack real-mode segments into vm86 compatibility. 3217 * Hack real-mode segments into vm86 compatibility.
3249 */ 3218 */
@@ -3258,7 +3227,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3258 * qemu binaries. 3227 * qemu binaries.
3259 * IA32 arch specifies that at the time of processor reset the 3228 * IA32 arch specifies that at the time of processor reset the
3260 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3229 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3261 * is setting it to 0 in the usedland code. This causes invalid guest 3230 * is setting it to 0 in the userland code. This causes invalid guest
3262 * state vmexit when "unrestricted guest" mode is turned on. 3231 * state vmexit when "unrestricted guest" mode is turned on.
3263 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3232 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3264 * tree. Newer qemu binaries with that qemu fix would not need this 3233 * tree. Newer qemu binaries with that qemu fix would not need this
@@ -3288,16 +3257,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3288 vmcs_readl(GUEST_CS_BASE) >> 4); 3257 vmcs_readl(GUEST_CS_BASE) >> 4);
3289 break; 3258 break;
3290 case VCPU_SREG_ES: 3259 case VCPU_SREG_ES:
3291 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3292 break;
3293 case VCPU_SREG_DS: 3260 case VCPU_SREG_DS:
3294 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3295 break;
3296 case VCPU_SREG_GS: 3261 case VCPU_SREG_GS:
3297 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3298 break;
3299 case VCPU_SREG_FS: 3262 case VCPU_SREG_FS:
3300 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 3263 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3301 break; 3264 break;
3302 case VCPU_SREG_SS: 3265 case VCPU_SREG_SS:
3303 vmcs_write16(GUEST_SS_SELECTOR, 3266 vmcs_write16(GUEST_SS_SELECTOR,
@@ -3351,9 +3314,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3351 3314
3352 if (var.base != (var.selector << 4)) 3315 if (var.base != (var.selector << 4))
3353 return false; 3316 return false;
3354 if (var.limit != 0xffff) 3317 if (var.limit < 0xffff)
3355 return false; 3318 return false;
3356 if (ar != 0xf3) 3319 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
3357 return false; 3320 return false;
3358 3321
3359 return true; 3322 return true;
@@ -3605,7 +3568,7 @@ out:
3605 3568
3606static void seg_setup(int seg) 3569static void seg_setup(int seg)
3607{ 3570{
3608 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3571 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3609 unsigned int ar; 3572 unsigned int ar;
3610 3573
3611 vmcs_write16(sf->selector, 0); 3574 vmcs_write16(sf->selector, 0);
@@ -3770,8 +3733,7 @@ static void vmx_set_constant_host_state(void)
3770 native_store_idt(&dt); 3733 native_store_idt(&dt);
3771 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 3734 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3772 3735
3773 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); 3736 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3774 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3775 3737
3776 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 3738 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3777 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 3739 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
@@ -4005,8 +3967,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4005 kvm_rip_write(vcpu, 0); 3967 kvm_rip_write(vcpu, 0);
4006 kvm_register_write(vcpu, VCPU_REGS_RSP, 0); 3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
4007 3969
4008 vmcs_writel(GUEST_DR7, 0x400);
4009
4010 vmcs_writel(GUEST_GDTR_BASE, 0); 3970 vmcs_writel(GUEST_GDTR_BASE, 0);
4011 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4012 3972
@@ -4456,7 +4416,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4456 hypercall[2] = 0xc1; 4416 hypercall[2] = 0xc1;
4457} 4417}
4458 4418
4459/* called to set cr0 as approriate for a mov-to-cr0 exit. */ 4419/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4460static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4420static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4461{ 4421{
4462 if (to_vmx(vcpu)->nested.vmxon && 4422 if (to_vmx(vcpu)->nested.vmxon &&
@@ -5701,7 +5661,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
5701 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5661 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5702 * to be done to userspace and return 0. 5662 * to be done to userspace and return 0.
5703 */ 5663 */
5704static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5664static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5705 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 5665 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
5706 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5666 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
5707 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5667 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -6229,17 +6189,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6229 msrs[i].host); 6189 msrs[i].host);
6230} 6190}
6231 6191
6232#ifdef CONFIG_X86_64
6233#define R "r"
6234#define Q "q"
6235#else
6236#define R "e"
6237#define Q "l"
6238#endif
6239
6240static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 6192static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6241{ 6193{
6242 struct vcpu_vmx *vmx = to_vmx(vcpu); 6194 struct vcpu_vmx *vmx = to_vmx(vcpu);
6195 unsigned long debugctlmsr;
6243 6196
6244 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { 6197 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6245 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6198 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6279,34 +6232,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6279 vmx_set_interrupt_shadow(vcpu, 0); 6232 vmx_set_interrupt_shadow(vcpu, 0);
6280 6233
6281 atomic_switch_perf_msrs(vmx); 6234 atomic_switch_perf_msrs(vmx);
6235 debugctlmsr = get_debugctlmsr();
6282 6236
6283 vmx->__launched = vmx->loaded_vmcs->launched; 6237 vmx->__launched = vmx->loaded_vmcs->launched;
6284 asm( 6238 asm(
6285 /* Store host registers */ 6239 /* Store host registers */
6286 "push %%"R"dx; push %%"R"bp;" 6240 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6287 "push %%"R"cx \n\t" /* placeholder for guest rcx */ 6241 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6288 "push %%"R"cx \n\t" 6242 "push %%" _ASM_CX " \n\t"
6289 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 6243 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6290 "je 1f \n\t" 6244 "je 1f \n\t"
6291 "mov %%"R"sp, %c[host_rsp](%0) \n\t" 6245 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6292 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 6246 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
6293 "1: \n\t" 6247 "1: \n\t"
6294 /* Reload cr2 if changed */ 6248 /* Reload cr2 if changed */
6295 "mov %c[cr2](%0), %%"R"ax \n\t" 6249 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
6296 "mov %%cr2, %%"R"dx \n\t" 6250 "mov %%cr2, %%" _ASM_DX " \n\t"
6297 "cmp %%"R"ax, %%"R"dx \n\t" 6251 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6298 "je 2f \n\t" 6252 "je 2f \n\t"
6299 "mov %%"R"ax, %%cr2 \n\t" 6253 "mov %%" _ASM_AX", %%cr2 \n\t"
6300 "2: \n\t" 6254 "2: \n\t"
6301 /* Check if vmlaunch of vmresume is needed */ 6255 /* Check if vmlaunch of vmresume is needed */
6302 "cmpl $0, %c[launched](%0) \n\t" 6256 "cmpl $0, %c[launched](%0) \n\t"
6303 /* Load guest registers. Don't clobber flags. */ 6257 /* Load guest registers. Don't clobber flags. */
6304 "mov %c[rax](%0), %%"R"ax \n\t" 6258 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
6305 "mov %c[rbx](%0), %%"R"bx \n\t" 6259 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
6306 "mov %c[rdx](%0), %%"R"dx \n\t" 6260 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
6307 "mov %c[rsi](%0), %%"R"si \n\t" 6261 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
6308 "mov %c[rdi](%0), %%"R"di \n\t" 6262 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
6309 "mov %c[rbp](%0), %%"R"bp \n\t" 6263 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
6310#ifdef CONFIG_X86_64 6264#ifdef CONFIG_X86_64
6311 "mov %c[r8](%0), %%r8 \n\t" 6265 "mov %c[r8](%0), %%r8 \n\t"
6312 "mov %c[r9](%0), %%r9 \n\t" 6266 "mov %c[r9](%0), %%r9 \n\t"
@@ -6317,24 +6271,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6317 "mov %c[r14](%0), %%r14 \n\t" 6271 "mov %c[r14](%0), %%r14 \n\t"
6318 "mov %c[r15](%0), %%r15 \n\t" 6272 "mov %c[r15](%0), %%r15 \n\t"
6319#endif 6273#endif
6320 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ 6274 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
6321 6275
6322 /* Enter guest mode */ 6276 /* Enter guest mode */
6323 "jne .Llaunched \n\t" 6277 "jne 1f \n\t"
6324 __ex(ASM_VMX_VMLAUNCH) "\n\t" 6278 __ex(ASM_VMX_VMLAUNCH) "\n\t"
6325 "jmp .Lkvm_vmx_return \n\t" 6279 "jmp 2f \n\t"
6326 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 6280 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
6327 ".Lkvm_vmx_return: " 6281 "2: "
6328 /* Save guest registers, load host registers, keep flags */ 6282 /* Save guest registers, load host registers, keep flags */
6329 "mov %0, %c[wordsize](%%"R"sp) \n\t" 6283 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
6330 "pop %0 \n\t" 6284 "pop %0 \n\t"
6331 "mov %%"R"ax, %c[rax](%0) \n\t" 6285 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
6332 "mov %%"R"bx, %c[rbx](%0) \n\t" 6286 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
6333 "pop"Q" %c[rcx](%0) \n\t" 6287 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
6334 "mov %%"R"dx, %c[rdx](%0) \n\t" 6288 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
6335 "mov %%"R"si, %c[rsi](%0) \n\t" 6289 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
6336 "mov %%"R"di, %c[rdi](%0) \n\t" 6290 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
6337 "mov %%"R"bp, %c[rbp](%0) \n\t" 6291 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
6338#ifdef CONFIG_X86_64 6292#ifdef CONFIG_X86_64
6339 "mov %%r8, %c[r8](%0) \n\t" 6293 "mov %%r8, %c[r8](%0) \n\t"
6340 "mov %%r9, %c[r9](%0) \n\t" 6294 "mov %%r9, %c[r9](%0) \n\t"
@@ -6345,11 +6299,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6345 "mov %%r14, %c[r14](%0) \n\t" 6299 "mov %%r14, %c[r14](%0) \n\t"
6346 "mov %%r15, %c[r15](%0) \n\t" 6300 "mov %%r15, %c[r15](%0) \n\t"
6347#endif 6301#endif
6348 "mov %%cr2, %%"R"ax \n\t" 6302 "mov %%cr2, %%" _ASM_AX " \n\t"
6349 "mov %%"R"ax, %c[cr2](%0) \n\t" 6303 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
6350 6304
6351 "pop %%"R"bp; pop %%"R"dx \n\t" 6305 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
6352 "setbe %c[fail](%0) \n\t" 6306 "setbe %c[fail](%0) \n\t"
6307 ".pushsection .rodata \n\t"
6308 ".global vmx_return \n\t"
6309 "vmx_return: " _ASM_PTR " 2b \n\t"
6310 ".popsection"
6353 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6311 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
6354 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 6312 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6355 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6313 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
@@ -6374,12 +6332,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6374 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 6332 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6375 [wordsize]"i"(sizeof(ulong)) 6333 [wordsize]"i"(sizeof(ulong))
6376 : "cc", "memory" 6334 : "cc", "memory"
6377 , R"ax", R"bx", R"di", R"si"
6378#ifdef CONFIG_X86_64 6335#ifdef CONFIG_X86_64
6336 , "rax", "rbx", "rdi", "rsi"
6379 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 6337 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6338#else
6339 , "eax", "ebx", "edi", "esi"
6380#endif 6340#endif
6381 ); 6341 );
6382 6342
6343 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6344 if (debugctlmsr)
6345 update_debugctlmsr(debugctlmsr);
6346
6383#ifndef CONFIG_X86_64 6347#ifndef CONFIG_X86_64
6384 /* 6348 /*
6385 * The sysexit path does not restore ds/es, so we must set them to 6349 * The sysexit path does not restore ds/es, so we must set them to
@@ -6424,9 +6388,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6424 vmx_complete_interrupts(vmx); 6388 vmx_complete_interrupts(vmx);
6425} 6389}
6426 6390
6427#undef R
6428#undef Q
6429
6430static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6391static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6431{ 6392{
6432 struct vcpu_vmx *vmx = to_vmx(vcpu); 6393 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7281,7 +7242,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7281 .vcpu_load = vmx_vcpu_load, 7242 .vcpu_load = vmx_vcpu_load,
7282 .vcpu_put = vmx_vcpu_put, 7243 .vcpu_put = vmx_vcpu_put,
7283 7244
7284 .set_guest_debug = set_guest_debug, 7245 .update_db_bp_intercept = update_exception_bitmap,
7285 .get_msr = vmx_get_msr, 7246 .get_msr = vmx_get_msr,
7286 .set_msr = vmx_set_msr, 7247 .set_msr = vmx_set_msr,
7287 .get_segment_base = vmx_get_segment_base, 7248 .get_segment_base = vmx_get_segment_base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f09552572fa..1eefebe5d727 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
246 246
247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
248{ 248{
249 if (irqchip_in_kernel(vcpu->kvm)) 249 return vcpu->arch.apic_base;
250 return vcpu->arch.apic_base;
251 else
252 return vcpu->arch.apic_base;
253} 250}
254EXPORT_SYMBOL_GPL(kvm_get_apic_base); 251EXPORT_SYMBOL_GPL(kvm_get_apic_base);
255 252
256void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 253void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257{ 254{
258 /* TODO: reserve bits check */ 255 /* TODO: reserve bits check */
259 if (irqchip_in_kernel(vcpu->kvm)) 256 kvm_lapic_set_base(vcpu, data);
260 kvm_lapic_set_base(vcpu, data);
261 else
262 vcpu->arch.apic_base = data;
263} 257}
264EXPORT_SYMBOL_GPL(kvm_set_apic_base); 258EXPORT_SYMBOL_GPL(kvm_set_apic_base);
265 259
@@ -698,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
698} 692}
699EXPORT_SYMBOL_GPL(kvm_get_cr8); 693EXPORT_SYMBOL_GPL(kvm_get_cr8);
700 694
695static void kvm_update_dr7(struct kvm_vcpu *vcpu)
696{
697 unsigned long dr7;
698
699 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
700 dr7 = vcpu->arch.guest_debug_dr7;
701 else
702 dr7 = vcpu->arch.dr7;
703 kvm_x86_ops->set_dr7(vcpu, dr7);
704 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
705}
706
701static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 707static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
702{ 708{
703 switch (dr) { 709 switch (dr) {
@@ -723,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
723 if (val & 0xffffffff00000000ULL) 729 if (val & 0xffffffff00000000ULL)
724 return -1; /* #GP */ 730 return -1; /* #GP */
725 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 731 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
726 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 732 kvm_update_dr7(vcpu);
727 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
728 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
729 }
730 break; 733 break;
731 } 734 }
732 735
@@ -823,7 +826,7 @@ static u32 msrs_to_save[] = {
823 826
824static unsigned num_msrs_to_save; 827static unsigned num_msrs_to_save;
825 828
826static u32 emulated_msrs[] = { 829static const u32 emulated_msrs[] = {
827 MSR_IA32_TSCDEADLINE, 830 MSR_IA32_TSCDEADLINE,
828 MSR_IA32_MISC_ENABLE, 831 MSR_IA32_MISC_ENABLE,
829 MSR_IA32_MCG_STATUS, 832 MSR_IA32_MCG_STATUS,
@@ -1097,7 +1100,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1097 * For each generation, we track the original measured 1100 * For each generation, we track the original measured
1098 * nanosecond time, offset, and write, so if TSCs are in 1101 * nanosecond time, offset, and write, so if TSCs are in
1099 * sync, we can match exact offset, and if not, we can match 1102 * sync, we can match exact offset, and if not, we can match
1100 * exact software computaion in compute_guest_tsc() 1103 * exact software computation in compute_guest_tsc()
1101 * 1104 *
1102 * These values are tracked in kvm->arch.cur_xxx variables. 1105 * These values are tracked in kvm->arch.cur_xxx variables.
1103 */ 1106 */
@@ -1140,6 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1140 unsigned long this_tsc_khz; 1143 unsigned long this_tsc_khz;
1141 s64 kernel_ns, max_kernel_ns; 1144 s64 kernel_ns, max_kernel_ns;
1142 u64 tsc_timestamp; 1145 u64 tsc_timestamp;
1146 u8 pvclock_flags;
1143 1147
1144 /* Keep irq disabled to prevent changes to the clock */ 1148 /* Keep irq disabled to prevent changes to the clock */
1145 local_irq_save(flags); 1149 local_irq_save(flags);
@@ -1221,7 +1225,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1221 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1222 vcpu->last_kernel_ns = kernel_ns; 1226 vcpu->last_kernel_ns = kernel_ns;
1223 vcpu->last_guest_tsc = tsc_timestamp; 1227 vcpu->last_guest_tsc = tsc_timestamp;
1224 vcpu->hv_clock.flags = 0; 1228
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1225 1236
1226 /* 1237 /*
1227 * The interface expects us to write an even number signaling that the 1238 * The interface expects us to write an even number signaling that the
@@ -1504,7 +1515,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1504{ 1515{
1505 gpa_t gpa = data & ~0x3f; 1516 gpa_t gpa = data & ~0x3f;
1506 1517
1507 /* Bits 2:5 are resrved, Should be zero */ 1518 /* Bits 2:5 are reserved, Should be zero */
1508 if (data & 0x3c) 1519 if (data & 0x3c)
1509 return 1; 1520 return 1;
1510 1521
@@ -1639,10 +1650,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1639 vcpu->arch.time_page = 1650 vcpu->arch.time_page =
1640 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1651 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1641 1652
1642 if (is_error_page(vcpu->arch.time_page)) { 1653 if (is_error_page(vcpu->arch.time_page))
1643 kvm_release_page_clean(vcpu->arch.time_page);
1644 vcpu->arch.time_page = NULL; 1654 vcpu->arch.time_page = NULL;
1645 } 1655
1646 break; 1656 break;
1647 } 1657 }
1648 case MSR_KVM_ASYNC_PF_EN: 1658 case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1737,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1727 * Ignore all writes to this no longer documented MSR. 1737 * Ignore all writes to this no longer documented MSR.
1728 * Writes are only relevant for old K7 processors, 1738 * Writes are only relevant for old K7 processors,
1729 * all pre-dating SVM, but a recommended workaround from 1739 * all pre-dating SVM, but a recommended workaround from
1730 * AMD for these chips. It is possible to speicify the 1740 * AMD for these chips. It is possible to specify the
1731 * affected processor models on the command line, hence 1741 * affected processor models on the command line, hence
1732 * the need to ignore the workaround. 1742 * the need to ignore the workaround.
1733 */ 1743 */
@@ -2177,6 +2187,8 @@ int kvm_dev_ioctl_check_extension(long ext)
2177 case KVM_CAP_GET_TSC_KHZ: 2187 case KVM_CAP_GET_TSC_KHZ:
2178 case KVM_CAP_PCI_2_3: 2188 case KVM_CAP_PCI_2_3:
2179 case KVM_CAP_KVMCLOCK_CTRL: 2189 case KVM_CAP_KVMCLOCK_CTRL:
2190 case KVM_CAP_READONLY_MEM:
2191 case KVM_CAP_IRQFD_RESAMPLE:
2180 r = 1; 2192 r = 1;
2181 break; 2193 break;
2182 case KVM_CAP_COALESCED_MMIO: 2194 case KVM_CAP_COALESCED_MMIO:
@@ -2358,8 +2370,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2358static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2370static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2359 struct kvm_lapic_state *s) 2371 struct kvm_lapic_state *s)
2360{ 2372{
2361 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2373 kvm_apic_post_state_restore(vcpu, s);
2362 kvm_apic_post_state_restore(vcpu);
2363 update_cr8_intercept(vcpu); 2374 update_cr8_intercept(vcpu);
2364 2375
2365 return 0; 2376 return 0;
@@ -2368,7 +2379,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2368static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2379static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2369 struct kvm_interrupt *irq) 2380 struct kvm_interrupt *irq)
2370{ 2381{
2371 if (irq->irq < 0 || irq->irq >= 256) 2382 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
2372 return -EINVAL; 2383 return -EINVAL;
2373 if (irqchip_in_kernel(vcpu->kvm)) 2384 if (irqchip_in_kernel(vcpu->kvm))
2374 return -ENXIO; 2385 return -ENXIO;
@@ -2635,11 +2646,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2635 */ 2646 */
2636static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 2647static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2637{ 2648{
2638 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2639 if (!vcpu->arch.time_page) 2649 if (!vcpu->arch.time_page)
2640 return -EINVAL; 2650 return -EINVAL;
2641 src->flags |= PVCLOCK_GUEST_STOPPED; 2651 vcpu->arch.pvclock_set_guest_stopped_request = true;
2642 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2643 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2652 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2644 return 0; 2653 return 0;
2645} 2654}
@@ -3090,7 +3099,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3090 if (!kvm->arch.vpit) 3099 if (!kvm->arch.vpit)
3091 return -ENXIO; 3100 return -ENXIO;
3092 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3101 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3093 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 3102 kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3094 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3103 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3095 return 0; 3104 return 0;
3096} 3105}
@@ -3173,6 +3182,16 @@ out:
3173 return r; 3182 return r;
3174} 3183}
3175 3184
3185int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
3186{
3187 if (!irqchip_in_kernel(kvm))
3188 return -ENXIO;
3189
3190 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3191 irq_event->irq, irq_event->level);
3192 return 0;
3193}
3194
3176long kvm_arch_vm_ioctl(struct file *filp, 3195long kvm_arch_vm_ioctl(struct file *filp,
3177 unsigned int ioctl, unsigned long arg) 3196 unsigned int ioctl, unsigned long arg)
3178{ 3197{
@@ -3279,29 +3298,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3279 create_pit_unlock: 3298 create_pit_unlock:
3280 mutex_unlock(&kvm->slots_lock); 3299 mutex_unlock(&kvm->slots_lock);
3281 break; 3300 break;
3282 case KVM_IRQ_LINE_STATUS:
3283 case KVM_IRQ_LINE: {
3284 struct kvm_irq_level irq_event;
3285
3286 r = -EFAULT;
3287 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3288 goto out;
3289 r = -ENXIO;
3290 if (irqchip_in_kernel(kvm)) {
3291 __s32 status;
3292 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3293 irq_event.irq, irq_event.level);
3294 if (ioctl == KVM_IRQ_LINE_STATUS) {
3295 r = -EFAULT;
3296 irq_event.status = status;
3297 if (copy_to_user(argp, &irq_event,
3298 sizeof irq_event))
3299 goto out;
3300 }
3301 r = 0;
3302 }
3303 break;
3304 }
3305 case KVM_GET_IRQCHIP: { 3301 case KVM_GET_IRQCHIP: {
3306 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3302 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3307 struct kvm_irqchip *chip; 3303 struct kvm_irqchip *chip;
@@ -3689,20 +3685,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
3689 gpa_t *gpa, struct x86_exception *exception, 3685 gpa_t *gpa, struct x86_exception *exception,
3690 bool write) 3686 bool write)
3691{ 3687{
3692 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
3689 | (write ? PFERR_WRITE_MASK : 0);
3693 3690
3694 if (vcpu_match_mmio_gva(vcpu, gva) && 3691 if (vcpu_match_mmio_gva(vcpu, gva)
3695 check_write_user_access(vcpu, write, access, 3692 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
3696 vcpu->arch.access)) {
3697 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 3693 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
3698 (gva & (PAGE_SIZE - 1)); 3694 (gva & (PAGE_SIZE - 1));
3699 trace_vcpu_match_mmio(gva, *gpa, write, false); 3695 trace_vcpu_match_mmio(gva, *gpa, write, false);
3700 return 1; 3696 return 1;
3701 } 3697 }
3702 3698
3703 if (write)
3704 access |= PFERR_WRITE_MASK;
3705
3706 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3699 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3707 3700
3708 if (*gpa == UNMAPPED_GVA) 3701 if (*gpa == UNMAPPED_GVA)
@@ -3790,14 +3783,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3790 return X86EMUL_CONTINUE; 3783 return X86EMUL_CONTINUE;
3791} 3784}
3792 3785
3793static struct read_write_emulator_ops read_emultor = { 3786static const struct read_write_emulator_ops read_emultor = {
3794 .read_write_prepare = read_prepare, 3787 .read_write_prepare = read_prepare,
3795 .read_write_emulate = read_emulate, 3788 .read_write_emulate = read_emulate,
3796 .read_write_mmio = vcpu_mmio_read, 3789 .read_write_mmio = vcpu_mmio_read,
3797 .read_write_exit_mmio = read_exit_mmio, 3790 .read_write_exit_mmio = read_exit_mmio,
3798}; 3791};
3799 3792
3800static struct read_write_emulator_ops write_emultor = { 3793static const struct read_write_emulator_ops write_emultor = {
3801 .read_write_emulate = write_emulate, 3794 .read_write_emulate = write_emulate,
3802 .read_write_mmio = write_mmio, 3795 .read_write_mmio = write_mmio,
3803 .read_write_exit_mmio = write_exit_mmio, 3796 .read_write_exit_mmio = write_exit_mmio,
@@ -3808,7 +3801,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3808 unsigned int bytes, 3801 unsigned int bytes,
3809 struct x86_exception *exception, 3802 struct x86_exception *exception,
3810 struct kvm_vcpu *vcpu, 3803 struct kvm_vcpu *vcpu,
3811 struct read_write_emulator_ops *ops) 3804 const struct read_write_emulator_ops *ops)
3812{ 3805{
3813 gpa_t gpa; 3806 gpa_t gpa;
3814 int handled, ret; 3807 int handled, ret;
@@ -3857,7 +3850,7 @@ mmio:
3857int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3850int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3858 void *val, unsigned int bytes, 3851 void *val, unsigned int bytes,
3859 struct x86_exception *exception, 3852 struct x86_exception *exception,
3860 struct read_write_emulator_ops *ops) 3853 const struct read_write_emulator_ops *ops)
3861{ 3854{
3862 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3855 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3863 gpa_t gpa; 3856 gpa_t gpa;
@@ -3962,10 +3955,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
3962 goto emul_write; 3955 goto emul_write;
3963 3956
3964 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3957 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3965 if (is_error_page(page)) { 3958 if (is_error_page(page))
3966 kvm_release_page_clean(page);
3967 goto emul_write; 3959 goto emul_write;
3968 }
3969 3960
3970 kaddr = kmap_atomic(page); 3961 kaddr = kmap_atomic(page);
3971 kaddr += offset_in_page(gpa); 3962 kaddr += offset_in_page(gpa);
@@ -4332,7 +4323,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4332 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 4323 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4333} 4324}
4334 4325
4335static struct x86_emulate_ops emulate_ops = { 4326static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4327{
4328 return kvm_register_read(emul_to_vcpu(ctxt), reg);
4329}
4330
4331static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4332{
4333 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4334}
4335
4336static const struct x86_emulate_ops emulate_ops = {
4337 .read_gpr = emulator_read_gpr,
4338 .write_gpr = emulator_write_gpr,
4336 .read_std = kvm_read_guest_virt_system, 4339 .read_std = kvm_read_guest_virt_system,
4337 .write_std = kvm_write_guest_virt_system, 4340 .write_std = kvm_write_guest_virt_system,
4338 .fetch = kvm_fetch_guest_virt, 4341 .fetch = kvm_fetch_guest_virt,
@@ -4367,14 +4370,6 @@ static struct x86_emulate_ops emulate_ops = {
4367 .get_cpuid = emulator_get_cpuid, 4370 .get_cpuid = emulator_get_cpuid,
4368}; 4371};
4369 4372
4370static void cache_all_regs(struct kvm_vcpu *vcpu)
4371{
4372 kvm_register_read(vcpu, VCPU_REGS_RAX);
4373 kvm_register_read(vcpu, VCPU_REGS_RSP);
4374 kvm_register_read(vcpu, VCPU_REGS_RIP);
4375 vcpu->arch.regs_dirty = ~0;
4376}
4377
4378static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4373static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4379{ 4374{
4380 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4375 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@ -4401,12 +4396,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4401 kvm_queue_exception(vcpu, ctxt->exception.vector); 4396 kvm_queue_exception(vcpu, ctxt->exception.vector);
4402} 4397}
4403 4398
4404static void init_decode_cache(struct x86_emulate_ctxt *ctxt, 4399static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4405 const unsigned long *regs)
4406{ 4400{
4407 memset(&ctxt->twobyte, 0, 4401 memset(&ctxt->twobyte, 0,
4408 (void *)&ctxt->regs - (void *)&ctxt->twobyte); 4402 (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
4409 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4410 4403
4411 ctxt->fetch.start = 0; 4404 ctxt->fetch.start = 0;
4412 ctxt->fetch.end = 0; 4405 ctxt->fetch.end = 0;
@@ -4421,14 +4414,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4421 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4414 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4422 int cs_db, cs_l; 4415 int cs_db, cs_l;
4423 4416
4424 /*
4425 * TODO: fix emulate.c to use guest_read/write_register
4426 * instead of direct ->regs accesses, can save hundred cycles
4427 * on Intel for instructions that don't read/change RSP, for
4428 * for example.
4429 */
4430 cache_all_regs(vcpu);
4431
4432 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4417 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4433 4418
4434 ctxt->eflags = kvm_get_rflags(vcpu); 4419 ctxt->eflags = kvm_get_rflags(vcpu);
@@ -4440,7 +4425,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4440 X86EMUL_MODE_PROT16; 4425 X86EMUL_MODE_PROT16;
4441 ctxt->guest_mode = is_guest_mode(vcpu); 4426 ctxt->guest_mode = is_guest_mode(vcpu);
4442 4427
4443 init_decode_cache(ctxt, vcpu->arch.regs); 4428 init_decode_cache(ctxt);
4444 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4429 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4445} 4430}
4446 4431
@@ -4460,7 +4445,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4460 return EMULATE_FAIL; 4445 return EMULATE_FAIL;
4461 4446
4462 ctxt->eip = ctxt->_eip; 4447 ctxt->eip = ctxt->_eip;
4463 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4464 kvm_rip_write(vcpu, ctxt->eip); 4448 kvm_rip_write(vcpu, ctxt->eip);
4465 kvm_set_rflags(vcpu, ctxt->eflags); 4449 kvm_set_rflags(vcpu, ctxt->eflags);
4466 4450
@@ -4493,13 +4477,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4493static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4477static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4494{ 4478{
4495 gpa_t gpa; 4479 gpa_t gpa;
4480 pfn_t pfn;
4496 4481
4497 if (tdp_enabled) 4482 if (tdp_enabled)
4498 return false; 4483 return false;
4499 4484
4500 /* 4485 /*
4501 * if emulation was due to access to shadowed page table 4486 * if emulation was due to access to shadowed page table
4502 * and it failed try to unshadow page and re-entetr the 4487 * and it failed try to unshadow page and re-enter the
4503 * guest to let CPU execute the instruction. 4488 * guest to let CPU execute the instruction.
4504 */ 4489 */
4505 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 4490 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4510,8 +4495,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4510 if (gpa == UNMAPPED_GVA) 4495 if (gpa == UNMAPPED_GVA)
4511 return true; /* let cpu generate fault */ 4496 return true; /* let cpu generate fault */
4512 4497
4513 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) 4498 /*
4499 * Do not retry the unhandleable instruction if it faults on the
4500 * readonly host memory, otherwise it will goto a infinite loop:
4501 * retry instruction -> write #PF -> emulation fail -> retry
4502 * instruction -> ...
4503 */
4504 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4505 if (!is_error_pfn(pfn)) {
4506 kvm_release_pfn_clean(pfn);
4514 return true; 4507 return true;
4508 }
4515 4509
4516 return false; 4510 return false;
4517} 4511}
@@ -4560,6 +4554,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4560 return true; 4554 return true;
4561} 4555}
4562 4556
4557static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4558static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4559
4563int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4560int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4564 unsigned long cr2, 4561 unsigned long cr2,
4565 int emulation_type, 4562 int emulation_type,
@@ -4608,7 +4605,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4608 changes registers values during IO operation */ 4605 changes registers values during IO operation */
4609 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4606 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4610 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4607 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4611 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); 4608 emulator_invalidate_register_cache(ctxt);
4612 } 4609 }
4613 4610
4614restart: 4611restart:
@@ -4630,13 +4627,16 @@ restart:
4630 } else if (vcpu->arch.pio.count) { 4627 } else if (vcpu->arch.pio.count) {
4631 if (!vcpu->arch.pio.in) 4628 if (!vcpu->arch.pio.in)
4632 vcpu->arch.pio.count = 0; 4629 vcpu->arch.pio.count = 0;
4633 else 4630 else {
4634 writeback = false; 4631 writeback = false;
4632 vcpu->arch.complete_userspace_io = complete_emulated_pio;
4633 }
4635 r = EMULATE_DO_MMIO; 4634 r = EMULATE_DO_MMIO;
4636 } else if (vcpu->mmio_needed) { 4635 } else if (vcpu->mmio_needed) {
4637 if (!vcpu->mmio_is_write) 4636 if (!vcpu->mmio_is_write)
4638 writeback = false; 4637 writeback = false;
4639 r = EMULATE_DO_MMIO; 4638 r = EMULATE_DO_MMIO;
4639 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
4640 } else if (r == EMULATION_RESTART) 4640 } else if (r == EMULATION_RESTART)
4641 goto restart; 4641 goto restart;
4642 else 4642 else
@@ -4646,7 +4646,6 @@ restart:
4646 toggle_interruptibility(vcpu, ctxt->interruptibility); 4646 toggle_interruptibility(vcpu, ctxt->interruptibility);
4647 kvm_set_rflags(vcpu, ctxt->eflags); 4647 kvm_set_rflags(vcpu, ctxt->eflags);
4648 kvm_make_request(KVM_REQ_EVENT, vcpu); 4648 kvm_make_request(KVM_REQ_EVENT, vcpu);
4649 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4650 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4649 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4651 kvm_rip_write(vcpu, ctxt->eip); 4650 kvm_rip_write(vcpu, ctxt->eip);
4652 } else 4651 } else
@@ -4929,6 +4928,7 @@ int kvm_arch_init(void *opaque)
4929 if (cpu_has_xsave) 4928 if (cpu_has_xsave)
4930 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4929 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4931 4930
4931 kvm_lapic_init();
4932 return 0; 4932 return 0;
4933 4933
4934out: 4934out:
@@ -5499,6 +5499,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5499 return r; 5499 return r;
5500} 5500}
5501 5501
5502static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
5503{
5504 int r;
5505 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5506 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5507 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5508 if (r != EMULATE_DONE)
5509 return 0;
5510 return 1;
5511}
5512
5513static int complete_emulated_pio(struct kvm_vcpu *vcpu)
5514{
5515 BUG_ON(!vcpu->arch.pio.count);
5516
5517 return complete_emulated_io(vcpu);
5518}
5519
5502/* 5520/*
5503 * Implements the following, as a state machine: 5521 * Implements the following, as a state machine:
5504 * 5522 *
@@ -5515,47 +5533,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5515 * copy data 5533 * copy data
5516 * exit 5534 * exit
5517 */ 5535 */
5518static int complete_mmio(struct kvm_vcpu *vcpu) 5536static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
5519{ 5537{
5520 struct kvm_run *run = vcpu->run; 5538 struct kvm_run *run = vcpu->run;
5521 struct kvm_mmio_fragment *frag; 5539 struct kvm_mmio_fragment *frag;
5522 int r;
5523 5540
5524 if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) 5541 BUG_ON(!vcpu->mmio_needed);
5525 return 1;
5526 5542
5527 if (vcpu->mmio_needed) { 5543 /* Complete previous fragment */
5528 /* Complete previous fragment */ 5544 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
5529 frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; 5545 if (!vcpu->mmio_is_write)
5530 if (!vcpu->mmio_is_write) 5546 memcpy(frag->data, run->mmio.data, frag->len);
5531 memcpy(frag->data, run->mmio.data, frag->len); 5547 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
5532 if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { 5548 vcpu->mmio_needed = 0;
5533 vcpu->mmio_needed = 0;
5534 if (vcpu->mmio_is_write)
5535 return 1;
5536 vcpu->mmio_read_completed = 1;
5537 goto done;
5538 }
5539 /* Initiate next fragment */
5540 ++frag;
5541 run->exit_reason = KVM_EXIT_MMIO;
5542 run->mmio.phys_addr = frag->gpa;
5543 if (vcpu->mmio_is_write) 5549 if (vcpu->mmio_is_write)
5544 memcpy(run->mmio.data, frag->data, frag->len); 5550 return 1;
5545 run->mmio.len = frag->len; 5551 vcpu->mmio_read_completed = 1;
5546 run->mmio.is_write = vcpu->mmio_is_write; 5552 return complete_emulated_io(vcpu);
5547 return 0; 5553 }
5548 5554 /* Initiate next fragment */
5549 } 5555 ++frag;
5550done: 5556 run->exit_reason = KVM_EXIT_MMIO;
5551 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 5557 run->mmio.phys_addr = frag->gpa;
5552 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); 5558 if (vcpu->mmio_is_write)
5553 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5559 memcpy(run->mmio.data, frag->data, frag->len);
5554 if (r != EMULATE_DONE) 5560 run->mmio.len = frag->len;
5555 return 0; 5561 run->mmio.is_write = vcpu->mmio_is_write;
5556 return 1; 5562 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5563 return 0;
5557} 5564}
5558 5565
5566
5559int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5567int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5560{ 5568{
5561 int r; 5569 int r;
@@ -5582,9 +5590,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5582 } 5590 }
5583 } 5591 }
5584 5592
5585 r = complete_mmio(vcpu); 5593 if (unlikely(vcpu->arch.complete_userspace_io)) {
5586 if (r <= 0) 5594 int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
5587 goto out; 5595 vcpu->arch.complete_userspace_io = NULL;
5596 r = cui(vcpu);
5597 if (r <= 0)
5598 goto out;
5599 } else
5600 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
5588 5601
5589 r = __vcpu_run(vcpu); 5602 r = __vcpu_run(vcpu);
5590 5603
@@ -5602,12 +5615,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5602 /* 5615 /*
5603 * We are here if userspace calls get_regs() in the middle of 5616 * We are here if userspace calls get_regs() in the middle of
5604 * instruction emulation. Registers state needs to be copied 5617 * instruction emulation. Registers state needs to be copied
5605 * back from emulation context to vcpu. Usrapace shouldn't do 5618 * back from emulation context to vcpu. Userspace shouldn't do
5606 * that usually, but some bad designed PV devices (vmware 5619 * that usually, but some bad designed PV devices (vmware
5607 * backdoor interface) need this to work 5620 * backdoor interface) need this to work
5608 */ 5621 */
5609 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 5622 emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
5610 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5611 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 5623 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5612 } 5624 }
5613 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5625 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5747,7 +5759,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
5747 if (ret) 5759 if (ret)
5748 return EMULATE_FAIL; 5760 return EMULATE_FAIL;
5749 5761
5750 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5751 kvm_rip_write(vcpu, ctxt->eip); 5762 kvm_rip_write(vcpu, ctxt->eip);
5752 kvm_set_rflags(vcpu, ctxt->eflags); 5763 kvm_set_rflags(vcpu, ctxt->eflags);
5753 kvm_make_request(KVM_REQ_EVENT, vcpu); 5764 kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5799,7 +5810,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5799 if (mmu_reset_needed) 5810 if (mmu_reset_needed)
5800 kvm_mmu_reset_context(vcpu); 5811 kvm_mmu_reset_context(vcpu);
5801 5812
5802 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 5813 max_bits = KVM_NR_INTERRUPTS;
5803 pending_vec = find_first_bit( 5814 pending_vec = find_first_bit(
5804 (const unsigned long *)sregs->interrupt_bitmap, max_bits); 5815 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
5805 if (pending_vec < max_bits) { 5816 if (pending_vec < max_bits) {
@@ -5859,13 +5870,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5859 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5870 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5860 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5871 for (i = 0; i < KVM_NR_DB_REGS; ++i)
5861 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5872 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
5862 vcpu->arch.switch_db_regs = 5873 vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
5863 (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
5864 } else { 5874 } else {
5865 for (i = 0; i < KVM_NR_DB_REGS; i++) 5875 for (i = 0; i < KVM_NR_DB_REGS; i++)
5866 vcpu->arch.eff_db[i] = vcpu->arch.db[i]; 5876 vcpu->arch.eff_db[i] = vcpu->arch.db[i];
5867 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5868 } 5877 }
5878 kvm_update_dr7(vcpu);
5869 5879
5870 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5880 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5871 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + 5881 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@ -5877,7 +5887,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5877 */ 5887 */
5878 kvm_set_rflags(vcpu, rflags); 5888 kvm_set_rflags(vcpu, rflags);
5879 5889
5880 kvm_x86_ops->set_guest_debug(vcpu, dbg); 5890 kvm_x86_ops->update_db_bp_intercept(vcpu);
5881 5891
5882 r = 0; 5892 r = 0;
5883 5893
@@ -6023,7 +6033,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6023 int r; 6033 int r;
6024 6034
6025 vcpu->arch.mtrr_state.have_fixed = 1; 6035 vcpu->arch.mtrr_state.have_fixed = 1;
6026 vcpu_load(vcpu); 6036 r = vcpu_load(vcpu);
6037 if (r)
6038 return r;
6027 r = kvm_arch_vcpu_reset(vcpu); 6039 r = kvm_arch_vcpu_reset(vcpu);
6028 if (r == 0) 6040 if (r == 0)
6029 r = kvm_mmu_setup(vcpu); 6041 r = kvm_mmu_setup(vcpu);
@@ -6034,9 +6046,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6034 6046
6035void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6047void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6036{ 6048{
6049 int r;
6037 vcpu->arch.apf.msr_val = 0; 6050 vcpu->arch.apf.msr_val = 0;
6038 6051
6039 vcpu_load(vcpu); 6052 r = vcpu_load(vcpu);
6053 BUG_ON(r);
6040 kvm_mmu_unload(vcpu); 6054 kvm_mmu_unload(vcpu);
6041 vcpu_put(vcpu); 6055 vcpu_put(vcpu);
6042 6056
@@ -6050,10 +6064,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6050 vcpu->arch.nmi_pending = 0; 6064 vcpu->arch.nmi_pending = 0;
6051 vcpu->arch.nmi_injected = false; 6065 vcpu->arch.nmi_injected = false;
6052 6066
6053 vcpu->arch.switch_db_regs = 0;
6054 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); 6067 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6055 vcpu->arch.dr6 = DR6_FIXED_1; 6068 vcpu->arch.dr6 = DR6_FIXED_1;
6056 vcpu->arch.dr7 = DR7_FIXED_1; 6069 vcpu->arch.dr7 = DR7_FIXED_1;
6070 kvm_update_dr7(vcpu);
6057 6071
6058 kvm_make_request(KVM_REQ_EVENT, vcpu); 6072 kvm_make_request(KVM_REQ_EVENT, vcpu);
6059 vcpu->arch.apf.msr_val = 0; 6073 vcpu->arch.apf.msr_val = 0;
@@ -6132,7 +6146,7 @@ int kvm_arch_hardware_enable(void *garbage)
6132 * as we reset last_host_tsc on all VCPUs to stop this from being 6146 * as we reset last_host_tsc on all VCPUs to stop this from being
6133 * called multiple times (one for each physical CPU bringup). 6147 * called multiple times (one for each physical CPU bringup).
6134 * 6148 *
6135 * Platforms with unnreliable TSCs don't have to deal with this, they 6149 * Platforms with unreliable TSCs don't have to deal with this, they
6136 * will be compensated by the logic in vcpu_load, which sets the TSC to 6150 * will be compensated by the logic in vcpu_load, which sets the TSC to
6137 * catchup mode. This will catchup all VCPUs to real time, but cannot 6151 * catchup mode. This will catchup all VCPUs to real time, but cannot
6138 * guarantee that they stay in perfect synchronization. 6152 * guarantee that they stay in perfect synchronization.
@@ -6185,6 +6199,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
6185 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); 6199 return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
6186} 6200}
6187 6201
6202struct static_key kvm_no_apic_vcpu __read_mostly;
6203
6188int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 6204int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6189{ 6205{
6190 struct page *page; 6206 struct page *page;
@@ -6217,7 +6233,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6217 r = kvm_create_lapic(vcpu); 6233 r = kvm_create_lapic(vcpu);
6218 if (r < 0) 6234 if (r < 0)
6219 goto fail_mmu_destroy; 6235 goto fail_mmu_destroy;
6220 } 6236 } else
6237 static_key_slow_inc(&kvm_no_apic_vcpu);
6221 6238
6222 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, 6239 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6223 GFP_KERNEL); 6240 GFP_KERNEL);
@@ -6257,6 +6274,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
6257 kvm_mmu_destroy(vcpu); 6274 kvm_mmu_destroy(vcpu);
6258 srcu_read_unlock(&vcpu->kvm->srcu, idx); 6275 srcu_read_unlock(&vcpu->kvm->srcu, idx);
6259 free_page((unsigned long)vcpu->arch.pio_data); 6276 free_page((unsigned long)vcpu->arch.pio_data);
6277 if (!irqchip_in_kernel(vcpu->kvm))
6278 static_key_slow_dec(&kvm_no_apic_vcpu);
6260} 6279}
6261 6280
6262int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) 6281int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6269,15 +6288,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6269 6288
6270 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6289 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6271 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6290 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
6291 /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
6292 set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
6293 &kvm->arch.irq_sources_bitmap);
6272 6294
6273 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6295 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
6296 mutex_init(&kvm->arch.apic_map_lock);
6274 6297
6275 return 0; 6298 return 0;
6276} 6299}
6277 6300
6278static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6301static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
6279{ 6302{
6280 vcpu_load(vcpu); 6303 int r;
6304 r = vcpu_load(vcpu);
6305 BUG_ON(r);
6281 kvm_mmu_unload(vcpu); 6306 kvm_mmu_unload(vcpu);
6282 vcpu_put(vcpu); 6307 vcpu_put(vcpu);
6283} 6308}
@@ -6321,6 +6346,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
6321 put_page(kvm->arch.apic_access_page); 6346 put_page(kvm->arch.apic_access_page);
6322 if (kvm->arch.ept_identity_pagetable) 6347 if (kvm->arch.ept_identity_pagetable)
6323 put_page(kvm->arch.ept_identity_pagetable); 6348 put_page(kvm->arch.ept_identity_pagetable);
6349 kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
6324} 6350}
6325 6351
6326void kvm_arch_free_memslot(struct kvm_memory_slot *free, 6352void kvm_arch_free_memslot(struct kvm_memory_slot *free,
@@ -6328,10 +6354,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
6328{ 6354{
6329 int i; 6355 int i;
6330 6356
6331 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6357 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6332 if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { 6358 if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
6333 kvm_kvfree(free->arch.lpage_info[i]); 6359 kvm_kvfree(free->arch.rmap[i]);
6334 free->arch.lpage_info[i] = NULL; 6360 free->arch.rmap[i] = NULL;
6361 }
6362 if (i == 0)
6363 continue;
6364
6365 if (!dont || free->arch.lpage_info[i - 1] !=
6366 dont->arch.lpage_info[i - 1]) {
6367 kvm_kvfree(free->arch.lpage_info[i - 1]);
6368 free->arch.lpage_info[i - 1] = NULL;
6335 } 6369 }
6336 } 6370 }
6337} 6371}
@@ -6340,23 +6374,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6340{ 6374{
6341 int i; 6375 int i;
6342 6376
6343 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6377 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6344 unsigned long ugfn; 6378 unsigned long ugfn;
6345 int lpages; 6379 int lpages;
6346 int level = i + 2; 6380 int level = i + 1;
6347 6381
6348 lpages = gfn_to_index(slot->base_gfn + npages - 1, 6382 lpages = gfn_to_index(slot->base_gfn + npages - 1,
6349 slot->base_gfn, level) + 1; 6383 slot->base_gfn, level) + 1;
6350 6384
6351 slot->arch.lpage_info[i] = 6385 slot->arch.rmap[i] =
6352 kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); 6386 kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
6353 if (!slot->arch.lpage_info[i]) 6387 if (!slot->arch.rmap[i])
6388 goto out_free;
6389 if (i == 0)
6390 continue;
6391
6392 slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
6393 sizeof(*slot->arch.lpage_info[i - 1]));
6394 if (!slot->arch.lpage_info[i - 1])
6354 goto out_free; 6395 goto out_free;
6355 6396
6356 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 6397 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
6357 slot->arch.lpage_info[i][0].write_count = 1; 6398 slot->arch.lpage_info[i - 1][0].write_count = 1;
6358 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 6399 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
6359 slot->arch.lpage_info[i][lpages - 1].write_count = 1; 6400 slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
6360 ugfn = slot->userspace_addr >> PAGE_SHIFT; 6401 ugfn = slot->userspace_addr >> PAGE_SHIFT;
6361 /* 6402 /*
6362 * If the gfn and userspace address are not aligned wrt each 6403 * If the gfn and userspace address are not aligned wrt each
@@ -6368,16 +6409,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
6368 unsigned long j; 6409 unsigned long j;
6369 6410
6370 for (j = 0; j < lpages; ++j) 6411 for (j = 0; j < lpages; ++j)
6371 slot->arch.lpage_info[i][j].write_count = 1; 6412 slot->arch.lpage_info[i - 1][j].write_count = 1;
6372 } 6413 }
6373 } 6414 }
6374 6415
6375 return 0; 6416 return 0;
6376 6417
6377out_free: 6418out_free:
6378 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 6419 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
6379 kvm_kvfree(slot->arch.lpage_info[i]); 6420 kvm_kvfree(slot->arch.rmap[i]);
6380 slot->arch.lpage_info[i] = NULL; 6421 slot->arch.rmap[i] = NULL;
6422 if (i == 0)
6423 continue;
6424
6425 kvm_kvfree(slot->arch.lpage_info[i - 1]);
6426 slot->arch.lpage_info[i - 1] = NULL;
6381 } 6427 }
6382 return -ENOMEM; 6428 return -ENOMEM;
6383} 6429}
@@ -6396,10 +6442,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
6396 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6442 map_flags = MAP_SHARED | MAP_ANONYMOUS;
6397 6443
6398 /*To keep backward compatibility with older userspace, 6444 /*To keep backward compatibility with older userspace,
6399 *x86 needs to hanlde !user_alloc case. 6445 *x86 needs to handle !user_alloc case.
6400 */ 6446 */
6401 if (!user_alloc) { 6447 if (!user_alloc) {
6402 if (npages && !old.rmap) { 6448 if (npages && !old.npages) {
6403 unsigned long userspace_addr; 6449 unsigned long userspace_addr;
6404 6450
6405 userspace_addr = vm_mmap(NULL, 0, 6451 userspace_addr = vm_mmap(NULL, 0,
@@ -6427,7 +6473,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6427 6473
6428 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6474 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6429 6475
6430 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6476 if (!user_alloc && !old.user_alloc && old.npages && !npages) {
6431 int ret; 6477 int ret;
6432 6478
6433 ret = vm_munmap(old.userspace_addr, 6479 ret = vm_munmap(old.userspace_addr,
@@ -6446,14 +6492,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6446 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6492 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6447 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6493 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6448 spin_unlock(&kvm->mmu_lock); 6494 spin_unlock(&kvm->mmu_lock);
6495 /*
6496 * If memory slot is created, or moved, we need to clear all
6497 * mmio sptes.
6498 */
6499 if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
6500 kvm_mmu_zap_all(kvm);
6501 kvm_reload_remote_mmus(kvm);
6502 }
6449} 6503}
6450 6504
6451void kvm_arch_flush_shadow(struct kvm *kvm) 6505void kvm_arch_flush_shadow_all(struct kvm *kvm)
6452{ 6506{
6453 kvm_mmu_zap_all(kvm); 6507 kvm_mmu_zap_all(kvm);
6454 kvm_reload_remote_mmus(kvm); 6508 kvm_reload_remote_mmus(kvm);
6455} 6509}
6456 6510
6511void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
6512 struct kvm_memory_slot *slot)
6513{
6514 kvm_arch_flush_shadow_all(kvm);
6515}
6516
6457int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6517int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6458{ 6518{
6459 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 6519 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3d1134ddb885..2b5219c12ac8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
124 124
125extern u64 host_xcr0; 125extern u64 host_xcr0;
126 126
127extern struct static_key kvm_no_apic_vcpu;
127#endif 128#endif