author     Linus Torvalds <torvalds@linux-foundation.org>   2012-10-04 12:30:33 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-10-04 12:30:33 -0400
commit     ecefbd94b834fa32559d854646d777c56749ef1c (patch)
tree       ca8958900ad9e208a8e5fb7704f1b66dc76131b4 /arch/x86/kvm
parent     ce57e981f2b996aaca2031003b3f866368307766 (diff)
parent     3d11df7abbff013b811d5615320580cd5d9d7d31 (diff)
Merge tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
 "Highlights of the changes for this release include support for vfio
  level triggered interrupts, improved big real mode support on older
  Intels, a streamlined guest page table walker, guest APIC speedups,
  PIO optimizations, better overcommit handling, and read-only memory."

* tag 'kvm-3.7-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
  KVM: s390: Fix vcpu_load handling in interrupt code
  KVM: x86: Fix guest debug across vcpu INIT reset
  KVM: Add resampling irqfds for level triggered interrupts
  KVM: optimize apic interrupt delivery
  KVM: MMU: Eliminate pointless temporary 'ac'
  KVM: MMU: Avoid access/dirty update loop if all is well
  KVM: MMU: Eliminate eperm temporary
  KVM: MMU: Optimize is_last_gpte()
  KVM: MMU: Simplify walk_addr_generic() loop
  KVM: MMU: Optimize pte permission checks
  KVM: MMU: Update accessed and dirty bits after guest pagetable walk
  KVM: MMU: Move gpte_access() out of paging_tmpl.h
  KVM: MMU: Optimize gpte_access() slightly
  KVM: MMU: Push clean gpte write protection out of gpte_access()
  KVM: clarify kvmclock documentation
  KVM: make processes waiting on vcpu mutex killable
  KVM: SVM: Make use of asm.h
  KVM: VMX: Make use of asm.h
  KVM: VMX: Make lto-friendly
  KVM: x86: lapic: Clean up find_highest_vector() and count_vectors()
  ...

Conflicts:
	arch/s390/include/asm/processor.h
	arch/x86/kvm/i8259.c
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--  arch/x86/kvm/Kconfig        |    2
-rw-r--r--  arch/x86/kvm/Makefile       |    2
-rw-r--r--  arch/x86/kvm/cpuid.c        |   14
-rw-r--r--  arch/x86/kvm/emulate.c      |  538
-rw-r--r--  arch/x86/kvm/i8254.c        |   64
-rw-r--r--  arch/x86/kvm/i8254.h        |    6
-rw-r--r--  arch/x86/kvm/i8259.c        |   70
-rw-r--r--  arch/x86/kvm/irq.h          |    2
-rw-r--r--  arch/x86/kvm/kvm_timer.h    |   18
-rw-r--r--  arch/x86/kvm/lapic.c        |  484
-rw-r--r--  arch/x86/kvm/lapic.h        |   61
-rw-r--r--  arch/x86/kvm/mmu.c          |  240
-rw-r--r--  arch/x86/kvm/mmu.h          |   25
-rw-r--r--  arch/x86/kvm/mmu_audit.c    |    8
-rw-r--r--  arch/x86/kvm/paging_tmpl.h  |  199
-rw-r--r--  arch/x86/kvm/pmu.c          |    2
-rw-r--r--  arch/x86/kvm/svm.c          |   82
-rw-r--r--  arch/x86/kvm/timer.c        |   47
-rw-r--r--  arch/x86/kvm/vmx.c          |  233
-rw-r--r--  arch/x86/kvm/x86.c          |  384
-rw-r--r--  arch/x86/kvm/x86.h          |    1
21 files changed, 1408 insertions, 1074 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338843ea..586f00059805 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -20,6 +20,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on HAVE_KVM
+	depends on HIGH_RES_TIMERS
 	# for device assignment:
 	depends on PCI
 	# for TASKSTATS/TASK_DELAY_ACCT:
@@ -37,6 +38,7 @@ config KVM
 	select TASK_DELAY_ACCT
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4f579e8dcacf..04d30401c5cb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o cpuid.o pmu.o
+			   i8254.o cpuid.o pmu.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0595f1397b7c..ec79e773342e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -316,7 +316,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	}
 	case 7: {
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capbability word 9 */
+		/* Mask ebx against host capability word 9 */
 		if (index == 0) {
 			entry->ebx &= kvm_supported_word9_x86_features;
 			cpuid_mask(&entry->ebx, 9);
@@ -397,8 +397,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	}
 	case KVM_CPUID_SIGNATURE: {
-		char signature[12] = "KVMKVMKVM\0\0";
-		u32 *sigptr = (u32 *)signature;
+		static const char signature[12] = "KVMKVMKVM\0\0";
+		const u32 *sigptr = (const u32 *)signature;
 		entry->eax = KVM_CPUID_FEATURES;
 		entry->ebx = sigptr[0];
 		entry->ecx = sigptr[1];
@@ -484,10 +484,10 @@ struct kvm_cpuid_param {
 	u32 func;
 	u32 idx;
 	bool has_leaf_count;
-	bool (*qualifier)(struct kvm_cpuid_param *param);
+	bool (*qualifier)(const struct kvm_cpuid_param *param);
 };
 
-static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
 {
 	return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
 }
@@ -498,7 +498,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	struct kvm_cpuid_entry2 *cpuid_entries;
 	int limit, nent = 0, r = -E2BIG, i;
 	u32 func;
-	static struct kvm_cpuid_param param[] = {
+	static const struct kvm_cpuid_param param[] = {
 		{ .func = 0, .has_leaf_count = true },
 		{ .func = 0x80000000, .has_leaf_count = true },
 		{ .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
@@ -517,7 +517,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 
 	r = 0;
 	for (i = 0; i < ARRAY_SIZE(param); i++) {
-		struct kvm_cpuid_param *ent = &param[i];
+		const struct kvm_cpuid_param *ent = &param[i];
 
 		if (ent->qualifier && !ent->qualifier(ent))
 			continue;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3b57a27be88..39171cb307ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -161,9 +161,9 @@ struct opcode {
 	u64 intercept : 8;
 	union {
 		int (*execute)(struct x86_emulate_ctxt *ctxt);
-		struct opcode *group;
-		struct group_dual *gdual;
-		struct gprefix *gprefix;
+		const struct opcode *group;
+		const struct group_dual *gdual;
+		const struct gprefix *gprefix;
 	} u;
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 };
@@ -202,6 +202,42 @@ struct gprefix {
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
 #define EFLG_RESERVED_ONE_MASK 2
 
+static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+	if (!(ctxt->regs_valid & (1 << nr))) {
+		ctxt->regs_valid |= 1 << nr;
+		ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+	}
+	return ctxt->_regs[nr];
+}
+
+static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+	ctxt->regs_valid |= 1 << nr;
+	ctxt->regs_dirty |= 1 << nr;
+	return &ctxt->_regs[nr];
+}
+
+static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+	reg_read(ctxt, nr);
+	return reg_write(ctxt, nr);
+}
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned reg;
+
+	for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+		ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->regs_dirty = 0;
+	ctxt->regs_valid = 0;
+}
+
 /*
  * Instruction emulation:
  * Most instructions are emulated directly via a fragment of inline assembly
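
The reg_read()/reg_write()/reg_rmw() helpers added above replace direct ctxt->regs[] accesses with a lazily filled, dirty-tracked cache: a guest GPR is fetched from the vcpu only when an instruction first touches it, and only modified registers are written back. The following is a minimal stand-alone sketch of that caching scheme, not kernel code; the demo_ctxt structure, its gprs[] array and the main() driver are illustrative stand-ins, but the valid/dirty bitmap logic mirrors the helpers in the hunk.

/* Stand-alone sketch of the lazy GPR cache (illustrative, not kernel code). */
#include <stdio.h>

#define NR_GPRS 16

struct demo_ctxt {
	unsigned long gprs[NR_GPRS];   /* stand-in for the vcpu register file */
	unsigned long cache[NR_GPRS];  /* plays the role of ctxt->_regs[] */
	unsigned short valid, dirty;   /* regs_valid / regs_dirty bitmaps */
	int fetches;                   /* counts "expensive" vcpu reads */
};

static unsigned long reg_read(struct demo_ctxt *c, unsigned nr)
{
	if (!(c->valid & (1u << nr))) {        /* fetch only on first use */
		c->valid |= 1u << nr;
		c->cache[nr] = c->gprs[nr];
		c->fetches++;
	}
	return c->cache[nr];
}

static unsigned long *reg_write(struct demo_ctxt *c, unsigned nr)
{
	c->valid |= 1u << nr;
	c->dirty |= 1u << nr;                  /* mark for later writeback */
	return &c->cache[nr];
}

static unsigned long *reg_rmw(struct demo_ctxt *c, unsigned nr)
{
	reg_read(c, nr);                       /* make sure the cache is filled */
	return reg_write(c, nr);
}

static void writeback_registers(struct demo_ctxt *c)
{
	for (unsigned nr = 0; nr < NR_GPRS; nr++)
		if (c->dirty & (1u << nr))
			c->gprs[nr] = c->cache[nr];
	c->valid = c->dirty = 0;               /* like invalidate_registers() */
}

int main(void)
{
	struct demo_ctxt c = { .gprs = { 41 } };

	*reg_rmw(&c, 0) += 1;                  /* read-modify-write of "gpr0" */
	writeback_registers(&c);
	printf("gpr0=%lu after %d fetch(es)\n", c.gprs[0], c.fetches);
	return 0;
}
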
@@ -374,8 +410,8 @@ struct gprefix {
 #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex)			\
 	do {								\
 		unsigned long _tmp;					\
-		ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX];		\
-		ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX];		\
+		ulong *rax = reg_rmw((ctxt), VCPU_REGS_RAX);		\
+		ulong *rdx = reg_rmw((ctxt), VCPU_REGS_RDX);		\
 									\
 		__asm__ __volatile__ (					\
 			_PRE_EFLAGS("0", "5", "1")			\
@@ -494,7 +530,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
 
 static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
 {
-	masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc);
+	masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
 }
 
 static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -632,8 +668,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 
 	la = seg_base(ctxt, addr.seg) + addr.ea;
 	switch (ctxt->mode) {
-	case X86EMUL_MODE_REAL:
-		break;
 	case X86EMUL_MODE_PROT64:
 		if (((signed long)la << 16) >> 16 != la)
 			return emulate_gp(ctxt, 0);
@@ -655,7 +689,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
 			goto bad;
 	} else {
-		/* exapand-down segment */
+		/* expand-down segment */
 		if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
 			goto bad;
 		lim = desc.d ? 0xffffffff : 0xffff;
@@ -663,7 +697,10 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 			goto bad;
 	}
 	cpl = ctxt->ops->cpl(ctxt);
-	rpl = sel & 3;
+	if (ctxt->mode == X86EMUL_MODE_REAL)
+		rpl = 0;
+	else
+		rpl = sel & 3;
 	cpl = max(cpl, rpl);
 	if (!(desc.type & 8)) {
 		/* data segment */
@@ -688,9 +725,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	return X86EMUL_CONTINUE;
 bad:
 	if (addr.seg == VCPU_SREG_SS)
-		return emulate_ss(ctxt, addr.seg);
+		return emulate_ss(ctxt, sel);
 	else
-		return emulate_gp(ctxt, addr.seg);
+		return emulate_gp(ctxt, sel);
 }
 
 static int linearize(struct x86_emulate_ctxt *ctxt,
@@ -786,14 +823,15 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
  * pointer into the block that addresses the relevant register.
  * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
  */
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
 			     int highbyte_regs)
 {
 	void *p;
 
-	p = &regs[modrm_reg];
 	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
-		p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+		p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+	else
+		p = reg_rmw(ctxt, modrm_reg);
 	return p;
 }
 
@@ -871,23 +909,23 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
 {
 	ctxt->ops->get_fpu(ctxt);
 	switch (reg) {
-	case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
-	case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
-	case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
-	case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
-	case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
-	case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
-	case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
-	case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
+	case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+	case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+	case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+	case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+	case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+	case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+	case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+	case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
 #ifdef CONFIG_X86_64
-	case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
-	case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
-	case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
-	case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
-	case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
-	case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
-	case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
-	case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
+	case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+	case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+	case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+	case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+	case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+	case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+	case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+	case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
 #endif
 	default: BUG();
 	}
@@ -899,23 +937,23 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 {
 	ctxt->ops->get_fpu(ctxt);
 	switch (reg) {
-	case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
-	case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
-	case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
-	case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
-	case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
-	case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
-	case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
-	case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
+	case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+	case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+	case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+	case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+	case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+	case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+	case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+	case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
 #ifdef CONFIG_X86_64
-	case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
-	case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
-	case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
-	case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
-	case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
-	case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
-	case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
-	case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
+	case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+	case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+	case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+	case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+	case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+	case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+	case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+	case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
 #endif
 	default: BUG();
 	}
@@ -982,10 +1020,10 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 
 	op->type = OP_REG;
 	if (ctxt->d & ByteOp) {
-		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
+		op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
 		op->bytes = 1;
 	} else {
-		op->addr.reg = decode_register(reg, ctxt->regs, 0);
+		op->addr.reg = decode_register(ctxt, reg, 0);
 		op->bytes = ctxt->op_bytes;
 	}
 	fetch_register_operand(op);
@@ -1020,8 +1058,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	if (ctxt->modrm_mod == 3) {
 		op->type = OP_REG;
 		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
-		op->addr.reg = decode_register(ctxt->modrm_rm,
-					       ctxt->regs, ctxt->d & ByteOp);
+		op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, ctxt->d & ByteOp);
 		if (ctxt->d & Sse) {
 			op->type = OP_XMM;
 			op->bytes = 16;
@@ -1042,10 +1079,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	op->type = OP_MEM;
 
 	if (ctxt->ad_bytes == 2) {
-		unsigned bx = ctxt->regs[VCPU_REGS_RBX];
-		unsigned bp = ctxt->regs[VCPU_REGS_RBP];
-		unsigned si = ctxt->regs[VCPU_REGS_RSI];
-		unsigned di = ctxt->regs[VCPU_REGS_RDI];
+		unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+		unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+		unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+		unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
 
 		/* 16-bit ModR/M decode. */
 		switch (ctxt->modrm_mod) {
@@ -1102,17 +1139,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
 				modrm_ea += insn_fetch(s32, ctxt);
 			else {
-				modrm_ea += ctxt->regs[base_reg];
+				modrm_ea += reg_read(ctxt, base_reg);
 				adjust_modrm_seg(ctxt, base_reg);
 			}
 			if (index_reg != 4)
-				modrm_ea += ctxt->regs[index_reg] << scale;
+				modrm_ea += reg_read(ctxt, index_reg) << scale;
 		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
 				ctxt->rip_relative = 1;
 		} else {
 			base_reg = ctxt->modrm_rm;
-			modrm_ea += ctxt->regs[base_reg];
+			modrm_ea += reg_read(ctxt, base_reg);
 			adjust_modrm_seg(ctxt, base_reg);
 		}
 		switch (ctxt->modrm_mod) {
@@ -1179,24 +1216,21 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	struct read_cache *mc = &ctxt->mem_read;
 
-	while (size) {
-		int n = min(size, 8u);
-		size -= n;
-		if (mc->pos < mc->end)
-			goto read_cached;
-
-		rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-					      &ctxt->exception);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		mc->end += n;
-
-	read_cached:
-		memcpy(dest, mc->data + mc->pos, n);
-		mc->pos += n;
-		dest += n;
-		addr += n;
-	}
+	if (mc->pos < mc->end)
+		goto read_cached;
+
+	WARN_ON((mc->end + size) >= sizeof(mc->data));
+
+	rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+				      &ctxt->exception);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+
+	mc->end += size;
+
+read_cached:
+	memcpy(dest, mc->data + mc->pos, size);
+	mc->pos += size;
 	return X86EMUL_CONTINUE;
 }
 
@@ -1253,10 +1287,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	if (rc->pos == rc->end) { /* refill pio read ahead */
 		unsigned int in_page, n;
 		unsigned int count = ctxt->rep_prefix ?
-			address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
+			address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
 		in_page = (ctxt->eflags & EFLG_DF) ?
-			offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
-			PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
+			offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+			PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
 		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
 			count);
 		if (n == 0)
@@ -1267,8 +1301,15 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 		rc->end = n * size;
 	}
 
-	memcpy(dest, rc->data + rc->pos, size);
-	rc->pos += size;
+	if (ctxt->rep_prefix && !(ctxt->eflags & EFLG_DF)) {
+		ctxt->dst.data = rc->data + rc->pos;
+		ctxt->dst.type = OP_MEM_STR;
+		ctxt->dst.count = (rc->end - rc->pos) / size;
+		rc->pos = rc->end;
+	} else {
+		memcpy(dest, rc->data + rc->pos, size);
+		rc->pos += size;
+	}
 	return 1;
 }
 
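The pio_in_emulated() change above is one of the PIO optimizations mentioned in the merge description: for a forward rep IN, instead of copying one element per emulated iteration, the destination operand is pointed straight at the read-ahead buffer and the whole cached run is flushed by a single OP_MEM_STR writeback. The toy model below illustrates only that batching idea; the buffer and function names are illustrative stand-ins, not the emulator's real interfaces.

/* Toy model of batching a rep IN from a read-ahead cache (illustrative only). */
#include <stdio.h>
#include <string.h>

static unsigned char rc_data[64];      /* plays the role of rc->data */
static size_t rc_pos, rc_end;

/* old path: one element copied per emulated iteration */
static void copy_one(unsigned char *dest, size_t size)
{
	memcpy(dest, rc_data + rc_pos, size);
	rc_pos += size;
}

/* new path: expose the remaining cached elements for one string writeback */
static size_t copy_batched(const unsigned char **src, size_t size)
{
	size_t count = (rc_end - rc_pos) / size;

	*src = rc_data + rc_pos;       /* like ctxt->dst.data = rc->data + rc->pos */
	rc_pos = rc_end;               /* the whole cached run is consumed at once */
	return count;                  /* like ctxt->dst.count */
}

int main(void)
{
	unsigned char guest_mem[64] = { 0 };
	const unsigned char *batch;
	size_t n;

	rc_end = 16;                             /* pretend 16 bytes were read ahead */
	copy_one(guest_mem, 2);                  /* old path: first 16-bit element */
	n = copy_batched(&batch, 2);             /* new path: remaining 7 elements */
	memcpy(guest_mem + 2, batch, n * 2);     /* single "OP_MEM_STR" writeback */
	printf("batched %zu elements in one writeback\n", n);
	return 0;
}
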
@@ -1291,7 +1332,7 @@ static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, struct desc_ptr *dt)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
@@ -1355,19 +1396,15 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
 	ulong desc_addr;
 	int ret;
+	u16 dummy;
 
 	memset(&seg_desc, 0, sizeof seg_desc);
 
 	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
 	    || ctxt->mode == X86EMUL_MODE_REAL) {
 		/* set real mode segment descriptor */
+		ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
 		set_desc_base(&seg_desc, selector << 4);
-		set_desc_limit(&seg_desc, 0xffff);
-		seg_desc.type = 3;
-		seg_desc.p = 1;
-		seg_desc.s = 1;
-		if (ctxt->mode == X86EMUL_MODE_VM86)
-			seg_desc.dpl = 3;
 		goto load;
 	}
 
@@ -1396,7 +1433,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	err_code = selector & 0xfffc;
 	err_vec = GP_VECTOR;
 
-	/* can't load system descriptor into segment selecor */
+	/* can't load system descriptor into segment selector */
 	if (seg <= VCPU_SREG_GS && !seg_desc.s)
 		goto exception;
 
@@ -1516,6 +1553,14 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
+	case OP_MEM_STR:
+		rc = segmented_write(ctxt,
+				ctxt->dst.addr.mem,
+				ctxt->dst.data,
+				ctxt->dst.bytes * ctxt->dst.count);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		break;
 	case OP_XMM:
 		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
@@ -1536,7 +1581,7 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
 	struct segmented_address addr;
 
 	rsp_increment(ctxt, -bytes);
-	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+	addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 
 	return segmented_write(ctxt, addr, data, bytes);
@@ -1555,7 +1600,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt);
+	addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1623,26 +1668,28 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
 	int rc;
 	unsigned frame_size = ctxt->src.val;
 	unsigned nesting_level = ctxt->src2.val & 31;
+	ulong rbp;
 
 	if (nesting_level)
 		return X86EMUL_UNHANDLEABLE;
 
-	rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+	rbp = reg_read(ctxt, VCPU_REGS_RBP);
+	rc = push(ctxt, &rbp, stack_size(ctxt));
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+	assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
 		      stack_mask(ctxt));
-	assign_masked(&ctxt->regs[VCPU_REGS_RSP],
-		      ctxt->regs[VCPU_REGS_RSP] - frame_size,
+	assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+		      reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
 		      stack_mask(ctxt));
 	return X86EMUL_CONTINUE;
 }
 
 static int em_leave(struct x86_emulate_ctxt *ctxt)
 {
-	assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+	assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
 		      stack_mask(ctxt));
-	return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+	return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
 }
 
 static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
@@ -1670,13 +1717,13 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-	unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
+	unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 
 	while (reg <= VCPU_REGS_RDI) {
 		(reg == VCPU_REGS_RSP) ?
-		(ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
+		(ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
 
 		rc = em_push(ctxt);
 		if (rc != X86EMUL_CONTINUE)
@@ -1705,7 +1752,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 			--reg;
 		}
 
-		rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
+		rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
@@ -1713,9 +1760,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	int rc;
 	struct desc_ptr dt;
 	gva_t cs_addr;
@@ -1762,11 +1809,22 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	return rc;
 }
 
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+	int rc;
+
+	invalidate_registers(ctxt);
+	rc = __emulate_int_real(ctxt, irq);
+	if (rc == X86EMUL_CONTINUE)
+		writeback_registers(ctxt);
+	return rc;
+}
+
 static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_int_real(ctxt, irq);
+		return __emulate_int_real(ctxt, irq);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1973,14 +2031,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
 	u64 old = ctxt->dst.orig_val64;
 
-	if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
-	    ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
-		ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-		ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
 	} else {
-		ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
-			(u32) ctxt->regs[VCPU_REGS_RBX];
+		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+			(u32) reg_read(ctxt, VCPU_REGS_RBX);
 
 		ctxt->eflags |= EFLG_ZF;
 	}
@@ -2016,7 +2074,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 {
 	/* Save real source value, then compare EAX against destination. */
 	ctxt->src.orig_val = ctxt->src.val;
-	ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+	ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
 	emulate_2op_SrcV(ctxt, "cmp");
 
 	if (ctxt->eflags & EFLG_ZF) {
@@ -2025,7 +2083,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 	} else {
 		/* Failure: write the value we saw to EAX. */
 		ctxt->dst.type = OP_REG;
-		ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+		ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
 	}
 	return X86EMUL_CONTINUE;
 }
@@ -2050,12 +2108,6 @@ static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct desc_struct *cs, struct desc_struct *ss)
 {
-	u16 selector;
-
-	memset(cs, 0, sizeof(struct desc_struct));
-	ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
-	memset(ss, 0, sizeof(struct desc_struct));
-
 	cs->l = 0;		/* will be adjusted later */
 	set_desc_base(cs, 0);	/* flat segment */
 	cs->g = 1;		/* 4kb granularity */
@@ -2065,6 +2117,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	cs->dpl = 0;		/* will be adjusted later */
 	cs->p = 1;
 	cs->d = 1;
+	cs->avl = 0;
 
 	set_desc_base(ss, 0);	/* flat segment */
 	set_desc_limit(ss, 0xfffff);	/* 4GB limit */
@@ -2074,6 +2127,8 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->d = 1;		/* 32bit stack segment */
 	ss->dpl = 0;
 	ss->p = 1;
+	ss->l = 0;
+	ss->avl = 0;
 }
 
 static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
@@ -2089,7 +2144,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
 
 static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	u32 eax, ebx, ecx, edx;
 
 	/*
@@ -2133,7 +2188,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
 
 static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
@@ -2165,10 +2220,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
+	*reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
 	if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
-		ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+		*reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
 
 		ops->get_msr(ctxt,
 			     ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2191,7 +2246,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
 static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
@@ -2228,6 +2283,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 		if (msr_data == 0x0)
 			return emulate_gp(ctxt, 0);
 		break;
+	default:
+		break;
 	}
 
 	ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -2247,14 +2304,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	ctxt->_eip = msr_data;
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
-	ctxt->regs[VCPU_REGS_RSP] = msr_data;
+	*reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
 
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	int usermode;
@@ -2297,8 +2354,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
-	ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
+	ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
+	*reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
 
 	return X86EMUL_CONTINUE;
 }
@@ -2317,7 +2374,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 					    u16 port, u16 len)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct tr_seg;
 	u32 base3;
 	int r;
@@ -2367,14 +2424,14 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 {
 	tss->ip = ctxt->_eip;
 	tss->flag = ctxt->eflags;
-	tss->ax = ctxt->regs[VCPU_REGS_RAX];
-	tss->cx = ctxt->regs[VCPU_REGS_RCX];
-	tss->dx = ctxt->regs[VCPU_REGS_RDX];
-	tss->bx = ctxt->regs[VCPU_REGS_RBX];
-	tss->sp = ctxt->regs[VCPU_REGS_RSP];
-	tss->bp = ctxt->regs[VCPU_REGS_RBP];
-	tss->si = ctxt->regs[VCPU_REGS_RSI];
-	tss->di = ctxt->regs[VCPU_REGS_RDI];
+	tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+	tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+	tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+	tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+	tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+	tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+	tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+	tss->di = reg_read(ctxt, VCPU_REGS_RDI);
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2390,14 +2447,14 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 
 	ctxt->_eip = tss->ip;
 	ctxt->eflags = tss->flag | 2;
-	ctxt->regs[VCPU_REGS_RAX] = tss->ax;
-	ctxt->regs[VCPU_REGS_RCX] = tss->cx;
-	ctxt->regs[VCPU_REGS_RDX] = tss->dx;
-	ctxt->regs[VCPU_REGS_RBX] = tss->bx;
-	ctxt->regs[VCPU_REGS_RSP] = tss->sp;
-	ctxt->regs[VCPU_REGS_RBP] = tss->bp;
-	ctxt->regs[VCPU_REGS_RSI] = tss->si;
-	ctxt->regs[VCPU_REGS_RDI] = tss->di;
+	*reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+	*reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+	*reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+	*reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+	*reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+	*reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+	*reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+	*reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2410,7 +2467,7 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
 
 	/*
-	 * Now load segment descriptors. If fault happenes at this stage
+	 * Now load segment descriptors. If fault happens at this stage
 	 * it is handled in a context of new task
 	 */
 	ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
@@ -2436,7 +2493,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_16 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2482,14 +2539,14 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 	tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
 	tss->eip = ctxt->_eip;
 	tss->eflags = ctxt->eflags;
-	tss->eax = ctxt->regs[VCPU_REGS_RAX];
-	tss->ecx = ctxt->regs[VCPU_REGS_RCX];
-	tss->edx = ctxt->regs[VCPU_REGS_RDX];
-	tss->ebx = ctxt->regs[VCPU_REGS_RBX];
-	tss->esp = ctxt->regs[VCPU_REGS_RSP];
-	tss->ebp = ctxt->regs[VCPU_REGS_RBP];
-	tss->esi = ctxt->regs[VCPU_REGS_RSI];
-	tss->edi = ctxt->regs[VCPU_REGS_RDI];
+	tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+	tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+	tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+	tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+	tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+	tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+	tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+	tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2511,14 +2568,14 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	ctxt->eflags = tss->eflags | 2;
 
 	/* General purpose registers */
-	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
-	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
-	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
-	ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
-	ctxt->regs[VCPU_REGS_RSP] = tss->esp;
-	ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
-	ctxt->regs[VCPU_REGS_RSI] = tss->esi;
-	ctxt->regs[VCPU_REGS_RDI] = tss->edi;
+	*reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+	*reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+	*reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+	*reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+	*reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+	*reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+	*reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+	*reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2583,7 +2640,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_32 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2627,7 +2684,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 				   u16 tss_selector, int idt_index, int reason,
 				   bool has_error_code, u32 error_code)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
+	const struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct curr_tss_desc, next_tss_desc;
 	int ret;
 	u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2652,7 +2709,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	 *
 	 * 1. jmp/call/int to task gate: Check against DPL of the task gate
 	 * 2. Exception/IRQ/iret: No check is performed
-	 * 3. jmp/call to TSS: Check agains DPL of the TSS
+	 * 3. jmp/call to TSS: Check against DPL of the TSS
 	 */
 	if (reason == TASK_SWITCH_GATE) {
 		if (idt_index != -1) {
@@ -2693,7 +2750,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
 
 	/* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used afetr this point */
+	   note that old_tss_sel is not used after this point */
 	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
 		old_tss_sel = 0xffff;
 
@@ -2733,26 +2790,28 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 
+	invalidate_registers(ctxt);
 	ctxt->_eip = ctxt->eip;
 	ctxt->dst.type = OP_NONE;
 
 	rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
 				     has_error_code, error_code);
 
-	if (rc == X86EMUL_CONTINUE)
+	if (rc == X86EMUL_CONTINUE) {
 		ctxt->eip = ctxt->_eip;
+		writeback_registers(ctxt);
+	}
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
-			    int reg, struct operand *op)
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+			    struct operand *op)
 {
-	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
 
-	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
-	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
-	op->addr.mem.seg = seg;
+	register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
+	op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
 }
 
 static int em_das(struct x86_emulate_ctxt *ctxt)
@@ -2927,7 +2986,7 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
 {
 	ctxt->dst.type = OP_REG;
 	ctxt->dst.bytes = ctxt->src.bytes;
-	ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+	ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
 	ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
 
 	return X86EMUL_CONTINUE;
@@ -2938,8 +2997,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 	u64 tsc = 0;
 
 	ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
-	ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
-	ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
+	*reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+	*reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
 	return X86EMUL_CONTINUE;
 }
 
@@ -2947,10 +3006,10 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
 	u64 pmc;
 
-	if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+	if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
 		return emulate_gp(ctxt, 0);
-	ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
-	ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+	*reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+	*reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
 	return X86EMUL_CONTINUE;
 }
 
@@ -2992,9 +3051,9 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
 {
 	u64 msr_data;
 
-	msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-		| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
-	if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+	msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+		| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+	if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -3004,11 +3063,11 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
 {
 	u64 msr_data;
 
-	if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+	if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
 		return emulate_gp(ctxt, 0);
 
-	ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
-	ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+	*reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+	*reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
 	return X86EMUL_CONTINUE;
 }
 
@@ -3188,8 +3247,8 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 
 static int em_loop(struct x86_emulate_ctxt *ctxt)
 {
-	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
-	if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+	register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
+	if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
 	    (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
 		jmp_rel(ctxt, ctxt->src.val);
 
@@ -3198,7 +3257,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
 
 static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 {
-	if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+	if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
 		jmp_rel(ctxt, ctxt->src.val);
 
 	return X86EMUL_CONTINUE;
@@ -3286,20 +3345,20 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 {
 	u32 eax, ebx, ecx, edx;
 
-	eax = ctxt->regs[VCPU_REGS_RAX];
-	ecx = ctxt->regs[VCPU_REGS_RCX];
+	eax = reg_read(ctxt, VCPU_REGS_RAX);
+	ecx = reg_read(ctxt, VCPU_REGS_RCX);
 	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
-	ctxt->regs[VCPU_REGS_RAX] = eax;
-	ctxt->regs[VCPU_REGS_RBX] = ebx;
-	ctxt->regs[VCPU_REGS_RCX] = ecx;
-	ctxt->regs[VCPU_REGS_RDX] = edx;
+	*reg_write(ctxt, VCPU_REGS_RAX) = eax;
+	*reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+	*reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+	*reg_write(ctxt, VCPU_REGS_RDX) = edx;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_lahf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
-	ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+	*reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+	*reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
 	return X86EMUL_CONTINUE;
 }
 
@@ -3456,7 +3515,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
 
 static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
 {
-	u64 rax = ctxt->regs[VCPU_REGS_RAX];
+	u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
 
 	/* Valid physical address? */
 	if (rax & 0xffff000000000000ULL)
@@ -3478,7 +3537,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
 static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
 	u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-	u64 rcx = ctxt->regs[VCPU_REGS_RCX];
+	u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
 
 	if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
 	    (rcx > 3))
@@ -3531,13 +3590,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 	I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e),		\
 	I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
 
-static struct opcode group7_rm1[] = {
+static const struct opcode group7_rm1[] = {
 	DI(SrcNone | Priv, monitor),
 	DI(SrcNone | Priv, mwait),
 	N, N, N, N, N, N,
 };
 
-static struct opcode group7_rm3[] = {
+static const struct opcode group7_rm3[] = {
 	DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
 	II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
 	DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
@@ -3548,13 +3607,13 @@ static struct opcode group7_rm3[] = {
3548 DIP(SrcNone | Prot | Priv, invlpga, check_svme), 3607 DIP(SrcNone | Prot | Priv, invlpga, check_svme),
3549}; 3608};
3550 3609
3551static struct opcode group7_rm7[] = { 3610static const struct opcode group7_rm7[] = {
3552 N, 3611 N,
3553 DIP(SrcNone, rdtscp, check_rdtsc), 3612 DIP(SrcNone, rdtscp, check_rdtsc),
3554 N, N, N, N, N, N, 3613 N, N, N, N, N, N,
3555}; 3614};
3556 3615
3557static struct opcode group1[] = { 3616static const struct opcode group1[] = {
3558 I(Lock, em_add), 3617 I(Lock, em_add),
3559 I(Lock | PageTable, em_or), 3618 I(Lock | PageTable, em_or),
3560 I(Lock, em_adc), 3619 I(Lock, em_adc),
@@ -3565,11 +3624,11 @@ static struct opcode group1[] = {
3565 I(0, em_cmp), 3624 I(0, em_cmp),
3566}; 3625};
3567 3626
3568static struct opcode group1A[] = { 3627static const struct opcode group1A[] = {
3569 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3570}; 3629};
3571 3630
3572static struct opcode group3[] = { 3631static const struct opcode group3[] = {
3573 I(DstMem | SrcImm, em_test), 3632 I(DstMem | SrcImm, em_test),
3574 I(DstMem | SrcImm, em_test), 3633 I(DstMem | SrcImm, em_test),
3575 I(DstMem | SrcNone | Lock, em_not), 3634 I(DstMem | SrcNone | Lock, em_not),
@@ -3580,13 +3639,13 @@ static struct opcode group3[] = {
3580 I(SrcMem, em_idiv_ex), 3639 I(SrcMem, em_idiv_ex),
3581}; 3640};
3582 3641
3583static struct opcode group4[] = { 3642static const struct opcode group4[] = {
3584 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3585 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
3586 N, N, N, N, N, N, 3645 N, N, N, N, N, N,
3587}; 3646};
3588 3647
3589static struct opcode group5[] = { 3648static const struct opcode group5[] = {
3590 I(DstMem | SrcNone | Lock, em_grp45), 3649 I(DstMem | SrcNone | Lock, em_grp45),
3591 I(DstMem | SrcNone | Lock, em_grp45), 3650 I(DstMem | SrcNone | Lock, em_grp45),
3592 I(SrcMem | Stack, em_grp45), 3651 I(SrcMem | Stack, em_grp45),
@@ -3596,7 +3655,7 @@ static struct opcode group5[] = {
3596 I(SrcMem | Stack, em_grp45), N, 3655 I(SrcMem | Stack, em_grp45), N,
3597}; 3656};
3598 3657
3599static struct opcode group6[] = { 3658static const struct opcode group6[] = {
3600 DI(Prot, sldt), 3659 DI(Prot, sldt),
3601 DI(Prot, str), 3660 DI(Prot, str),
3602 II(Prot | Priv | SrcMem16, em_lldt, lldt), 3661 II(Prot | Priv | SrcMem16, em_lldt, lldt),
@@ -3604,7 +3663,7 @@ static struct opcode group6[] = {
3604 N, N, N, N, 3663 N, N, N, N,
3605}; 3664};
3606 3665
3607static struct group_dual group7 = { { 3666static const struct group_dual group7 = { {
3608 II(Mov | DstMem | Priv, em_sgdt, sgdt), 3667 II(Mov | DstMem | Priv, em_sgdt, sgdt),
3609 II(Mov | DstMem | Priv, em_sidt, sidt), 3668 II(Mov | DstMem | Priv, em_sidt, sidt),
3610 II(SrcMem | Priv, em_lgdt, lgdt), 3669 II(SrcMem | Priv, em_lgdt, lgdt),
@@ -3621,7 +3680,7 @@ static struct group_dual group7 = { {
3621 EXT(0, group7_rm7), 3680 EXT(0, group7_rm7),
3622} }; 3681} };
3623 3682
3624static struct opcode group8[] = { 3683static const struct opcode group8[] = {
3625 N, N, N, N, 3684 N, N, N, N,
3626 I(DstMem | SrcImmByte, em_bt), 3685 I(DstMem | SrcImmByte, em_bt),
3627 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
@@ -3629,26 +3688,26 @@ static struct opcode group8[] = {
3629 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3630}; 3689};
3631 3690
3632static struct group_dual group9 = { { 3691static const struct group_dual group9 = { {
3633 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, 3692 N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
3634}, { 3693}, {
3635 N, N, N, N, N, N, N, N, 3694 N, N, N, N, N, N, N, N,
3636} }; 3695} };
3637 3696
3638static struct opcode group11[] = { 3697static const struct opcode group11[] = {
3639 I(DstMem | SrcImm | Mov | PageTable, em_mov), 3698 I(DstMem | SrcImm | Mov | PageTable, em_mov),
3640 X7(D(Undefined)), 3699 X7(D(Undefined)),
3641}; 3700};
3642 3701
3643static struct gprefix pfx_0f_6f_0f_7f = { 3702static const struct gprefix pfx_0f_6f_0f_7f = {
3644 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 3703 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
3645}; 3704};
3646 3705
3647static struct gprefix pfx_vmovntpx = { 3706static const struct gprefix pfx_vmovntpx = {
3648 I(0, em_mov), N, N, N, 3707 I(0, em_mov), N, N, N,
3649}; 3708};
3650 3709
3651static struct opcode opcode_table[256] = { 3710static const struct opcode opcode_table[256] = {
3652 /* 0x00 - 0x07 */ 3711 /* 0x00 - 0x07 */
3653 I6ALU(Lock, em_add), 3712 I6ALU(Lock, em_add),
3654 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
@@ -3689,7 +3748,7 @@ static struct opcode opcode_table[256] = {
3689 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3748 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3690 I(SrcImmByte | Mov | Stack, em_push), 3749 I(SrcImmByte | Mov | Stack, em_push),
3691 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3750 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3692 I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ 3751 I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
3693 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ 3752 I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
3694 /* 0x70 - 0x7F */ 3753 /* 0x70 - 0x7F */
3695 X16(D(SrcImmByte)), 3754 X16(D(SrcImmByte)),
@@ -3765,7 +3824,7 @@ static struct opcode opcode_table[256] = {
3765 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3824 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3766}; 3825};
3767 3826
3768static struct opcode twobyte_table[256] = { 3827static const struct opcode twobyte_table[256] = {
3769 /* 0x00 - 0x0F */ 3828 /* 0x00 - 0x0F */
3770 G(0, group6), GD(0, &group7), N, N, 3829 G(0, group6), GD(0, &group7), N, N,
3771 N, I(ImplicitOps | VendorSpecific, em_syscall), 3830 N, I(ImplicitOps | VendorSpecific, em_syscall),
@@ -3936,7 +3995,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3936 case OpAcc: 3995 case OpAcc:
3937 op->type = OP_REG; 3996 op->type = OP_REG;
3938 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 3997 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3939 op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; 3998 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
3940 fetch_register_operand(op); 3999 fetch_register_operand(op);
3941 op->orig_val = op->val; 4000 op->orig_val = op->val;
3942 break; 4001 break;
@@ -3944,19 +4003,20 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3944 op->type = OP_MEM; 4003 op->type = OP_MEM;
3945 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4004 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3946 op->addr.mem.ea = 4005 op->addr.mem.ea =
3947 register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); 4006 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
3948 op->addr.mem.seg = VCPU_SREG_ES; 4007 op->addr.mem.seg = VCPU_SREG_ES;
3949 op->val = 0; 4008 op->val = 0;
4009 op->count = 1;
3950 break; 4010 break;
3951 case OpDX: 4011 case OpDX:
3952 op->type = OP_REG; 4012 op->type = OP_REG;
3953 op->bytes = 2; 4013 op->bytes = 2;
3954 op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; 4014 op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
3955 fetch_register_operand(op); 4015 fetch_register_operand(op);
3956 break; 4016 break;
3957 case OpCL: 4017 case OpCL:
3958 op->bytes = 1; 4018 op->bytes = 1;
3959 op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; 4019 op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
3960 break; 4020 break;
3961 case OpImmByte: 4021 case OpImmByte:
3962 rc = decode_imm(ctxt, op, 1, true); 4022 rc = decode_imm(ctxt, op, 1, true);
@@ -3987,9 +4047,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
3987 op->type = OP_MEM; 4047 op->type = OP_MEM;
3988 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 4048 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
3989 op->addr.mem.ea = 4049 op->addr.mem.ea =
3990 register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); 4050 register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
3991 op->addr.mem.seg = seg_override(ctxt); 4051 op->addr.mem.seg = seg_override(ctxt);
3992 op->val = 0; 4052 op->val = 0;
4053 op->count = 1;
3993 break; 4054 break;
3994 case OpImmFAddr: 4055 case OpImmFAddr:
3995 op->type = OP_IMM; 4056 op->type = OP_IMM;
@@ -4293,9 +4354,10 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4293 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4294} 4355}
4295 4356
4357
4296int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4297{ 4359{
4298 struct x86_emulate_ops *ops = ctxt->ops; 4360 const struct x86_emulate_ops *ops = ctxt->ops;
4299 int rc = X86EMUL_CONTINUE; 4361 int rc = X86EMUL_CONTINUE;
4300 int saved_dst_type = ctxt->dst.type; 4362 int saved_dst_type = ctxt->dst.type;
4301 4363
@@ -4356,7 +4418,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4356 } 4418 }
4357 4419
4358 /* Instruction can only be executed in protected mode */ 4420 /* Instruction can only be executed in protected mode */
4359 if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { 4421 if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
4360 rc = emulate_ud(ctxt); 4422 rc = emulate_ud(ctxt);
4361 goto done; 4423 goto done;
4362 } 4424 }
@@ -4377,7 +4439,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4377 4439
4378 if (ctxt->rep_prefix && (ctxt->d & String)) { 4440 if (ctxt->rep_prefix && (ctxt->d & String)) {
4379 /* All REP prefixes have the same first termination condition */ 4441 /* All REP prefixes have the same first termination condition */
4380 if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { 4442 if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
4381 ctxt->eip = ctxt->_eip; 4443 ctxt->eip = ctxt->_eip;
4382 goto done; 4444 goto done;
4383 } 4445 }
@@ -4450,7 +4512,7 @@ special_insn:
4450 ctxt->dst.val = ctxt->src.addr.mem.ea; 4512 ctxt->dst.val = ctxt->src.addr.mem.ea;
4451 break; 4513 break;
4452 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 4514 case 0x90 ... 0x97: /* nop / xchg reg, rax */
4453 if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) 4515 if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
4454 break; 4516 break;
4455 rc = em_xchg(ctxt); 4517 rc = em_xchg(ctxt);
4456 break; 4518 break;
@@ -4478,7 +4540,7 @@ special_insn:
4478 rc = em_grp2(ctxt); 4540 rc = em_grp2(ctxt);
4479 break; 4541 break;
4480 case 0xd2 ... 0xd3: /* Grp2 */ 4542 case 0xd2 ... 0xd3: /* Grp2 */
4481 ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; 4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4482 rc = em_grp2(ctxt); 4544 rc = em_grp2(ctxt);
4483 break; 4545 break;
4484 case 0xe9: /* jmp rel */ 4546 case 0xe9: /* jmp rel */
@@ -4524,23 +4586,27 @@ writeback:
4524 ctxt->dst.type = saved_dst_type; 4586 ctxt->dst.type = saved_dst_type;
4525 4587
4526 if ((ctxt->d & SrcMask) == SrcSI) 4588 if ((ctxt->d & SrcMask) == SrcSI)
4527 string_addr_inc(ctxt, seg_override(ctxt), 4589 string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
4528 VCPU_REGS_RSI, &ctxt->src);
4529 4590
4530 if ((ctxt->d & DstMask) == DstDI) 4591 if ((ctxt->d & DstMask) == DstDI)
4531 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, 4592 string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
4532 &ctxt->dst);
4533 4593
4534 if (ctxt->rep_prefix && (ctxt->d & String)) { 4594 if (ctxt->rep_prefix && (ctxt->d & String)) {
4595 unsigned int count;
4535 struct read_cache *r = &ctxt->io_read; 4596 struct read_cache *r = &ctxt->io_read;
4536 register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); 4597 if ((ctxt->d & SrcMask) == SrcSI)
4598 count = ctxt->src.count;
4599 else
4600 count = ctxt->dst.count;
4601 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
4602 -count);
4537 4603
4538 if (!string_insn_completed(ctxt)) { 4604 if (!string_insn_completed(ctxt)) {
4539 /* 4605 /*
4540 * Re-enter guest when pio read ahead buffer is empty 4606 * Re-enter guest when pio read ahead buffer is empty
 4541 * or, if it is not used, after every 1024 iterations. 4607 * or, if it is not used, after every 1024 iterations.
4542 */ 4608 */
4543 if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && 4609 if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
4544 (r->end == 0 || r->end != r->pos)) { 4610 (r->end == 0 || r->end != r->pos)) {
4545 /* 4611 /*
4546 * Reset read cache. Usually happens before 4612 * Reset read cache. Usually happens before
@@ -4548,6 +4614,7 @@ writeback:
4548 * we have to do it here. 4614 * we have to do it here.
4549 */ 4615 */
4550 ctxt->mem_read.end = 0; 4616 ctxt->mem_read.end = 0;
4617 writeback_registers(ctxt);
4551 return EMULATION_RESTART; 4618 return EMULATION_RESTART;
4552 } 4619 }
4553 goto done; /* skip rip writeback */ 4620 goto done; /* skip rip writeback */
@@ -4562,6 +4629,9 @@ done:
4562 if (rc == X86EMUL_INTERCEPTED) 4629 if (rc == X86EMUL_INTERCEPTED)
4563 return EMULATION_INTERCEPTED; 4630 return EMULATION_INTERCEPTED;
4564 4631
4632 if (rc == X86EMUL_CONTINUE)
4633 writeback_registers(ctxt);
4634
4565 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4635 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
4566 4636
4567twobyte_insn: 4637twobyte_insn:
@@ -4634,3 +4704,13 @@ twobyte_insn:
4634cannot_emulate: 4704cannot_emulate:
4635 return EMULATION_FAILED; 4705 return EMULATION_FAILED;
4636} 4706}
4707
4708void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
4709{
4710 invalidate_registers(ctxt);
4711}
4712
4713void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
4714{
4715 writeback_registers(ctxt);
4716}
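The emulate.c hunks above replace direct ctxt->regs[] accesses with reg_read()/reg_write()/reg_rmw() and flush the cache through writeback_registers() once emulation succeeds. The standalone C sketch below is not part of the diff; it models the same cached-register idea under invented names (regs_ctx, backing_regs) so the read/dirty/writeback split is easy to see — only registers actually touched are written back to the backing state.

#include <stdio.h>

#define NR_REGS 16

struct regs_ctx {
	unsigned long regs[NR_REGS];      /* local cache */
	unsigned long valid;              /* bit set: cache holds a fresh copy */
	unsigned long dirty;              /* bit set: cache differs from backing store */
};

static unsigned long backing_regs[NR_REGS];  /* stands in for the vcpu register file */

static unsigned long reg_read(struct regs_ctx *c, int nr)
{
	if (!(c->valid & (1ul << nr))) {
		c->regs[nr] = backing_regs[nr];   /* fill cache on first use */
		c->valid |= 1ul << nr;
	}
	return c->regs[nr];
}

static unsigned long *reg_write(struct regs_ctx *c, int nr)
{
	c->valid |= 1ul << nr;            /* caller overwrites the value anyway */
	c->dirty |= 1ul << nr;
	return &c->regs[nr];
}

static unsigned long *reg_rmw(struct regs_ctx *c, int nr)
{
	reg_read(c, nr);                  /* make sure the cache is populated */
	c->dirty |= 1ul << nr;
	return &c->regs[nr];
}

static void writeback_registers(struct regs_ctx *c)
{
	for (int i = 0; i < NR_REGS; i++)
		if (c->dirty & (1ul << i))
			backing_regs[i] = c->regs[i];
	c->dirty = 0;
}

int main(void)
{
	struct regs_ctx c = { 0 };

	backing_regs[0] = 41;
	*reg_rmw(&c, 0) += 1;             /* read-modify-write through the cache */
	writeback_registers(&c);
	printf("%lu\n", backing_regs[0]); /* prints 42 */
	return 0;
}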
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index adba28f88d1a..11300d2fa714 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -108,7 +108,7 @@ static s64 __kpit_elapsed(struct kvm *kvm)
108 ktime_t remaining; 108 ktime_t remaining;
109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 109 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
110 110
111 if (!ps->pit_timer.period) 111 if (!ps->period)
112 return 0; 112 return 0;
113 113
114 /* 114 /*
@@ -120,9 +120,9 @@ static s64 __kpit_elapsed(struct kvm *kvm)
120 * itself with the initial count and continues counting 120 * itself with the initial count and continues counting
121 * from there. 121 * from there.
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->pit_timer.timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->pit_timer.period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->pit_timer.period); 125 elapsed = mod_64(elapsed, ps->period);
126 126
127 return elapsed; 127 return elapsed;
128} 128}
@@ -238,12 +238,12 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
238 int value; 238 int value;
239 239
240 spin_lock(&ps->inject_lock); 240 spin_lock(&ps->inject_lock);
241 value = atomic_dec_return(&ps->pit_timer.pending); 241 value = atomic_dec_return(&ps->pending);
242 if (value < 0) 242 if (value < 0)
243 /* spurious acks can be generated if, for example, the 243 /* spurious acks can be generated if, for example, the
244 * PIC is being reset. Handle it gracefully here 244 * PIC is being reset. Handle it gracefully here
245 */ 245 */
246 atomic_inc(&ps->pit_timer.pending); 246 atomic_inc(&ps->pending);
247 else if (value > 0) 247 else if (value > 0)
248 /* in this case, we had multiple outstanding pit interrupts 248 /* in this case, we had multiple outstanding pit interrupts
249 * that we needed to inject. Reinject 249 * that we needed to inject. Reinject
@@ -261,28 +261,17 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
261 if (!kvm_vcpu_is_bsp(vcpu) || !pit) 261 if (!kvm_vcpu_is_bsp(vcpu) || !pit)
262 return; 262 return;
263 263
264 timer = &pit->pit_state.pit_timer.timer; 264 timer = &pit->pit_state.timer;
265 if (hrtimer_cancel(timer)) 265 if (hrtimer_cancel(timer))
266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 266 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
267} 267}
268 268
269static void destroy_pit_timer(struct kvm_pit *pit) 269static void destroy_pit_timer(struct kvm_pit *pit)
270{ 270{
271 hrtimer_cancel(&pit->pit_state.pit_timer.timer); 271 hrtimer_cancel(&pit->pit_state.timer);
272 flush_kthread_work(&pit->expired); 272 flush_kthread_work(&pit->expired);
273} 273}
274 274
275static bool kpit_is_periodic(struct kvm_timer *ktimer)
276{
277 struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
278 pit_timer);
279 return ps->is_periodic;
280}
281
282static struct kvm_timer_ops kpit_ops = {
283 .is_periodic = kpit_is_periodic,
284};
285
286static void pit_do_work(struct kthread_work *work) 275static void pit_do_work(struct kthread_work *work)
287{ 276{
288 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 277 struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
@@ -322,16 +311,16 @@ static void pit_do_work(struct kthread_work *work)
322 311
323static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 312static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
324{ 313{
325 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 314 struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
326 struct kvm_pit *pt = ktimer->kvm->arch.vpit; 315 struct kvm_pit *pt = ps->kvm->arch.vpit;
327 316
328 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 317 if (ps->reinject || !atomic_read(&ps->pending)) {
329 atomic_inc(&ktimer->pending); 318 atomic_inc(&ps->pending);
330 queue_kthread_work(&pt->worker, &pt->expired); 319 queue_kthread_work(&pt->worker, &pt->expired);
331 } 320 }
332 321
333 if (ktimer->t_ops->is_periodic(ktimer)) { 322 if (ps->is_periodic) {
334 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 323 hrtimer_add_expires_ns(&ps->timer, ps->period);
335 return HRTIMER_RESTART; 324 return HRTIMER_RESTART;
336 } else 325 } else
337 return HRTIMER_NORESTART; 326 return HRTIMER_NORESTART;
@@ -340,7 +329,6 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
340static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) 329static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
341{ 330{
342 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; 331 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
343 struct kvm_timer *pt = &ps->pit_timer;
344 s64 interval; 332 s64 interval;
345 333
346 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) 334 if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
@@ -351,19 +339,18 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
351 pr_debug("create pit timer, interval is %llu nsec\n", interval); 339 pr_debug("create pit timer, interval is %llu nsec\n", interval);
352 340
 353 /* TODO: the new value only takes effect after the timer is retriggered */ 341 /* TODO: the new value only takes effect after the timer is retriggered */
354 hrtimer_cancel(&pt->timer); 342 hrtimer_cancel(&ps->timer);
355 flush_kthread_work(&ps->pit->expired); 343 flush_kthread_work(&ps->pit->expired);
356 pt->period = interval; 344 ps->period = interval;
357 ps->is_periodic = is_period; 345 ps->is_periodic = is_period;
358 346
359 pt->timer.function = pit_timer_fn; 347 ps->timer.function = pit_timer_fn;
360 pt->t_ops = &kpit_ops; 348 ps->kvm = ps->pit->kvm;
361 pt->kvm = ps->pit->kvm;
362 349
363 atomic_set(&pt->pending, 0); 350 atomic_set(&ps->pending, 0);
364 ps->irq_ack = 1; 351 ps->irq_ack = 1;
365 352
366 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 353 hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
367 HRTIMER_MODE_ABS); 354 HRTIMER_MODE_ABS);
368} 355}
369 356
@@ -639,7 +626,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
639 } 626 }
640 mutex_unlock(&pit->pit_state.lock); 627 mutex_unlock(&pit->pit_state.lock);
641 628
642 atomic_set(&pit->pit_state.pit_timer.pending, 0); 629 atomic_set(&pit->pit_state.pending, 0);
643 pit->pit_state.irq_ack = 1; 630 pit->pit_state.irq_ack = 1;
644} 631}
645 632
@@ -648,7 +635,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
648 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); 635 struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
649 636
650 if (!mask) { 637 if (!mask) {
651 atomic_set(&pit->pit_state.pit_timer.pending, 0); 638 atomic_set(&pit->pit_state.pending, 0);
652 pit->pit_state.irq_ack = 1; 639 pit->pit_state.irq_ack = 1;
653 } 640 }
654} 641}
@@ -706,12 +693,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
706 693
707 pit_state = &pit->pit_state; 694 pit_state = &pit->pit_state;
708 pit_state->pit = pit; 695 pit_state->pit = pit;
709 hrtimer_init(&pit_state->pit_timer.timer, 696 hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
710 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
711 pit_state->irq_ack_notifier.gsi = 0; 697 pit_state->irq_ack_notifier.gsi = 0;
712 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; 698 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
713 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); 699 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
714 pit_state->pit_timer.reinject = true; 700 pit_state->reinject = true;
715 mutex_unlock(&pit->pit_state.lock); 701 mutex_unlock(&pit->pit_state.lock);
716 702
717 kvm_pit_reset(pit); 703 kvm_pit_reset(pit);
@@ -761,7 +747,7 @@ void kvm_free_pit(struct kvm *kvm)
761 kvm_unregister_irq_ack_notifier(kvm, 747 kvm_unregister_irq_ack_notifier(kvm,
762 &kvm->arch.vpit->pit_state.irq_ack_notifier); 748 &kvm->arch.vpit->pit_state.irq_ack_notifier);
763 mutex_lock(&kvm->arch.vpit->pit_state.lock); 749 mutex_lock(&kvm->arch.vpit->pit_state.lock);
764 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 750 timer = &kvm->arch.vpit->pit_state.timer;
765 hrtimer_cancel(timer); 751 hrtimer_cancel(timer);
766 flush_kthread_work(&kvm->arch.vpit->expired); 752 flush_kthread_work(&kvm->arch.vpit->expired);
767 kthread_stop(kvm->arch.vpit->worker_task); 753 kthread_stop(kvm->arch.vpit->worker_task);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index fdf40425ea1d..dd1b16b611b0 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -24,8 +24,12 @@ struct kvm_kpit_channel_state {
24struct kvm_kpit_state { 24struct kvm_kpit_state {
25 struct kvm_kpit_channel_state channels[3]; 25 struct kvm_kpit_channel_state channels[3];
26 u32 flags; 26 u32 flags;
27 struct kvm_timer pit_timer;
28 bool is_periodic; 27 bool is_periodic;
28 s64 period; /* unit: ns */
29 struct hrtimer timer;
30 atomic_t pending; /* accumulated triggered timers */
31 bool reinject;
32 struct kvm *kvm;
29 u32 speaker_data_on; 33 u32 speaker_data_on;
30 struct mutex lock; 34 struct mutex lock;
31 struct kvm_pit *pit; 35 struct kvm_pit *pit;
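With struct kvm_timer gone, the PIT hunks above embed the hrtimer, period, pending count and reinject flag directly in kvm_kpit_state, and pit_timer_fn() recovers the owning state from the timer pointer with container_of(). A freestanding sketch of that pattern, using invented stand-in types rather than the kernel ones:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_timer {                     /* stands in for struct hrtimer */
	int armed;
};

struct pit_state {                      /* stands in for kvm_kpit_state */
	long period_ns;
	int is_periodic;
	struct fake_timer timer;        /* embedded, no separate allocation */
};

/* The callback gets only the embedded timer, as hrtimer callbacks do. */
static int timer_fn(struct fake_timer *t)
{
	struct pit_state *ps = container_of(t, struct pit_state, timer);

	printf("period=%ld periodic=%d\n", ps->period_ns, ps->is_periodic);
	return ps->is_periodic;         /* a periodic timer would re-arm here */
}

int main(void)
{
	struct pit_state ps = { .period_ns = 1000000, .is_periodic = 1 };

	timer_fn(&ps.timer);
	return 0;
}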
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 9fc9aa7ac703..848206df0967 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -190,17 +190,17 @@ void kvm_pic_update_irq(struct kvm_pic *s)
190 190
191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) 191int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
192{ 192{
193 int ret = -1; 193 int ret, irq_level;
194
195 BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
194 196
195 pic_lock(s); 197 pic_lock(s);
196 if (irq >= 0 && irq < PIC_NUM_PINS) { 198 irq_level = __kvm_irq_line_state(&s->irq_states[irq],
197 int irq_level = __kvm_irq_line_state(&s->irq_states[irq], 199 irq_source_id, level);
198 irq_source_id, level); 200 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
199 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); 201 pic_update_irq(s);
200 pic_update_irq(s); 202 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
201 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 203 s->pics[irq >> 3].imr, ret == 0);
202 s->pics[irq >> 3].imr, ret == 0);
203 }
204 pic_unlock(s); 204 pic_unlock(s);
205 205
206 return ret; 206 return ret;
@@ -275,23 +275,20 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
275{ 275{
276 int irq, i; 276 int irq, i;
277 struct kvm_vcpu *vcpu; 277 struct kvm_vcpu *vcpu;
278 u8 irr = s->irr, isr = s->imr; 278 u8 edge_irr = s->irr & ~s->elcr;
279 bool found = false; 279 bool found = false;
280 280
281 s->last_irr = 0; 281 s->last_irr = 0;
282 s->irr = 0; 282 s->irr &= s->elcr;
283 s->imr = 0; 283 s->imr = 0;
284 s->isr = 0;
285 s->priority_add = 0; 284 s->priority_add = 0;
286 s->irq_base = 0;
287 s->read_reg_select = 0;
288 s->poll = 0;
289 s->special_mask = 0; 285 s->special_mask = 0;
290 s->init_state = 0; 286 s->read_reg_select = 0;
291 s->auto_eoi = 0; 287 if (!s->init4) {
292 s->rotate_on_auto_eoi = 0; 288 s->special_fully_nested_mode = 0;
293 s->special_fully_nested_mode = 0; 289 s->auto_eoi = 0;
294 s->init4 = 0; 290 }
291 s->init_state = 1;
295 292
296 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) 293 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
297 if (kvm_apic_accept_pic_intr(vcpu)) { 294 if (kvm_apic_accept_pic_intr(vcpu)) {
@@ -304,7 +301,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
304 return; 301 return;
305 302
306 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) 303 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
307 if (irr & (1 << irq) || isr & (1 << irq)) 304 if (edge_irr & (1 << irq))
308 pic_clear_isr(s, irq); 305 pic_clear_isr(s, irq);
309} 306}
310 307
@@ -316,40 +313,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
316 addr &= 1; 313 addr &= 1;
317 if (addr == 0) { 314 if (addr == 0) {
318 if (val & 0x10) { 315 if (val & 0x10) {
319 u8 edge_irr = s->irr & ~s->elcr;
320 int i;
321 bool found = false;
322 struct kvm_vcpu *vcpu;
323
324 s->init4 = val & 1; 316 s->init4 = val & 1;
325 s->last_irr = 0;
326 s->irr &= s->elcr;
327 s->imr = 0;
328 s->priority_add = 0;
329 s->special_mask = 0;
330 s->read_reg_select = 0;
331 if (!s->init4) {
332 s->special_fully_nested_mode = 0;
333 s->auto_eoi = 0;
334 }
335 s->init_state = 1;
336 if (val & 0x02) 317 if (val & 0x02)
337 pr_pic_unimpl("single mode not supported"); 318 pr_pic_unimpl("single mode not supported");
338 if (val & 0x08) 319 if (val & 0x08)
339 pr_pic_unimpl( 320 pr_pic_unimpl(
340 "level sensitive irq not supported"); 321 "level sensitive irq not supported");
341 322 kvm_pic_reset(s);
342 kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
343 if (kvm_apic_accept_pic_intr(vcpu)) {
344 found = true;
345 break;
346 }
347
348
349 if (found)
350 for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
351 if (edge_irr & (1 << irq))
352 pic_clear_isr(s, irq);
353 } else if (val & 0x08) { 323 } else if (val & 0x08) {
354 if (val & 0x04) 324 if (val & 0x04)
355 s->poll = 1; 325 s->poll = 1;
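The i8259.c change folds the ICW1 init path into kvm_pic_reset() and, instead of wiping the whole IRR, keeps requests for level-triggered lines (bits set in ELCR) while discarding edge-triggered ones and clearing their in-service bits. A small self-contained C example of that masking, with made-up sample values:

#include <stdio.h>

int main(void)
{
	unsigned char irr  = 0x15;  /* pending requests before the reset */
	unsigned char elcr = 0x04;  /* bit 2 is level-triggered */

	unsigned char edge_irr = irr & ~elcr;   /* edge requests to discard */

	irr &= elcr;                            /* level requests survive */

	for (int irq = 0; irq < 8; irq++)
		if (edge_irr & (1 << irq))
			printf("clear ISR for edge irq %d\n", irq);

	printf("irr after reset: 0x%02x\n", irr);  /* prints 0x04 */
	return 0;
}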
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2086f2bfba33..2d03568e9498 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -70,7 +70,7 @@ struct kvm_pic {
70 struct kvm_io_device dev_slave; 70 struct kvm_io_device dev_slave;
71 struct kvm_io_device dev_eclr; 71 struct kvm_io_device dev_eclr;
72 void (*ack_notifier)(void *opaque, int irq); 72 void (*ack_notifier)(void *opaque, int irq);
73 unsigned long irq_states[16]; 73 unsigned long irq_states[PIC_NUM_PINS];
74}; 74};
75 75
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 76struct kvm_pic *kvm_create_pic(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 497dbaa366d4..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,18 +0,0 @@
1
2struct kvm_timer {
3 struct hrtimer timer;
4 s64 period; /* unit: ns */
5 u32 timer_mode_mask;
6 u64 tscdeadline;
7 atomic_t pending; /* accumulated triggered timers */
8 bool reinject;
9 struct kvm_timer_ops *t_ops;
10 struct kvm *kvm;
11 struct kvm_vcpu *vcpu;
12};
13
14struct kvm_timer_ops {
15 bool (*is_periodic)(struct kvm_timer *);
16};
17
18enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
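Removing kvm_timer.h also drops the kvm_timer_ops indirection: periodicity used to be answered through a one-entry ops table, and is now a plain field read on the owning state. The sketch below (invented stand-in types, not kernel definitions) contrasts the two shapes:

#include <stdbool.h>
#include <stdio.h>

/* Old shape: a one-entry ops table and an indirect call. */
struct old_timer;
struct old_timer_ops {
	bool (*is_periodic)(struct old_timer *);
};
struct old_timer {
	const struct old_timer_ops *t_ops;
	bool periodic_flag;
};
static bool old_is_periodic(struct old_timer *t) { return t->periodic_flag; }
static const struct old_timer_ops old_ops = { .is_periodic = old_is_periodic };

/* New shape: the owning state answers directly. */
struct new_pit_state {
	bool is_periodic;
};

int main(void)
{
	struct old_timer ot = { .t_ops = &old_ops, .periodic_flag = true };
	struct new_pit_state ps = { .is_periodic = true };

	printf("old: %d\n", ot.t_ops->is_periodic(&ot)); /* indirect call */
	printf("new: %d\n", ps.is_periodic);             /* plain load */
	return 0;
}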
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce878788a39f..c6e6b721b6ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -34,6 +34,7 @@
34#include <asm/current.h> 34#include <asm/current.h>
35#include <asm/apicdef.h> 35#include <asm/apicdef.h>
36#include <linux/atomic.h> 36#include <linux/atomic.h>
37#include <linux/jump_label.h>
37#include "kvm_cache_regs.h" 38#include "kvm_cache_regs.h"
38#include "irq.h" 39#include "irq.h"
39#include "trace.h" 40#include "trace.h"
@@ -65,6 +66,7 @@
65#define APIC_DEST_NOSHORT 0x0 66#define APIC_DEST_NOSHORT 0x0
66#define APIC_DEST_MASK 0x800 67#define APIC_DEST_MASK 0x800
67#define MAX_APIC_VECTOR 256 68#define MAX_APIC_VECTOR 256
69#define APIC_VECTORS_PER_REG 32
68 70
69#define VEC_POS(v) ((v) & (32 - 1)) 71#define VEC_POS(v) ((v) & (32 - 1))
70#define REG_POS(v) (((v) >> 5) << 4) 72#define REG_POS(v) (((v) >> 5) << 4)
@@ -72,11 +74,6 @@
72static unsigned int min_timer_period_us = 500; 74static unsigned int min_timer_period_us = 500;
73module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); 75module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
74 76
75static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
76{
77 return *((u32 *) (apic->regs + reg_off));
78}
79
80static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) 77static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
81{ 78{
82 *((u32 *) (apic->regs + reg_off)) = val; 79 *((u32 *) (apic->regs + reg_off)) = val;
@@ -117,19 +114,23 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
117 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); 114 return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
118} 115}
119 116
120static inline int apic_hw_enabled(struct kvm_lapic *apic) 117struct static_key_deferred apic_hw_disabled __read_mostly;
121{ 118struct static_key_deferred apic_sw_disabled __read_mostly;
122 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
123}
124 119
125static inline int apic_sw_enabled(struct kvm_lapic *apic) 120static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
126{ 121{
127 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; 122 if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
123 if (val & APIC_SPIV_APIC_ENABLED)
124 static_key_slow_dec_deferred(&apic_sw_disabled);
125 else
126 static_key_slow_inc(&apic_sw_disabled.key);
127 }
128 apic_set_reg(apic, APIC_SPIV, val);
128} 129}
129 130
130static inline int apic_enabled(struct kvm_lapic *apic) 131static inline int apic_enabled(struct kvm_lapic *apic)
131{ 132{
132 return apic_sw_enabled(apic) && apic_hw_enabled(apic); 133 return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
133} 134}
134 135
135#define LVT_MASK \ 136#define LVT_MASK \
@@ -139,36 +140,135 @@ static inline int apic_enabled(struct kvm_lapic *apic)
139 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ 140 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
140 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) 141 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
141 142
143static inline int apic_x2apic_mode(struct kvm_lapic *apic)
144{
145 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
146}
147
142static inline int kvm_apic_id(struct kvm_lapic *apic) 148static inline int kvm_apic_id(struct kvm_lapic *apic)
143{ 149{
144 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; 150 return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
151}
152
153static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
154{
155 u16 cid;
156 ldr >>= 32 - map->ldr_bits;
157 cid = (ldr >> map->cid_shift) & map->cid_mask;
158
159 BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
160
161 return cid;
162}
163
164static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
165{
166 ldr >>= (32 - map->ldr_bits);
167 return ldr & map->lid_mask;
168}
169
170static void recalculate_apic_map(struct kvm *kvm)
171{
172 struct kvm_apic_map *new, *old = NULL;
173 struct kvm_vcpu *vcpu;
174 int i;
175
176 new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
177
178 mutex_lock(&kvm->arch.apic_map_lock);
179
180 if (!new)
181 goto out;
182
183 new->ldr_bits = 8;
184 /* flat mode is default */
185 new->cid_shift = 8;
186 new->cid_mask = 0;
187 new->lid_mask = 0xff;
188
189 kvm_for_each_vcpu(i, vcpu, kvm) {
190 struct kvm_lapic *apic = vcpu->arch.apic;
191 u16 cid, lid;
192 u32 ldr;
193
194 if (!kvm_apic_present(vcpu))
195 continue;
196
197 /*
198 * All APICs have to be configured in the same mode by an OS.
 199 * We take advantage of this while building the logical id lookup
 200 * table. After reset APICs are in xapic/flat mode, so if we
 201 * find an apic with a different setting we assume this is the mode
 202 * the OS wants all apics to be in; build the lookup table accordingly.
203 */
204 if (apic_x2apic_mode(apic)) {
205 new->ldr_bits = 32;
206 new->cid_shift = 16;
207 new->cid_mask = new->lid_mask = 0xffff;
208 } else if (kvm_apic_sw_enabled(apic) &&
209 !new->cid_mask /* flat mode */ &&
210 kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
211 new->cid_shift = 4;
212 new->cid_mask = 0xf;
213 new->lid_mask = 0xf;
214 }
215
216 new->phys_map[kvm_apic_id(apic)] = apic;
217
218 ldr = kvm_apic_get_reg(apic, APIC_LDR);
219 cid = apic_cluster_id(new, ldr);
220 lid = apic_logical_id(new, ldr);
221
222 if (lid)
223 new->logical_map[cid][ffs(lid) - 1] = apic;
224 }
225out:
226 old = rcu_dereference_protected(kvm->arch.apic_map,
227 lockdep_is_held(&kvm->arch.apic_map_lock));
228 rcu_assign_pointer(kvm->arch.apic_map, new);
229 mutex_unlock(&kvm->arch.apic_map_lock);
230
231 if (old)
232 kfree_rcu(old, rcu);
233}
234
235static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
236{
237 apic_set_reg(apic, APIC_ID, id << 24);
238 recalculate_apic_map(apic->vcpu->kvm);
239}
240
241static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
242{
243 apic_set_reg(apic, APIC_LDR, id);
244 recalculate_apic_map(apic->vcpu->kvm);
145} 245}
146 246
147static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) 247static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
148{ 248{
149 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); 249 return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
150} 250}
151 251
152static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) 252static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
153{ 253{
154 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; 254 return kvm_apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
155} 255}
156 256
157static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) 257static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
158{ 258{
159 return ((apic_get_reg(apic, APIC_LVTT) & 259 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
160 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); 260 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
161} 261}
162 262
163static inline int apic_lvtt_period(struct kvm_lapic *apic) 263static inline int apic_lvtt_period(struct kvm_lapic *apic)
164{ 264{
165 return ((apic_get_reg(apic, APIC_LVTT) & 265 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
166 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); 266 apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
167} 267}
168 268
169static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) 269static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
170{ 270{
171 return ((apic_get_reg(apic, APIC_LVTT) & 271 return ((kvm_apic_get_reg(apic, APIC_LVTT) &
172 apic->lapic_timer.timer_mode_mask) == 272 apic->lapic_timer.timer_mode_mask) ==
173 APIC_LVT_TIMER_TSCDEADLINE); 273 APIC_LVT_TIMER_TSCDEADLINE);
174} 274}
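recalculate_apic_map() above picks one of three LDR layouts (xAPIC flat, xAPIC cluster, x2APIC) and derives a cluster id and logical id from each APIC's LDR via apic_cluster_id()/apic_logical_id(). The standalone sketch below replays that arithmetic for all three layouts; the layout constants are taken from the hunk, the demo LDR value is invented.

#include <stdio.h>

struct map_layout {
	const char *name;
	unsigned ldr_bits, cid_shift, cid_mask, lid_mask;
};

int main(void)
{
	static const struct map_layout layouts[] = {
		{ "xapic flat",    8,  8, 0x0,    0xff   },
		{ "xapic cluster", 8,  4, 0xf,    0xf    },
		{ "x2apic",        32, 16, 0xffff, 0xffff },
	};
	unsigned int ldr = 0x23000000;  /* example LDR value */

	for (int i = 0; i < 3; i++) {
		const struct map_layout *m = &layouts[i];
		/* in xAPIC modes only the top byte of the LDR is meaningful */
		unsigned int v = ldr >> (32 - m->ldr_bits);
		unsigned int cid = (v >> m->cid_shift) & m->cid_mask;
		unsigned int lid = v & m->lid_mask;

		printf("%-13s cluster=%u logical=0x%x\n", m->name, cid, lid);
	}
	return 0;
}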
@@ -184,7 +284,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
184 struct kvm_cpuid_entry2 *feat; 284 struct kvm_cpuid_entry2 *feat;
185 u32 v = APIC_VERSION; 285 u32 v = APIC_VERSION;
186 286
187 if (!irqchip_in_kernel(vcpu->kvm)) 287 if (!kvm_vcpu_has_lapic(vcpu))
188 return; 288 return;
189 289
190 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); 290 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -193,12 +293,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
193 apic_set_reg(apic, APIC_LVR, v); 293 apic_set_reg(apic, APIC_LVR, v);
194} 294}
195 295
196static inline int apic_x2apic_mode(struct kvm_lapic *apic) 296static const unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
197{
198 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
199}
200
201static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
202 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ 297 LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
203 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 298 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
204 LVT_MASK | APIC_MODE_MASK, /* LVTPC */ 299 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
@@ -208,25 +303,30 @@ static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
208 303
209static int find_highest_vector(void *bitmap) 304static int find_highest_vector(void *bitmap)
210{ 305{
211 u32 *word = bitmap; 306 int vec;
212 int word_offset = MAX_APIC_VECTOR >> 5; 307 u32 *reg;
213 308
214 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) 309 for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
215 continue; 310 vec >= 0; vec -= APIC_VECTORS_PER_REG) {
311 reg = bitmap + REG_POS(vec);
312 if (*reg)
313 return fls(*reg) - 1 + vec;
314 }
216 315
217 if (likely(!word_offset && !word[0])) 316 return -1;
218 return -1;
219 else
220 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
221} 317}
222 318
223static u8 count_vectors(void *bitmap) 319static u8 count_vectors(void *bitmap)
224{ 320{
225 u32 *word = bitmap; 321 int vec;
226 int word_offset; 322 u32 *reg;
227 u8 count = 0; 323 u8 count = 0;
228 for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) 324
229 count += hweight32(word[word_offset << 2]); 325 for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
326 reg = bitmap + REG_POS(vec);
327 count += hweight32(*reg);
328 }
329
230 return count; 330 return count;
231} 331}
232 332
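The rewritten find_highest_vector()/count_vectors() above walk the 256-vector bitmap one 32-bit APIC register at a time (registers are spaced 0x10 bytes apart), taking the highest set bit of the first non-zero register scanned from the top down. A self-contained C sketch of that scan; fls32() here is a portable stand-in for the kernel's fls(), and the layout macros mirror REG_POS/VEC_POS above.

#include <stdio.h>
#include <stdint.h>

#define MAX_VECTOR      256
#define VECTORS_PER_REG 32
#define REG_POS(v)      (((v) >> 5) << 4)   /* registers are 0x10 bytes apart */
#define VEC_POS(v)      ((v) & 31)

static int fls32(uint32_t x)                /* 1-based index of highest set bit */
{
	int r = 0;
	while (x) { x >>= 1; r++; }
	return r;
}

static void set_vector(uint8_t *bitmap, int vec)
{
	bitmap[REG_POS(vec) + VEC_POS(vec) / 8] |= 1u << (VEC_POS(vec) % 8);
}

static int find_highest_vector(const uint8_t *bitmap)
{
	for (int vec = MAX_VECTOR - VECTORS_PER_REG; vec >= 0;
	     vec -= VECTORS_PER_REG) {
		uint32_t reg = 0;
		for (int b = 0; b < 4; b++)     /* assemble one 32-bit register */
			reg |= (uint32_t)bitmap[REG_POS(vec) + b] << (8 * b);
		if (reg)
			return fls32(reg) - 1 + vec;
	}
	return -1;
}

int main(void)
{
	uint8_t irr[(MAX_VECTOR / VECTORS_PER_REG) * 16] = { 0 };

	set_vector(irr, 0x31);                  /* mark vector 0x31 pending */
	printf("highest = 0x%x\n", find_highest_vector(irr));  /* prints 0x31 */
	return 0;
}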
@@ -285,7 +385,6 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
285 385
286int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 386int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
287{ 387{
288 struct kvm_lapic *apic = vcpu->arch.apic;
289 int highest_irr; 388 int highest_irr;
290 389
291 /* This may race with setting of irr in __apic_accept_irq() and 390 /* This may race with setting of irr in __apic_accept_irq() and
@@ -293,9 +392,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
293 * will cause vmexit immediately and the value will be recalculated 392 * will cause vmexit immediately and the value will be recalculated
294 * on the next vmentry. 393 * on the next vmentry.
295 */ 394 */
296 if (!apic) 395 if (!kvm_vcpu_has_lapic(vcpu))
297 return 0; 396 return 0;
298 highest_irr = apic_find_highest_irr(apic); 397 highest_irr = apic_find_highest_irr(vcpu->arch.apic);
299 398
300 return highest_irr; 399 return highest_irr;
301} 400}
@@ -378,8 +477,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
378 u32 tpr, isrv, ppr, old_ppr; 477 u32 tpr, isrv, ppr, old_ppr;
379 int isr; 478 int isr;
380 479
381 old_ppr = apic_get_reg(apic, APIC_PROCPRI); 480 old_ppr = kvm_apic_get_reg(apic, APIC_PROCPRI);
382 tpr = apic_get_reg(apic, APIC_TASKPRI); 481 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI);
383 isr = apic_find_highest_isr(apic); 482 isr = apic_find_highest_isr(apic);
384 isrv = (isr != -1) ? isr : 0; 483 isrv = (isr != -1) ? isr : 0;
385 484
@@ -415,13 +514,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
415 u32 logical_id; 514 u32 logical_id;
416 515
417 if (apic_x2apic_mode(apic)) { 516 if (apic_x2apic_mode(apic)) {
418 logical_id = apic_get_reg(apic, APIC_LDR); 517 logical_id = kvm_apic_get_reg(apic, APIC_LDR);
419 return logical_id & mda; 518 return logical_id & mda;
420 } 519 }
421 520
422 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); 521 logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
423 522
424 switch (apic_get_reg(apic, APIC_DFR)) { 523 switch (kvm_apic_get_reg(apic, APIC_DFR)) {
425 case APIC_DFR_FLAT: 524 case APIC_DFR_FLAT:
426 if (logical_id & mda) 525 if (logical_id & mda)
427 result = 1; 526 result = 1;
@@ -433,7 +532,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
433 break; 532 break;
434 default: 533 default:
435 apic_debug("Bad DFR vcpu %d: %08x\n", 534 apic_debug("Bad DFR vcpu %d: %08x\n",
436 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); 535 apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
437 break; 536 break;
438 } 537 }
439 538
@@ -478,6 +577,72 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
478 return result; 577 return result;
479} 578}
480 579
580bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
581 struct kvm_lapic_irq *irq, int *r)
582{
583 struct kvm_apic_map *map;
584 unsigned long bitmap = 1;
585 struct kvm_lapic **dst;
586 int i;
587 bool ret = false;
588
589 *r = -1;
590
591 if (irq->shorthand == APIC_DEST_SELF) {
592 *r = kvm_apic_set_irq(src->vcpu, irq);
593 return true;
594 }
595
596 if (irq->shorthand)
597 return false;
598
599 rcu_read_lock();
600 map = rcu_dereference(kvm->arch.apic_map);
601
602 if (!map)
603 goto out;
604
605 if (irq->dest_mode == 0) { /* physical mode */
606 if (irq->delivery_mode == APIC_DM_LOWEST ||
607 irq->dest_id == 0xff)
608 goto out;
609 dst = &map->phys_map[irq->dest_id & 0xff];
610 } else {
611 u32 mda = irq->dest_id << (32 - map->ldr_bits);
612
613 dst = map->logical_map[apic_cluster_id(map, mda)];
614
615 bitmap = apic_logical_id(map, mda);
616
617 if (irq->delivery_mode == APIC_DM_LOWEST) {
618 int l = -1;
619 for_each_set_bit(i, &bitmap, 16) {
620 if (!dst[i])
621 continue;
622 if (l < 0)
623 l = i;
624 else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
625 l = i;
626 }
627
628 bitmap = (l >= 0) ? 1 << l : 0;
629 }
630 }
631
632 for_each_set_bit(i, &bitmap, 16) {
633 if (!dst[i])
634 continue;
635 if (*r < 0)
636 *r = 0;
637 *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
638 }
639
640 ret = true;
641out:
642 rcu_read_unlock();
643 return ret;
644}
645
481/* 646/*
482 * Add a pending IRQ into lapic. 647 * Add a pending IRQ into lapic.
483 * Return 1 if successfully added and 0 if discarded. 648 * Return 1 if successfully added and 0 if discarded.
@@ -591,7 +756,7 @@ static int apic_set_eoi(struct kvm_lapic *apic)
591 apic_clear_isr(vector, apic); 756 apic_clear_isr(vector, apic);
592 apic_update_ppr(apic); 757 apic_update_ppr(apic);
593 758
594 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && 759 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
595 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { 760 kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
596 int trigger_mode; 761 int trigger_mode;
597 if (apic_test_vector(vector, apic->regs + APIC_TMR)) 762 if (apic_test_vector(vector, apic->regs + APIC_TMR))
@@ -606,8 +771,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
606 771
607static void apic_send_ipi(struct kvm_lapic *apic) 772static void apic_send_ipi(struct kvm_lapic *apic)
608{ 773{
609 u32 icr_low = apic_get_reg(apic, APIC_ICR); 774 u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
610 u32 icr_high = apic_get_reg(apic, APIC_ICR2); 775 u32 icr_high = kvm_apic_get_reg(apic, APIC_ICR2);
611 struct kvm_lapic_irq irq; 776 struct kvm_lapic_irq irq;
612 777
613 irq.vector = icr_low & APIC_VECTOR_MASK; 778 irq.vector = icr_low & APIC_VECTOR_MASK;
@@ -642,7 +807,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
642 ASSERT(apic != NULL); 807 ASSERT(apic != NULL);
643 808
644 /* if initial count is 0, current count should also be 0 */ 809 /* if initial count is 0, current count should also be 0 */
645 if (apic_get_reg(apic, APIC_TMICT) == 0) 810 if (kvm_apic_get_reg(apic, APIC_TMICT) == 0)
646 return 0; 811 return 0;
647 812
648 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); 813 remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
@@ -696,13 +861,15 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
696 861
697 val = apic_get_tmcct(apic); 862 val = apic_get_tmcct(apic);
698 break; 863 break;
699 864 case APIC_PROCPRI:
865 apic_update_ppr(apic);
866 val = kvm_apic_get_reg(apic, offset);
867 break;
700 case APIC_TASKPRI: 868 case APIC_TASKPRI:
701 report_tpr_access(apic, false); 869 report_tpr_access(apic, false);
702 /* fall thru */ 870 /* fall thru */
703 default: 871 default:
704 apic_update_ppr(apic); 872 val = kvm_apic_get_reg(apic, offset);
705 val = apic_get_reg(apic, offset);
706 break; 873 break;
707 } 874 }
708 875
@@ -719,7 +886,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
719{ 886{
720 unsigned char alignment = offset & 0xf; 887 unsigned char alignment = offset & 0xf;
721 u32 result; 888 u32 result;
722 /* this bitmask has a bit cleared for each reserver register */ 889 /* this bitmask has a bit cleared for each reserved register */
723 static const u64 rmask = 0x43ff01ffffffe70cULL; 890 static const u64 rmask = 0x43ff01ffffffe70cULL;
724 891
725 if ((alignment + len) > 4) { 892 if ((alignment + len) > 4) {
@@ -754,7 +921,7 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
754 921
755static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) 922static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
756{ 923{
757 return apic_hw_enabled(apic) && 924 return kvm_apic_hw_enabled(apic) &&
758 addr >= apic->base_address && 925 addr >= apic->base_address &&
759 addr < apic->base_address + LAPIC_MMIO_LENGTH; 926 addr < apic->base_address + LAPIC_MMIO_LENGTH;
760} 927}
@@ -777,7 +944,7 @@ static void update_divide_count(struct kvm_lapic *apic)
777{ 944{
778 u32 tmp1, tmp2, tdcr; 945 u32 tmp1, tmp2, tdcr;
779 946
780 tdcr = apic_get_reg(apic, APIC_TDCR); 947 tdcr = kvm_apic_get_reg(apic, APIC_TDCR);
781 tmp1 = tdcr & 0xf; 948 tmp1 = tdcr & 0xf;
782 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; 949 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
783 apic->divide_count = 0x1 << (tmp2 & 0x7); 950 apic->divide_count = 0x1 << (tmp2 & 0x7);
@@ -792,9 +959,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
792 atomic_set(&apic->lapic_timer.pending, 0); 959 atomic_set(&apic->lapic_timer.pending, 0);
793 960
794 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { 961 if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
795 /* lapic timer in oneshot or peroidic mode */ 962 /* lapic timer in oneshot or periodic mode */
796 now = apic->lapic_timer.timer.base->get_time(); 963 now = apic->lapic_timer.timer.base->get_time();
797 apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) 964 apic->lapic_timer.period = (u64)kvm_apic_get_reg(apic, APIC_TMICT)
798 * APIC_BUS_CYCLE_NS * apic->divide_count; 965 * APIC_BUS_CYCLE_NS * apic->divide_count;
799 966
800 if (!apic->lapic_timer.period) 967 if (!apic->lapic_timer.period)
@@ -826,7 +993,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
826 "timer initial count 0x%x, period %lldns, " 993 "timer initial count 0x%x, period %lldns, "
827 "expire @ 0x%016" PRIx64 ".\n", __func__, 994 "expire @ 0x%016" PRIx64 ".\n", __func__,
828 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 995 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
829 apic_get_reg(apic, APIC_TMICT), 996 kvm_apic_get_reg(apic, APIC_TMICT),
830 apic->lapic_timer.period, 997 apic->lapic_timer.period,
831 ktime_to_ns(ktime_add_ns(now, 998 ktime_to_ns(ktime_add_ns(now,
832 apic->lapic_timer.period))); 999 apic->lapic_timer.period)));
@@ -858,7 +1025,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
858 1025
859static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) 1026static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
860{ 1027{
861 int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); 1028 int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0));
862 1029
863 if (apic_lvt_nmi_mode(lvt0_val)) { 1030 if (apic_lvt_nmi_mode(lvt0_val)) {
864 if (!nmi_wd_enabled) { 1031 if (!nmi_wd_enabled) {
@@ -879,7 +1046,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
879 switch (reg) { 1046 switch (reg) {
880 case APIC_ID: /* Local APIC ID */ 1047 case APIC_ID: /* Local APIC ID */
881 if (!apic_x2apic_mode(apic)) 1048 if (!apic_x2apic_mode(apic))
882 apic_set_reg(apic, APIC_ID, val); 1049 kvm_apic_set_id(apic, val >> 24);
883 else 1050 else
884 ret = 1; 1051 ret = 1;
885 break; 1052 break;
@@ -895,29 +1062,30 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
895 1062
896 case APIC_LDR: 1063 case APIC_LDR:
897 if (!apic_x2apic_mode(apic)) 1064 if (!apic_x2apic_mode(apic))
898 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); 1065 kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
899 else 1066 else
900 ret = 1; 1067 ret = 1;
901 break; 1068 break;
902 1069
903 case APIC_DFR: 1070 case APIC_DFR:
904 if (!apic_x2apic_mode(apic)) 1071 if (!apic_x2apic_mode(apic)) {
905 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); 1072 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
906 else 1073 recalculate_apic_map(apic->vcpu->kvm);
1074 } else
907 ret = 1; 1075 ret = 1;
908 break; 1076 break;
909 1077
910 case APIC_SPIV: { 1078 case APIC_SPIV: {
911 u32 mask = 0x3ff; 1079 u32 mask = 0x3ff;
912 if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) 1080 if (kvm_apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
913 mask |= APIC_SPIV_DIRECTED_EOI; 1081 mask |= APIC_SPIV_DIRECTED_EOI;
914 apic_set_reg(apic, APIC_SPIV, val & mask); 1082 apic_set_spiv(apic, val & mask);
915 if (!(val & APIC_SPIV_APIC_ENABLED)) { 1083 if (!(val & APIC_SPIV_APIC_ENABLED)) {
916 int i; 1084 int i;
917 u32 lvt_val; 1085 u32 lvt_val;
918 1086
919 for (i = 0; i < APIC_LVT_NUM; i++) { 1087 for (i = 0; i < APIC_LVT_NUM; i++) {
920 lvt_val = apic_get_reg(apic, 1088 lvt_val = kvm_apic_get_reg(apic,
921 APIC_LVTT + 0x10 * i); 1089 APIC_LVTT + 0x10 * i);
922 apic_set_reg(apic, APIC_LVTT + 0x10 * i, 1090 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
923 lvt_val | APIC_LVT_MASKED); 1091 lvt_val | APIC_LVT_MASKED);
@@ -946,7 +1114,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
946 case APIC_LVT1: 1114 case APIC_LVT1:
947 case APIC_LVTERR: 1115 case APIC_LVTERR:
948 /* TODO: Check vector */ 1116 /* TODO: Check vector */
949 if (!apic_sw_enabled(apic)) 1117 if (!kvm_apic_sw_enabled(apic))
950 val |= APIC_LVT_MASKED; 1118 val |= APIC_LVT_MASKED;
951 1119
952 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; 1120 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
@@ -955,12 +1123,12 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
955 break; 1123 break;
956 1124
957 case APIC_LVTT: 1125 case APIC_LVTT:
958 if ((apic_get_reg(apic, APIC_LVTT) & 1126 if ((kvm_apic_get_reg(apic, APIC_LVTT) &
959 apic->lapic_timer.timer_mode_mask) != 1127 apic->lapic_timer.timer_mode_mask) !=
960 (val & apic->lapic_timer.timer_mode_mask)) 1128 (val & apic->lapic_timer.timer_mode_mask))
961 hrtimer_cancel(&apic->lapic_timer.timer); 1129 hrtimer_cancel(&apic->lapic_timer.timer);
962 1130
963 if (!apic_sw_enabled(apic)) 1131 if (!kvm_apic_sw_enabled(apic))
964 val |= APIC_LVT_MASKED; 1132 val |= APIC_LVT_MASKED;
965 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); 1133 val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
966 apic_set_reg(apic, APIC_LVTT, val); 1134 apic_set_reg(apic, APIC_LVTT, val);
@@ -1039,24 +1207,30 @@ static int apic_mmio_write(struct kvm_io_device *this,
1039 1207
1040void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) 1208void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
1041{ 1209{
1042 struct kvm_lapic *apic = vcpu->arch.apic; 1210 if (kvm_vcpu_has_lapic(vcpu))
1043
1044 if (apic)
1045 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); 1211 apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
1046} 1212}
1047EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 1213EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
1048 1214
1049void kvm_free_lapic(struct kvm_vcpu *vcpu) 1215void kvm_free_lapic(struct kvm_vcpu *vcpu)
1050{ 1216{
1217 struct kvm_lapic *apic = vcpu->arch.apic;
1218
1051 if (!vcpu->arch.apic) 1219 if (!vcpu->arch.apic)
1052 return; 1220 return;
1053 1221
1054 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 1222 hrtimer_cancel(&apic->lapic_timer.timer);
1223
1224 if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
1225 static_key_slow_dec_deferred(&apic_hw_disabled);
1226
1227 if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
1228 static_key_slow_dec_deferred(&apic_sw_disabled);
1055 1229
1056 if (vcpu->arch.apic->regs) 1230 if (apic->regs)
1057 free_page((unsigned long)vcpu->arch.apic->regs); 1231 free_page((unsigned long)apic->regs);
1058 1232
1059 kfree(vcpu->arch.apic); 1233 kfree(apic);
1060} 1234}
1061 1235
1062/* 1236/*
@@ -1068,10 +1242,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
1068u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) 1242u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1069{ 1243{
1070 struct kvm_lapic *apic = vcpu->arch.apic; 1244 struct kvm_lapic *apic = vcpu->arch.apic;
1071 if (!apic)
1072 return 0;
1073 1245
1074 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1246 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1247 apic_lvtt_period(apic))
1075 return 0; 1248 return 0;
1076 1249
1077 return apic->lapic_timer.tscdeadline; 1250 return apic->lapic_timer.tscdeadline;
@@ -1080,10 +1253,9 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
1080void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) 1253void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
1081{ 1254{
1082 struct kvm_lapic *apic = vcpu->arch.apic; 1255 struct kvm_lapic *apic = vcpu->arch.apic;
1083 if (!apic)
1084 return;
1085 1256
1086 if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) 1257 if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
1258 apic_lvtt_period(apic))
1087 return; 1259 return;
1088 1260
1089 hrtimer_cancel(&apic->lapic_timer.timer); 1261 hrtimer_cancel(&apic->lapic_timer.timer);
@@ -1095,20 +1267,21 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
1095{ 1267{
1096 struct kvm_lapic *apic = vcpu->arch.apic; 1268 struct kvm_lapic *apic = vcpu->arch.apic;
1097 1269
1098 if (!apic) 1270 if (!kvm_vcpu_has_lapic(vcpu))
1099 return; 1271 return;
1272
1100 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 1273 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
1101 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 1274 | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
1102} 1275}
1103 1276
1104u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 1277u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
1105{ 1278{
1106 struct kvm_lapic *apic = vcpu->arch.apic;
1107 u64 tpr; 1279 u64 tpr;
1108 1280
1109 if (!apic) 1281 if (!kvm_vcpu_has_lapic(vcpu))
1110 return 0; 1282 return 0;
1111 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); 1283
1284 tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
1112 1285
1113 return (tpr & 0xf0) >> 4; 1286 return (tpr & 0xf0) >> 4;
1114} 1287}
@@ -1123,6 +1296,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1123 return; 1296 return;
1124 } 1297 }
1125 1298
1299 /* update jump label if enable bit changes */
1300 if ((vcpu->arch.apic_base ^ value) & MSR_IA32_APICBASE_ENABLE) {
1301 if (value & MSR_IA32_APICBASE_ENABLE)
1302 static_key_slow_dec_deferred(&apic_hw_disabled);
1303 else
1304 static_key_slow_inc(&apic_hw_disabled.key);
1305 recalculate_apic_map(vcpu->kvm);
1306 }
1307
1126 if (!kvm_vcpu_is_bsp(apic->vcpu)) 1308 if (!kvm_vcpu_is_bsp(apic->vcpu))
1127 value &= ~MSR_IA32_APICBASE_BSP; 1309 value &= ~MSR_IA32_APICBASE_BSP;
1128 1310
@@ -1130,7 +1312,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
1130 if (apic_x2apic_mode(apic)) { 1312 if (apic_x2apic_mode(apic)) {
1131 u32 id = kvm_apic_id(apic); 1313 u32 id = kvm_apic_id(apic);
1132 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); 1314 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
1133 apic_set_reg(apic, APIC_LDR, ldr); 1315 kvm_apic_set_ldr(apic, ldr);
1134 } 1316 }
1135 apic->base_address = apic->vcpu->arch.apic_base & 1317 apic->base_address = apic->vcpu->arch.apic_base &
1136 MSR_IA32_APICBASE_BASE; 1318 MSR_IA32_APICBASE_BASE;
@@ -1155,7 +1337,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1155 /* Stop the timer in case it's a reset to an active apic */ 1337 /* Stop the timer in case it's a reset to an active apic */
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1338 hrtimer_cancel(&apic->lapic_timer.timer);
1157 1339
1158 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 1340 kvm_apic_set_id(apic, vcpu->vcpu_id);
1159 kvm_apic_set_version(apic->vcpu); 1341 kvm_apic_set_version(apic->vcpu);
1160 1342
1161 for (i = 0; i < APIC_LVT_NUM; i++) 1343 for (i = 0; i < APIC_LVT_NUM; i++)
@@ -1164,9 +1346,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1164 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); 1346 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
1165 1347
1166 apic_set_reg(apic, APIC_DFR, 0xffffffffU); 1348 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
1167 apic_set_reg(apic, APIC_SPIV, 0xff); 1349 apic_set_spiv(apic, 0xff);
1168 apic_set_reg(apic, APIC_TASKPRI, 0); 1350 apic_set_reg(apic, APIC_TASKPRI, 0);
1169 apic_set_reg(apic, APIC_LDR, 0); 1351 kvm_apic_set_ldr(apic, 0);
1170 apic_set_reg(apic, APIC_ESR, 0); 1352 apic_set_reg(apic, APIC_ESR, 0);
1171 apic_set_reg(apic, APIC_ICR, 0); 1353 apic_set_reg(apic, APIC_ICR, 0);
1172 apic_set_reg(apic, APIC_ICR2, 0); 1354 apic_set_reg(apic, APIC_ICR2, 0);
@@ -1183,7 +1365,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1183 update_divide_count(apic); 1365 update_divide_count(apic);
1184 atomic_set(&apic->lapic_timer.pending, 0); 1366 atomic_set(&apic->lapic_timer.pending, 0);
1185 if (kvm_vcpu_is_bsp(vcpu)) 1367 if (kvm_vcpu_is_bsp(vcpu))
1186 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 1368 kvm_lapic_set_base(vcpu,
1369 vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
1187 vcpu->arch.pv_eoi.msr_val = 0; 1370 vcpu->arch.pv_eoi.msr_val = 0;
1188 apic_update_ppr(apic); 1371 apic_update_ppr(apic);
1189 1372
@@ -1196,45 +1379,34 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
1196 vcpu->arch.apic_base, apic->base_address); 1379 vcpu->arch.apic_base, apic->base_address);
1197} 1380}
1198 1381
1199bool kvm_apic_present(struct kvm_vcpu *vcpu)
1200{
1201 return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
1202}
1203
1204int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
1205{
1206 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
1207}
1208
1209/* 1382/*
1210 *---------------------------------------------------------------------- 1383 *----------------------------------------------------------------------
1211 * timer interface 1384 * timer interface
1212 *---------------------------------------------------------------------- 1385 *----------------------------------------------------------------------
1213 */ 1386 */
1214 1387
1215static bool lapic_is_periodic(struct kvm_timer *ktimer) 1388static bool lapic_is_periodic(struct kvm_lapic *apic)
1216{ 1389{
1217 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
1218 lapic_timer);
1219 return apic_lvtt_period(apic); 1390 return apic_lvtt_period(apic);
1220} 1391}
1221 1392
1222int apic_has_pending_timer(struct kvm_vcpu *vcpu) 1393int apic_has_pending_timer(struct kvm_vcpu *vcpu)
1223{ 1394{
1224 struct kvm_lapic *lapic = vcpu->arch.apic; 1395 struct kvm_lapic *apic = vcpu->arch.apic;
1225 1396
1226 if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) 1397 if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
1227 return atomic_read(&lapic->lapic_timer.pending); 1398 apic_lvt_enabled(apic, APIC_LVTT))
1399 return atomic_read(&apic->lapic_timer.pending);
1228 1400
1229 return 0; 1401 return 0;
1230} 1402}
1231 1403
1232int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) 1404int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
1233{ 1405{
1234 u32 reg = apic_get_reg(apic, lvt_type); 1406 u32 reg = kvm_apic_get_reg(apic, lvt_type);
1235 int vector, mode, trig_mode; 1407 int vector, mode, trig_mode;
1236 1408
1237 if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { 1409 if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
1238 vector = reg & APIC_VECTOR_MASK; 1410 vector = reg & APIC_VECTOR_MASK;
1239 mode = reg & APIC_MODE_MASK; 1411 mode = reg & APIC_MODE_MASK;
1240 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; 1412 trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -1251,15 +1423,40 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
1251 kvm_apic_local_deliver(apic, APIC_LVT0); 1423 kvm_apic_local_deliver(apic, APIC_LVT0);
1252} 1424}
1253 1425
1254static struct kvm_timer_ops lapic_timer_ops = {
1255 .is_periodic = lapic_is_periodic,
1256};
1257
1258static const struct kvm_io_device_ops apic_mmio_ops = { 1426static const struct kvm_io_device_ops apic_mmio_ops = {
1259 .read = apic_mmio_read, 1427 .read = apic_mmio_read,
1260 .write = apic_mmio_write, 1428 .write = apic_mmio_write,
1261}; 1429};
1262 1430
1431static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
1432{
1433 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
1434 struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
1435 struct kvm_vcpu *vcpu = apic->vcpu;
1436 wait_queue_head_t *q = &vcpu->wq;
1437
1438 /*
1439 * There is a race window between reading and incrementing, but we do
1440 * not care about potentially losing timer events in the !reinject
1441 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
1442 * in vcpu_enter_guest.
1443 */
1444 if (!atomic_read(&ktimer->pending)) {
1445 atomic_inc(&ktimer->pending);
1446 /* FIXME: this code should not know anything about vcpus */
1447 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1448 }
1449
1450 if (waitqueue_active(q))
1451 wake_up_interruptible(q);
1452
1453 if (lapic_is_periodic(apic)) {
1454 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
1455 return HRTIMER_RESTART;
1456 } else
1457 return HRTIMER_NORESTART;
1458}
1459
1263int kvm_create_lapic(struct kvm_vcpu *vcpu) 1460int kvm_create_lapic(struct kvm_vcpu *vcpu)
1264{ 1461{
1265 struct kvm_lapic *apic; 1462 struct kvm_lapic *apic;
@@ -1283,14 +1480,17 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1283 1480
1284 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1481 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
1285 HRTIMER_MODE_ABS); 1482 HRTIMER_MODE_ABS);
1286 apic->lapic_timer.timer.function = kvm_timer_fn; 1483 apic->lapic_timer.timer.function = apic_timer_fn;
1287 apic->lapic_timer.t_ops = &lapic_timer_ops;
1288 apic->lapic_timer.kvm = vcpu->kvm;
1289 apic->lapic_timer.vcpu = vcpu;
1290 1484
1291 apic->base_address = APIC_DEFAULT_PHYS_BASE; 1485 /*
1292 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 1486 * APIC is created enabled. This will prevent kvm_lapic_set_base from
 1487 * thinking that the APIC state has changed.
1488 */
1489 vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
1490 kvm_lapic_set_base(vcpu,
1491 APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
1293 1492
1493 static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
1294 kvm_lapic_reset(vcpu); 1494 kvm_lapic_reset(vcpu);
1295 kvm_iodevice_init(&apic->dev, &apic_mmio_ops); 1495 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
1296 1496
@@ -1306,23 +1506,23 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1306 struct kvm_lapic *apic = vcpu->arch.apic; 1506 struct kvm_lapic *apic = vcpu->arch.apic;
1307 int highest_irr; 1507 int highest_irr;
1308 1508
1309 if (!apic || !apic_enabled(apic)) 1509 if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
1310 return -1; 1510 return -1;
1311 1511
1312 apic_update_ppr(apic); 1512 apic_update_ppr(apic);
1313 highest_irr = apic_find_highest_irr(apic); 1513 highest_irr = apic_find_highest_irr(apic);
1314 if ((highest_irr == -1) || 1514 if ((highest_irr == -1) ||
1315 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) 1515 ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
1316 return -1; 1516 return -1;
1317 return highest_irr; 1517 return highest_irr;
1318} 1518}
1319 1519
1320int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1520int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1321{ 1521{
1322 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1522 u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1323 int r = 0; 1523 int r = 0;
1324 1524
1325 if (!apic_hw_enabled(vcpu->arch.apic)) 1525 if (!kvm_apic_hw_enabled(vcpu->arch.apic))
1326 r = 1; 1526 r = 1;
1327 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1527 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1328 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1528 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1334,7 +1534,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1334{ 1534{
1335 struct kvm_lapic *apic = vcpu->arch.apic; 1535 struct kvm_lapic *apic = vcpu->arch.apic;
1336 1536
1337 if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { 1537 if (!kvm_vcpu_has_lapic(vcpu))
1538 return;
1539
1540 if (atomic_read(&apic->lapic_timer.pending) > 0) {
1338 if (kvm_apic_local_deliver(apic, APIC_LVTT)) 1541 if (kvm_apic_local_deliver(apic, APIC_LVTT))
1339 atomic_dec(&apic->lapic_timer.pending); 1542 atomic_dec(&apic->lapic_timer.pending);
1340 } 1543 }
@@ -1354,12 +1557,17 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1354 return vector; 1557 return vector;
1355} 1558}
1356 1559
1357void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1560void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
1561 struct kvm_lapic_state *s)
1358{ 1562{
1359 struct kvm_lapic *apic = vcpu->arch.apic; 1563 struct kvm_lapic *apic = vcpu->arch.apic;
1360 1564
1361 apic->base_address = vcpu->arch.apic_base & 1565 kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
1362 MSR_IA32_APICBASE_BASE; 1566 /* set SPIV separately to get count of SW disabled APICs right */
1567 apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
1568 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1569 /* call kvm_apic_set_id() to put apic into apic_map */
1570 kvm_apic_set_id(apic, kvm_apic_id(apic));
1363 kvm_apic_set_version(vcpu); 1571 kvm_apic_set_version(vcpu);
1364 1572
1365 apic_update_ppr(apic); 1573 apic_update_ppr(apic);
@@ -1374,13 +1582,12 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1374 1582
1375void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1583void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1376{ 1584{
1377 struct kvm_lapic *apic = vcpu->arch.apic;
1378 struct hrtimer *timer; 1585 struct hrtimer *timer;
1379 1586
1380 if (!apic) 1587 if (!kvm_vcpu_has_lapic(vcpu))
1381 return; 1588 return;
1382 1589
1383 timer = &apic->lapic_timer.timer; 1590 timer = &vcpu->arch.apic->lapic_timer.timer;
1384 if (hrtimer_cancel(timer)) 1591 if (hrtimer_cancel(timer))
1385 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 1592 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1386} 1593}
@@ -1478,7 +1685,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1478 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 1685 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
1479 return; 1686 return;
1480 1687
1481 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; 1688 tpr = kvm_apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1482 max_irr = apic_find_highest_irr(apic); 1689 max_irr = apic_find_highest_irr(apic);
1483 if (max_irr < 0) 1690 if (max_irr < 0)
1484 max_irr = 0; 1691 max_irr = 0;
@@ -1537,7 +1744,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1537{ 1744{
1538 struct kvm_lapic *apic = vcpu->arch.apic; 1745 struct kvm_lapic *apic = vcpu->arch.apic;
1539 1746
1540 if (!irqchip_in_kernel(vcpu->kvm)) 1747 if (!kvm_vcpu_has_lapic(vcpu))
1541 return 1; 1748 return 1;
1542 1749
1543 /* if this is ICR write vector before command */ 1750 /* if this is ICR write vector before command */
@@ -1551,7 +1758,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1551 struct kvm_lapic *apic = vcpu->arch.apic; 1758 struct kvm_lapic *apic = vcpu->arch.apic;
1552 u32 low, high = 0; 1759 u32 low, high = 0;
1553 1760
1554 if (!irqchip_in_kernel(vcpu->kvm)) 1761 if (!kvm_vcpu_has_lapic(vcpu))
1555 return 1; 1762 return 1;
1556 1763
1557 if (apic_reg_read(apic, reg, 4, &low)) 1764 if (apic_reg_read(apic, reg, 4, &low))
@@ -1576,3 +1783,10 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
1576 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 1783 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
1577 addr); 1784 addr);
1578} 1785}
1786
1787void kvm_lapic_init(void)
1788{
1789 /* do not patch jump label more than once per second */
1790 jump_label_rate_limit(&apic_hw_disabled, HZ);
1791 jump_label_rate_limit(&apic_sw_disabled, HZ);
1792}
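
The lapic.c side of this series replaces per-call NULL and enable checks with two deferred static keys, apic_hw_disabled and apic_sw_disabled: they are bumped when an APIC becomes disabled, dropped with a deferred decrement when it is re-enabled or freed, and kvm_lapic_init() rate-limits the resulting jump-label patching. A minimal kernel-context sketch of that pattern under those assumptions; the example_* names are hypothetical and not part of KVM:

#include <linux/jump_label.h>
#include <linux/jiffies.h>      /* for HZ */

static struct static_key_deferred example_disabled;

static inline bool example_enabled(void)
{
        /* No-op jump while the count is zero; the branch is patched in
         * only while at least one user has disabled the feature. */
        if (static_key_false(&example_disabled.key))
                return false;
        return true;
}

static void example_disable(void)
{
        static_key_slow_inc(&example_disabled.key);
}

static void example_enable(void)
{
        /* Deferred decrement; batched by the timeout configured below. */
        static_key_slow_dec_deferred(&example_disabled);
}

static void example_init(void)
{
        /* Do not patch the jump label more than once per second. */
        jump_label_rate_limit(&example_disabled, HZ);
}
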
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4af5405ae1e2..e5ebf9f3571f 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,10 +2,17 @@
2#define __KVM_X86_LAPIC_H 2#define __KVM_X86_LAPIC_H
3 3
4#include "iodev.h" 4#include "iodev.h"
5#include "kvm_timer.h"
6 5
7#include <linux/kvm_host.h> 6#include <linux/kvm_host.h>
8 7
8struct kvm_timer {
9 struct hrtimer timer;
10 s64 period; /* unit: ns */
11 u32 timer_mode_mask;
12 u64 tscdeadline;
13 atomic_t pending; /* accumulated triggered timers */
14};
15
9struct kvm_lapic { 16struct kvm_lapic {
10 unsigned long base_address; 17 unsigned long base_address;
11 struct kvm_io_device dev; 18 struct kvm_io_device dev;
@@ -45,11 +52,13 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
45int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 52int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
46int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); 53int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
47 54
55bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
56 struct kvm_lapic_irq *irq, int *r);
57
48u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); 58u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
49void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); 59void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
50void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); 60void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
51int kvm_lapic_enabled(struct kvm_vcpu *vcpu); 61 struct kvm_lapic_state *s);
52bool kvm_apic_present(struct kvm_vcpu *vcpu);
53int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); 62int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
54 63
55u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); 64u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -71,4 +80,48 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
71} 80}
72 81
73int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); 82int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
83void kvm_lapic_init(void);
84
85static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
86{
87 return *((u32 *) (apic->regs + reg_off));
88}
89
90extern struct static_key kvm_no_apic_vcpu;
91
92static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
93{
94 if (static_key_false(&kvm_no_apic_vcpu))
95 return vcpu->arch.apic;
96 return true;
97}
98
99extern struct static_key_deferred apic_hw_disabled;
100
101static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
102{
103 if (static_key_false(&apic_hw_disabled.key))
104 return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
105 return MSR_IA32_APICBASE_ENABLE;
106}
107
108extern struct static_key_deferred apic_sw_disabled;
109
110static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
111{
112 if (static_key_false(&apic_sw_disabled.key))
113 return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
114 return APIC_SPIV_APIC_ENABLED;
115}
116
117static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
118{
119 return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
120}
121
122static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
123{
124 return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
125}
126
74#endif 127#endif
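
The header now carries the fast-path helpers that make those keys usable from callers: kvm_vcpu_has_lapic() folds to true unless some vcpu was created without an in-kernel APIC, and kvm_apic_hw_enabled() only dereferences apic_base while at least one APIC is hardware-disabled. A sketch of the intended caller pattern (hypothetical function, assumes the usual lapic.h/apicdef.h context):

static u8 example_read_tpr(struct kvm_vcpu *vcpu)
{
        /* Usually compiled to a straight fall-through, no NULL test. */
        if (!kvm_vcpu_has_lapic(vcpu))
                return 0;

        /* kvm_apic_get_reg() is a plain load from the APIC register page. */
        return kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI) & 0xff;
}
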
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d273ea8..d289fee1ffb8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -556,6 +556,14 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
556 return 0; 556 return 0;
557 557
558 pfn = spte_to_pfn(old_spte); 558 pfn = spte_to_pfn(old_spte);
559
560 /*
 561 * KVM does not hold a refcount on the page used by the
 562 * KVM MMU, so before reclaiming the page we must
 563 * unmap it from the MMU first.
564 */
565 WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
566
559 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 567 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
560 kvm_set_pfn_accessed(pfn); 568 kvm_set_pfn_accessed(pfn);
561 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 569 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
@@ -960,13 +968,10 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
960static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, 968static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
961 struct kvm_memory_slot *slot) 969 struct kvm_memory_slot *slot)
962{ 970{
963 struct kvm_lpage_info *linfo; 971 unsigned long idx;
964
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967 972
968 linfo = lpage_info_slot(gfn, slot, level); 973 idx = gfn_to_index(gfn, slot->base_gfn, level);
969 return &linfo->rmap_pde; 974 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
970} 975}
971 976
972/* 977/*
@@ -1173,7 +1178,8 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1173 unsigned long *rmapp; 1178 unsigned long *rmapp;
1174 1179
1175 while (mask) { 1180 while (mask) {
1176 rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; 1181 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1182 PT_PAGE_TABLE_LEVEL, slot);
1177 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); 1183 __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
1178 1184
1179 /* clear the first set bit */ 1185 /* clear the first set bit */
@@ -1200,7 +1206,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
1200} 1206}
1201 1207
1202static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1208static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1203 unsigned long data) 1209 struct kvm_memory_slot *slot, unsigned long data)
1204{ 1210{
1205 u64 *sptep; 1211 u64 *sptep;
1206 struct rmap_iterator iter; 1212 struct rmap_iterator iter;
@@ -1218,7 +1224,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1218} 1224}
1219 1225
1220static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1226static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1221 unsigned long data) 1227 struct kvm_memory_slot *slot, unsigned long data)
1222{ 1228{
1223 u64 *sptep; 1229 u64 *sptep;
1224 struct rmap_iterator iter; 1230 struct rmap_iterator iter;
@@ -1259,43 +1265,67 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
1259 return 0; 1265 return 0;
1260} 1266}
1261 1267
1262static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 1268static int kvm_handle_hva_range(struct kvm *kvm,
1263 unsigned long data, 1269 unsigned long start,
1264 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 1270 unsigned long end,
1265 unsigned long data)) 1271 unsigned long data,
1272 int (*handler)(struct kvm *kvm,
1273 unsigned long *rmapp,
1274 struct kvm_memory_slot *slot,
1275 unsigned long data))
1266{ 1276{
1267 int j; 1277 int j;
1268 int ret; 1278 int ret = 0;
1269 int retval = 0;
1270 struct kvm_memslots *slots; 1279 struct kvm_memslots *slots;
1271 struct kvm_memory_slot *memslot; 1280 struct kvm_memory_slot *memslot;
1272 1281
1273 slots = kvm_memslots(kvm); 1282 slots = kvm_memslots(kvm);
1274 1283
1275 kvm_for_each_memslot(memslot, slots) { 1284 kvm_for_each_memslot(memslot, slots) {
1276 unsigned long start = memslot->userspace_addr; 1285 unsigned long hva_start, hva_end;
1277 unsigned long end; 1286 gfn_t gfn_start, gfn_end;
1278 1287
1279 end = start + (memslot->npages << PAGE_SHIFT); 1288 hva_start = max(start, memslot->userspace_addr);
1280 if (hva >= start && hva < end) { 1289 hva_end = min(end, memslot->userspace_addr +
1281 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 1290 (memslot->npages << PAGE_SHIFT));
1282 gfn_t gfn = memslot->base_gfn + gfn_offset; 1291 if (hva_start >= hva_end)
1292 continue;
1293 /*
1294 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1295 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1296 */
1297 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1298 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1283 1299
1284 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 1300 for (j = PT_PAGE_TABLE_LEVEL;
1301 j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
1302 unsigned long idx, idx_end;
1303 unsigned long *rmapp;
1285 1304
1286 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 1305 /*
1287 struct kvm_lpage_info *linfo; 1306 * {idx(page_j) | page_j intersects with
1307 * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
1308 */
1309 idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
1310 idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
1288 1311
1289 linfo = lpage_info_slot(gfn, memslot, 1312 rmapp = __gfn_to_rmap(gfn_start, j, memslot);
1290 PT_DIRECTORY_LEVEL + j); 1313
1291 ret |= handler(kvm, &linfo->rmap_pde, data); 1314 for (; idx <= idx_end; ++idx)
1292 } 1315 ret |= handler(kvm, rmapp++, memslot, data);
1293 trace_kvm_age_page(hva, memslot, ret);
1294 retval |= ret;
1295 } 1316 }
1296 } 1317 }
1297 1318
1298 return retval; 1319 return ret;
1320}
1321
1322static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1323 unsigned long data,
1324 int (*handler)(struct kvm *kvm, unsigned long *rmapp,
1325 struct kvm_memory_slot *slot,
1326 unsigned long data))
1327{
1328 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1299} 1329}
1300 1330
1301int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1331int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
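
kvm_handle_hva_range() first clips [start, end) against each memslot's hva range, converts the result to a gfn interval, and then, per page-size level, to a contiguous run of rmap slots via gfn_to_index(). A standalone toy of that index arithmetic (userspace C, x86 values for KVM_HPAGE_GFN_SHIFT assumed; not kernel code):

#include <stdio.h>

typedef unsigned long long gfn_t;

/* x86: 9 more gfn bits are folded away per large-page level */
#define KVM_HPAGE_GFN_SHIFT(level)      (((level) - 1) * 9)

static gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
        /* same formula the patch relies on from include/linux/kvm_host.h */
        return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
               (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
        gfn_t base = 0x1000;    /* memslot->base_gfn */

        /* 4K level: one rmap slot per gfn; 2M level: 512 gfns share a slot.
         * Prints "515 1" for the values below. */
        printf("%llu %llu\n",
               gfn_to_index(0x1203, base, 1),
               gfn_to_index(0x1203, base, 2));
        return 0;
}
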
@@ -1303,13 +1333,18 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1303 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 1333 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
1304} 1334}
1305 1335
1336int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1337{
1338 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1339}
1340
1306void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1341void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1307{ 1342{
1308 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 1343 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1309} 1344}
1310 1345
1311static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1346static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1312 unsigned long data) 1347 struct kvm_memory_slot *slot, unsigned long data)
1313{ 1348{
1314 u64 *sptep; 1349 u64 *sptep;
1315 struct rmap_iterator uninitialized_var(iter); 1350 struct rmap_iterator uninitialized_var(iter);
@@ -1323,8 +1358,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1323 * This has some overhead, but not as much as the cost of swapping 1358 * This has some overhead, but not as much as the cost of swapping
1324 * out actively used pages or breaking up actively used hugepages. 1359 * out actively used pages or breaking up actively used hugepages.
1325 */ 1360 */
1326 if (!shadow_accessed_mask) 1361 if (!shadow_accessed_mask) {
1327 return kvm_unmap_rmapp(kvm, rmapp, data); 1362 young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
1363 goto out;
1364 }
1328 1365
1329 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 1366 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
1330 sptep = rmap_get_next(&iter)) { 1367 sptep = rmap_get_next(&iter)) {
@@ -1336,12 +1373,14 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1336 (unsigned long *)sptep); 1373 (unsigned long *)sptep);
1337 } 1374 }
1338 } 1375 }
1339 1376out:
1377 /* @data has hva passed to kvm_age_hva(). */
1378 trace_kvm_age_page(data, slot, young);
1340 return young; 1379 return young;
1341} 1380}
1342 1381
1343static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1382static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
1344 unsigned long data) 1383 struct kvm_memory_slot *slot, unsigned long data)
1345{ 1384{
1346 u64 *sptep; 1385 u64 *sptep;
1347 struct rmap_iterator iter; 1386 struct rmap_iterator iter;
@@ -1379,13 +1418,13 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1379 1418
1380 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 1419 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1381 1420
1382 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); 1421 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
1383 kvm_flush_remote_tlbs(vcpu->kvm); 1422 kvm_flush_remote_tlbs(vcpu->kvm);
1384} 1423}
1385 1424
1386int kvm_age_hva(struct kvm *kvm, unsigned long hva) 1425int kvm_age_hva(struct kvm *kvm, unsigned long hva)
1387{ 1426{
1388 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 1427 return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
1389} 1428}
1390 1429
1391int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1430int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -2457,7 +2496,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2457 rmap_recycle(vcpu, sptep, gfn); 2496 rmap_recycle(vcpu, sptep, gfn);
2458 } 2497 }
2459 } 2498 }
2460 kvm_release_pfn_clean(pfn); 2499
2500 if (!is_error_pfn(pfn))
2501 kvm_release_pfn_clean(pfn);
2461} 2502}
2462 2503
2463static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 2504static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2469,17 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2469 bool no_dirty_log) 2510 bool no_dirty_log)
2470{ 2511{
2471 struct kvm_memory_slot *slot; 2512 struct kvm_memory_slot *slot;
2472 unsigned long hva;
2473 2513
2474 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2514 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2475 if (!slot) { 2515 if (!slot)
2476 get_page(fault_page); 2516 return KVM_PFN_ERR_FAULT;
2477 return page_to_pfn(fault_page);
2478 }
2479 2517
2480 hva = gfn_to_hva_memslot(slot, gfn); 2518 return gfn_to_pfn_memslot_atomic(slot, gfn);
2481
2482 return hva_to_pfn_atomic(vcpu->kvm, hva);
2483} 2519}
2484 2520
2485static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2521static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
@@ -2580,11 +2616,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2580 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2616 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
2581 iterator.level - 1, 2617 iterator.level - 1,
2582 1, ACC_ALL, iterator.sptep); 2618 1, ACC_ALL, iterator.sptep);
2583 if (!sp) {
2584 pgprintk("nonpaging_map: ENOMEM\n");
2585 kvm_release_pfn_clean(pfn);
2586 return -ENOMEM;
2587 }
2588 2619
2589 mmu_spte_set(iterator.sptep, 2620 mmu_spte_set(iterator.sptep,
2590 __pa(sp->spt) 2621 __pa(sp->spt)
@@ -2611,8 +2642,16 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2611 2642
2612static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2643static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2613{ 2644{
2614 kvm_release_pfn_clean(pfn); 2645 /*
2615 if (is_hwpoison_pfn(pfn)) { 2646 * Do not cache the mmio info caused by writing the readonly gfn
 2647 * into the spte; otherwise a read access on the readonly gfn can
 2648 * also cause an mmio page fault and be treated as an mmio access.
2649 * Return 1 to tell kvm to emulate it.
2650 */
2651 if (pfn == KVM_PFN_ERR_RO_FAULT)
2652 return 1;
2653
2654 if (pfn == KVM_PFN_ERR_HWPOISON) {
2616 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); 2655 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2617 return 0; 2656 return 0;
2618 } 2657 }
@@ -3236,8 +3275,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3236 if (!async) 3275 if (!async)
3237 return false; /* *pfn has correct page already */ 3276 return false; /* *pfn has correct page already */
3238 3277
3239 put_page(pfn_to_page(*pfn));
3240
3241 if (!prefault && can_do_async_pf(vcpu)) { 3278 if (!prefault && can_do_async_pf(vcpu)) {
3242 trace_kvm_try_async_get_page(gva, gfn); 3279 trace_kvm_try_async_get_page(gva, gfn);
3243 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3280 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
@@ -3371,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3371 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3408 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
3372} 3409}
3373 3410
3411static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
3412{
3413 unsigned mask;
3414
3415 BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
3416
3417 mask = (unsigned)~ACC_WRITE_MASK;
3418 /* Allow write access to dirty gptes */
3419 mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
3420 *access &= mask;
3421}
3422
3374static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, 3423static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3375 int *nr_present) 3424 int *nr_present)
3376{ 3425{
@@ -3388,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3388 return false; 3437 return false;
3389} 3438}
3390 3439
3440static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
3441{
3442 unsigned access;
3443
3444 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
3445 access &= ~(gpte >> PT64_NX_SHIFT);
3446
3447 return access;
3448}
3449
3450static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
3451{
3452 unsigned index;
3453
3454 index = level - 1;
3455 index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
3456 return mmu->last_pte_bitmap & (1 << index);
3457}
3458
3391#define PTTYPE 64 3459#define PTTYPE 64
3392#include "paging_tmpl.h" 3460#include "paging_tmpl.h"
3393#undef PTTYPE 3461#undef PTTYPE
@@ -3457,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
3457 } 3525 }
3458} 3526}
3459 3527
3528static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3529{
3530 unsigned bit, byte, pfec;
3531 u8 map;
3532 bool fault, x, w, u, wf, uf, ff, smep;
3533
3534 smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3535 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
3536 pfec = byte << 1;
3537 map = 0;
3538 wf = pfec & PFERR_WRITE_MASK;
3539 uf = pfec & PFERR_USER_MASK;
3540 ff = pfec & PFERR_FETCH_MASK;
3541 for (bit = 0; bit < 8; ++bit) {
3542 x = bit & ACC_EXEC_MASK;
3543 w = bit & ACC_WRITE_MASK;
3544 u = bit & ACC_USER_MASK;
3545
3546 /* Not really needed: !nx will cause pte.nx to fault */
3547 x |= !mmu->nx;
3548 /* Allow supervisor writes if !cr0.wp */
3549 w |= !is_write_protection(vcpu) && !uf;
3550 /* Disallow supervisor fetches of user code if cr4.smep */
3551 x &= !(smep && u && !uf);
3552
3553 fault = (ff && !x) || (uf && !u) || (wf && !w);
3554 map |= fault << bit;
3555 }
3556 mmu->permissions[byte] = map;
3557 }
3558}
3559
3560static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3561{
3562 u8 map;
3563 unsigned level, root_level = mmu->root_level;
3564 const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */
3565
3566 if (root_level == PT32E_ROOT_LEVEL)
3567 --root_level;
3568 /* PT_PAGE_TABLE_LEVEL always terminates */
3569 map = 1 | (1 << ps_set_index);
3570 for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
3571 if (level <= PT_PDPE_LEVEL
3572 && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
3573 map |= 1 << (ps_set_index | (level - 1));
3574 }
3575 mmu->last_pte_bitmap = map;
3576}
3577
3460static int paging64_init_context_common(struct kvm_vcpu *vcpu, 3578static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3461 struct kvm_mmu *context, 3579 struct kvm_mmu *context,
3462 int level) 3580 int level)
@@ -3465,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3465 context->root_level = level; 3583 context->root_level = level;
3466 3584
3467 reset_rsvds_bits_mask(vcpu, context); 3585 reset_rsvds_bits_mask(vcpu, context);
3586 update_permission_bitmask(vcpu, context);
3587 update_last_pte_bitmap(vcpu, context);
3468 3588
3469 ASSERT(is_pae(vcpu)); 3589 ASSERT(is_pae(vcpu));
3470 context->new_cr3 = paging_new_cr3; 3590 context->new_cr3 = paging_new_cr3;
@@ -3493,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3493 context->root_level = PT32_ROOT_LEVEL; 3613 context->root_level = PT32_ROOT_LEVEL;
3494 3614
3495 reset_rsvds_bits_mask(vcpu, context); 3615 reset_rsvds_bits_mask(vcpu, context);
3616 update_permission_bitmask(vcpu, context);
3617 update_last_pte_bitmap(vcpu, context);
3496 3618
3497 context->new_cr3 = paging_new_cr3; 3619 context->new_cr3 = paging_new_cr3;
3498 context->page_fault = paging32_page_fault; 3620 context->page_fault = paging32_page_fault;
@@ -3553,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3553 context->gva_to_gpa = paging32_gva_to_gpa; 3675 context->gva_to_gpa = paging32_gva_to_gpa;
3554 } 3676 }
3555 3677
3678 update_permission_bitmask(vcpu, context);
3679 update_last_pte_bitmap(vcpu, context);
3680
3556 return 0; 3681 return 0;
3557} 3682}
3558 3683
@@ -3628,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3628 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 3753 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3629 } 3754 }
3630 3755
3756 update_permission_bitmask(vcpu, g_context);
3757 update_last_pte_bitmap(vcpu, g_context);
3758
3631 return 0; 3759 return 0;
3632} 3760}
3633 3761
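
update_permission_bitmask() is the core of the pte-permission optimization: for every page-fault error code it precomputes one byte whose bit N answers whether access rights N would fault, so the walker's old chain of NX/SMEP/CR0.WP tests collapses into the single lookup in permission_fault(). A standalone toy that builds and queries the same table (userspace C with simplified constants that mirror the kernel's; not kernel code):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ACC_EXEC    1u          /* mirrors ACC_EXEC_MASK  */
#define ACC_WRITE   2u          /* mirrors ACC_WRITE_MASK */
#define ACC_USER    4u          /* mirrors ACC_USER_MASK  */

#define PFERR_WRITE  2u         /* mirrors PFERR_WRITE_MASK */
#define PFERR_USER   4u         /* mirrors PFERR_USER_MASK  */
#define PFERR_FETCH 16u         /* mirrors PFERR_FETCH_MASK */

/* bit 0 of pfec (the "present" bit) is never stored, hence pfec >> 1 */
static uint8_t permissions[16];

static void build_table(bool cr0_wp, bool nx, bool smep)
{
        for (unsigned byte = 0; byte < 16; ++byte) {
                unsigned pfec = byte << 1;
                bool wf = pfec & PFERR_WRITE;
                bool uf = pfec & PFERR_USER;
                bool ff = pfec & PFERR_FETCH;
                uint8_t map = 0;

                for (unsigned bit = 0; bit < 8; ++bit) {
                        bool x = bit & ACC_EXEC;
                        bool w = bit & ACC_WRITE;
                        bool u = bit & ACC_USER;

                        x |= !nx;                       /* without NX everything is executable */
                        w |= !cr0_wp && !uf;            /* supervisor writes allowed if !CR0.WP */
                        x &= !(smep && u && !uf);       /* SMEP: no kernel fetch from user pages */

                        map |= (uint8_t)(((ff && !x) || (uf && !u) || (wf && !w)) << bit);
                }
                permissions[byte] = map;
        }
}

static bool permission_fault(unsigned pte_access, unsigned pfec)
{
        /* the whole runtime check, exactly as in the new mmu.h helper */
        return (permissions[pfec >> 1] >> pte_access) & 1;
}

int main(void)
{
        build_table(true, true, false);
        /* user write to a supervisor, read-only page: faults, prints 1 */
        printf("%d\n", permission_fault(ACC_EXEC, PFERR_USER | PFERR_WRITE));
        return 0;
}
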
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e374db9af021..69871080e866 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,8 +18,10 @@
18#define PT_PCD_MASK (1ULL << 4) 18#define PT_PCD_MASK (1ULL << 4)
19#define PT_ACCESSED_SHIFT 5 19#define PT_ACCESSED_SHIFT 5
20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) 20#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
21#define PT_DIRTY_MASK (1ULL << 6) 21#define PT_DIRTY_SHIFT 6
22#define PT_PAGE_SIZE_MASK (1ULL << 7) 22#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
23#define PT_PAGE_SIZE_SHIFT 7
24#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
23#define PT_PAT_MASK (1ULL << 7) 25#define PT_PAT_MASK (1ULL << 7)
24#define PT_GLOBAL_MASK (1ULL << 8) 26#define PT_GLOBAL_MASK (1ULL << 8)
25#define PT64_NX_SHIFT 63 27#define PT64_NX_SHIFT 63
@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
88 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 90 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
89} 91}
90 92
91static inline bool check_write_user_access(struct kvm_vcpu *vcpu, 93/*
92 bool write_fault, bool user_fault, 94 * Will a fault with a given page-fault error code (pfec) cause a permission
93 unsigned long pte) 95 * fault with the given access (in ACC_* format)?
96 */
97static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
98 unsigned pfec)
94{ 99{
95 if (unlikely(write_fault && !is_writable_pte(pte) 100 return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
96 && (user_fault || is_write_protection(vcpu))))
97 return false;
98
99 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
100 return false;
101
102 return true;
103} 101}
102
104#endif 103#endif
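
The new PT_DIRTY_SHIFT and PT_PAGE_SIZE_SHIFT definitions exist so mmu.c can replace bit tests with shifts; protect_clean_gpte() above, for example, moves the gpte dirty bit (bit 6) into the writable position (bit 1) with one shift, so only already-dirty gptes keep write access. A standalone illustration of just that arithmetic (toy, not kernel code):

#include <assert.h>
#include <stdint.h>

#define PT_WRITABLE_SHIFT 1
#define PT_WRITABLE_MASK  (1ULL << PT_WRITABLE_SHIFT)
#define PT_DIRTY_SHIFT    6
#define PT_DIRTY_MASK     (1ULL << PT_DIRTY_SHIFT)
#define ACC_WRITE_MASK    PT_WRITABLE_MASK      /* the patch BUILD_BUG_ON()s this equality */

static void protect_clean_gpte(unsigned *access, uint64_t gpte)
{
        unsigned mask = (unsigned)~ACC_WRITE_MASK;

        /* keep write access only if the gpte is already dirty */
        mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
        *access &= mask;
}

int main(void)
{
        unsigned acc = ACC_WRITE_MASK;

        protect_clean_gpte(&acc, 0);                    /* clean gpte ...         */
        assert(!(acc & ACC_WRITE_MASK));                /* ... loses write access */

        acc = ACC_WRITE_MASK;
        protect_clean_gpte(&acc, PT_DIRTY_MASK);        /* dirty gpte keeps it */
        assert(acc & ACC_WRITE_MASK);
        return 0;
}
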
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 7d7d0b9e23eb..daff69e21150 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -116,10 +116,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 116 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); 117 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
118 118
119 if (is_error_pfn(pfn)) { 119 if (is_error_pfn(pfn))
120 kvm_release_pfn_clean(pfn);
121 return; 120 return;
122 }
123 121
124 hpa = pfn << PAGE_SHIFT; 122 hpa = pfn << PAGE_SHIFT;
125 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) 123 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
@@ -190,7 +188,6 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
190 188
191static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) 189static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
192{ 190{
193 struct kvm_memory_slot *slot;
194 unsigned long *rmapp; 191 unsigned long *rmapp;
195 u64 *sptep; 192 u64 *sptep;
196 struct rmap_iterator iter; 193 struct rmap_iterator iter;
@@ -198,8 +195,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
198 if (sp->role.direct || sp->unsync || sp->role.invalid) 195 if (sp->role.direct || sp->unsync || sp->role.invalid)
199 return; 196 return;
200 197
201 slot = gfn_to_memslot(kvm, sp->gfn); 198 rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
202 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
203 199
204 for (sptep = rmap_get_first(*rmapp, &iter); sptep; 200 for (sptep = rmap_get_first(*rmapp, &iter); sptep;
205 sptep = rmap_get_next(&iter)) { 201 sptep = rmap_get_next(&iter)) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bb7cf01cae76..714e2c01a6fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -63,10 +63,12 @@
63 */ 63 */
64struct guest_walker { 64struct guest_walker {
65 int level; 65 int level;
66 unsigned max_level;
66 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
67 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
68 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; 69 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
70 unsigned pt_access; 72 unsigned pt_access;
71 unsigned pte_access; 73 unsigned pte_access;
72 gfn_t gfn; 74 gfn_t gfn;
@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
101 return (ret != orig_pte); 103 return (ret != orig_pte);
102} 104}
103 105
104static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, 106static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
105 bool last) 107 struct kvm_mmu *mmu,
108 struct guest_walker *walker,
109 int write_fault)
106{ 110{
107 unsigned access; 111 unsigned level, index;
108 112 pt_element_t pte, orig_pte;
109 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 113 pt_element_t __user *ptep_user;
110 if (last && !is_dirty_gpte(gpte)) 114 gfn_t table_gfn;
111 access &= ~ACC_WRITE_MASK; 115 int ret;
112 116
113#if PTTYPE == 64 117 for (level = walker->max_level; level >= walker->level; --level) {
114 if (vcpu->arch.mmu.nx) 118 pte = orig_pte = walker->ptes[level - 1];
115 access &= ~(gpte >> PT64_NX_SHIFT); 119 table_gfn = walker->table_gfn[level - 1];
116#endif 120 ptep_user = walker->ptep_user[level - 1];
117 return access; 121 index = offset_in_page(ptep_user) / sizeof(pt_element_t);
118} 122 if (!(pte & PT_ACCESSED_MASK)) {
119 123 trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
120static bool FNAME(is_last_gpte)(struct guest_walker *walker, 124 pte |= PT_ACCESSED_MASK;
121 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 125 }
122 pt_element_t gpte) 126 if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
123{ 127 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
124 if (walker->level == PT_PAGE_TABLE_LEVEL) 128 pte |= PT_DIRTY_MASK;
125 return true; 129 }
126 130 if (pte == orig_pte)
127 if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && 131 continue;
128 (PTTYPE == 64 || is_pse(vcpu)))
129 return true;
130 132
131 if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && 133 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
132 (mmu->root_level == PT64_ROOT_LEVEL)) 134 if (ret)
133 return true; 135 return ret;
134 136
135 return false; 137 mark_page_dirty(vcpu->kvm, table_gfn);
138 walker->ptes[level] = pte;
139 }
140 return 0;
136} 141}
137 142
138/* 143/*
@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
142 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 147 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
143 gva_t addr, u32 access) 148 gva_t addr, u32 access)
144{ 149{
150 int ret;
145 pt_element_t pte; 151 pt_element_t pte;
146 pt_element_t __user *uninitialized_var(ptep_user); 152 pt_element_t __user *uninitialized_var(ptep_user);
147 gfn_t table_gfn; 153 gfn_t table_gfn;
148 unsigned index, pt_access, uninitialized_var(pte_access); 154 unsigned index, pt_access, pte_access, accessed_dirty, shift;
149 gpa_t pte_gpa; 155 gpa_t pte_gpa;
150 bool eperm, last_gpte;
151 int offset; 156 int offset;
152 const int write_fault = access & PFERR_WRITE_MASK; 157 const int write_fault = access & PFERR_WRITE_MASK;
153 const int user_fault = access & PFERR_USER_MASK; 158 const int user_fault = access & PFERR_USER_MASK;
154 const int fetch_fault = access & PFERR_FETCH_MASK; 159 const int fetch_fault = access & PFERR_FETCH_MASK;
155 u16 errcode = 0; 160 u16 errcode = 0;
161 gpa_t real_gpa;
162 gfn_t gfn;
156 163
157 trace_kvm_mmu_pagetable_walk(addr, access); 164 trace_kvm_mmu_pagetable_walk(addr, access);
158retry_walk: 165retry_walk:
159 eperm = false;
160 walker->level = mmu->root_level; 166 walker->level = mmu->root_level;
161 pte = mmu->get_cr3(vcpu); 167 pte = mmu->get_cr3(vcpu);
162 168
@@ -169,15 +175,21 @@ retry_walk:
169 --walker->level; 175 --walker->level;
170 } 176 }
171#endif 177#endif
178 walker->max_level = walker->level;
172 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 179 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
173 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); 180 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
174 181
175 pt_access = ACC_ALL; 182 accessed_dirty = PT_ACCESSED_MASK;
183 pt_access = pte_access = ACC_ALL;
184 ++walker->level;
176 185
177 for (;;) { 186 do {
178 gfn_t real_gfn; 187 gfn_t real_gfn;
179 unsigned long host_addr; 188 unsigned long host_addr;
180 189
190 pt_access &= pte_access;
191 --walker->level;
192
181 index = PT_INDEX(addr, walker->level); 193 index = PT_INDEX(addr, walker->level);
182 194
183 table_gfn = gpte_to_gfn(pte); 195 table_gfn = gpte_to_gfn(pte);
@@ -199,6 +211,7 @@ retry_walk:
199 ptep_user = (pt_element_t __user *)((void *)host_addr + offset); 211 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
200 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) 212 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
201 goto error; 213 goto error;
214 walker->ptep_user[walker->level - 1] = ptep_user;
202 215
203 trace_kvm_mmu_paging_element(pte, walker->level); 216 trace_kvm_mmu_paging_element(pte, walker->level);
204 217
@@ -211,92 +224,48 @@ retry_walk:
211 goto error; 224 goto error;
212 } 225 }
213 226
214 if (!check_write_user_access(vcpu, write_fault, user_fault, 227 accessed_dirty &= pte;
215 pte)) 228 pte_access = pt_access & gpte_access(vcpu, pte);
216 eperm = true;
217
218#if PTTYPE == 64
219 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
220 eperm = true;
221#endif
222
223 last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
224 if (last_gpte) {
225 pte_access = pt_access &
226 FNAME(gpte_access)(vcpu, pte, true);
227 /* check if the kernel is fetching from user page */
228 if (unlikely(pte_access & PT_USER_MASK) &&
229 kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
230 if (fetch_fault && !user_fault)
231 eperm = true;
232 }
233
234 if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
235 int ret;
236 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
237 sizeof(pte));
238 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
239 pte, pte|PT_ACCESSED_MASK);
240 if (unlikely(ret < 0))
241 goto error;
242 else if (ret)
243 goto retry_walk;
244
245 mark_page_dirty(vcpu->kvm, table_gfn);
246 pte |= PT_ACCESSED_MASK;
247 }
248 229
249 walker->ptes[walker->level - 1] = pte; 230 walker->ptes[walker->level - 1] = pte;
231 } while (!is_last_gpte(mmu, walker->level, pte));
250 232
251 if (last_gpte) { 233 if (unlikely(permission_fault(mmu, pte_access, access))) {
252 int lvl = walker->level; 234 errcode |= PFERR_PRESENT_MASK;
253 gpa_t real_gpa; 235 goto error;
254 gfn_t gfn; 236 }
255 u32 ac;
256
257 gfn = gpte_to_gfn_lvl(pte, lvl);
258 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
259
260 if (PTTYPE == 32 &&
261 walker->level == PT_DIRECTORY_LEVEL &&
262 is_cpuid_PSE36())
263 gfn += pse36_gfn_delta(pte);
264
265 ac = write_fault | fetch_fault | user_fault;
266 237
267 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), 238 gfn = gpte_to_gfn_lvl(pte, walker->level);
268 ac); 239 gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
269 if (real_gpa == UNMAPPED_GVA)
270 return 0;
271 240
272 walker->gfn = real_gpa >> PAGE_SHIFT; 241 if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
242 gfn += pse36_gfn_delta(pte);
273 243
274 break; 244 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
275 } 245 if (real_gpa == UNMAPPED_GVA)
246 return 0;
276 247
277 pt_access &= FNAME(gpte_access)(vcpu, pte, false); 248 walker->gfn = real_gpa >> PAGE_SHIFT;
278 --walker->level;
279 }
280 249
281 if (unlikely(eperm)) { 250 if (!write_fault)
282 errcode |= PFERR_PRESENT_MASK; 251 protect_clean_gpte(&pte_access, pte);
283 goto error;
284 }
285 252
286 if (write_fault && unlikely(!is_dirty_gpte(pte))) { 253 /*
287 int ret; 254 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
255 * place right.
256 *
257 * On a read fault, do nothing.
258 */
259 shift = write_fault >> ilog2(PFERR_WRITE_MASK);
260 shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
261 accessed_dirty &= pte >> shift;
288 262
289 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 263 if (unlikely(!accessed_dirty)) {
290 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, 264 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
291 pte, pte|PT_DIRTY_MASK);
292 if (unlikely(ret < 0)) 265 if (unlikely(ret < 0))
293 goto error; 266 goto error;
294 else if (ret) 267 else if (ret)
295 goto retry_walk; 268 goto retry_walk;
296
297 mark_page_dirty(vcpu->kvm, table_gfn);
298 pte |= PT_DIRTY_MASK;
299 walker->ptes[walker->level - 1] = pte;
300 } 269 }
301 270
302 walker->pt_access = pt_access; 271 walker->pt_access = pt_access;
@@ -368,12 +337,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
368 return; 337 return;
369 338
370 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 339 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
371 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); 340 pte_access = sp->role.access & gpte_access(vcpu, gpte);
341 protect_clean_gpte(&pte_access, gpte);
372 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 342 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
373 if (mmu_invalid_pfn(pfn)) { 343 if (mmu_invalid_pfn(pfn))
374 kvm_release_pfn_clean(pfn);
375 return; 344 return;
376 }
377 345
378 /* 346 /*
379 * we call mmu_set_spte() with host_writable = true because that 347 * we call mmu_set_spte() with host_writable = true because that
@@ -443,15 +411,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
443 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 411 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
444 continue; 412 continue;
445 413
446 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, 414 pte_access = sp->role.access & gpte_access(vcpu, gpte);
447 true); 415 protect_clean_gpte(&pte_access, gpte);
448 gfn = gpte_to_gfn(gpte); 416 gfn = gpte_to_gfn(gpte);
449 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 417 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
450 pte_access & ACC_WRITE_MASK); 418 pte_access & ACC_WRITE_MASK);
451 if (mmu_invalid_pfn(pfn)) { 419 if (mmu_invalid_pfn(pfn))
452 kvm_release_pfn_clean(pfn);
453 break; 420 break;
454 }
455 421
456 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 422 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
457 NULL, PT_PAGE_TABLE_LEVEL, gfn, 423 NULL, PT_PAGE_TABLE_LEVEL, gfn,
@@ -798,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
798 764
799 gfn = gpte_to_gfn(gpte); 765 gfn = gpte_to_gfn(gpte);
800 pte_access = sp->role.access; 766 pte_access = sp->role.access;
801 pte_access &= FNAME(gpte_access)(vcpu, gpte, true); 767 pte_access &= gpte_access(vcpu, gpte);
768 protect_clean_gpte(&pte_access, gpte);
802 769
803 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) 770 if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
804 continue; 771 continue;
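
The rewritten walker defers accessed/dirty updates: it computes branchlessly whether any bit needs setting and only then takes the FNAME(update_accessed_dirty_bits)() slow path. The key step is the shift that folds the dirty bit into accessed_dirty on write faults only. A toy version of that test (constants assumed from mmu.h, write_fault passed as 0 or PFERR_WRITE_MASK as in the walker; not kernel code):

#include <stdio.h>
#include <stdint.h>

#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK  (1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_SHIFT    6
#define PT_DIRTY_MASK     (1ULL << PT_DIRTY_SHIFT)
#define PFERR_WRITE_MASK  2u

static int needs_ad_update(uint64_t pte, unsigned write_fault)
{
        uint64_t accessed_dirty = PT_ACCESSED_MASK;
        unsigned shift;

        accessed_dirty &= pte;                  /* accessed bit of the (last) gpte */

        shift = write_fault >> 1;               /* == write_fault >> ilog2(PFERR_WRITE_MASK) */
        shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
        accessed_dirty &= pte >> shift;         /* on writes, also require the dirty bit */

        return accessed_dirty == 0;             /* 1: take update_accessed_dirty_bits() */
}

int main(void)
{
        uint64_t accessed = PT_ACCESSED_MASK;
        uint64_t accessed_dirty = PT_ACCESSED_MASK | PT_DIRTY_MASK;

        /* prints "0 1 0": read fault, write fault on a clean gpte,
         * write fault on an accessed+dirty gpte */
        printf("%d %d %d\n",
               needs_ad_update(accessed, 0),
               needs_ad_update(accessed, PFERR_WRITE_MASK),
               needs_ad_update(accessed_dirty, PFERR_WRITE_MASK));
        return 0;
}
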
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 9b7ec1150ab0..cfc258a6bf97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Kernel-based Virtual Machine -- Performane Monitoring Unit support 2 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
3 * 3 *
4 * Copyright 2011 Red Hat, Inc. and/or its affiliates. 4 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
5 * 5 *
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index baead950d6c8..d017df3899ef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -163,7 +163,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio);
163 163
164#define MSR_INVALID 0xffffffffU 164#define MSR_INVALID 0xffffffffU
165 165
166static struct svm_direct_access_msrs { 166static const struct svm_direct_access_msrs {
167 u32 index; /* Index of the MSR */ 167 u32 index; /* Index of the MSR */
168 bool always; /* True if intercept is always on */ 168 bool always; /* True if intercept is always on */
169} direct_access_msrs[] = { 169} direct_access_msrs[] = {
@@ -400,7 +400,7 @@ struct svm_init_data {
400 int r; 400 int r;
401}; 401};
402 402
403static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; 403static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
404 404
405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) 405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
406#define MSRS_RANGE_SIZE 2048 406#define MSRS_RANGE_SIZE 2048
@@ -1146,7 +1146,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1146 1146
1147 svm_set_efer(&svm->vcpu, 0); 1147 svm_set_efer(&svm->vcpu, 0);
1148 save->dr6 = 0xffff0ff0; 1148 save->dr6 = 0xffff0ff0;
1149 save->dr7 = 0x400;
1150 kvm_set_rflags(&svm->vcpu, 2); 1149 kvm_set_rflags(&svm->vcpu, 2);
1151 save->rip = 0x0000fff0; 1150 save->rip = 0x0000fff0;
1152 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1151 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1643,7 +1642,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1643 mark_dirty(svm->vmcb, VMCB_SEG); 1642 mark_dirty(svm->vmcb, VMCB_SEG);
1644} 1643}
1645 1644
1646static void update_db_intercept(struct kvm_vcpu *vcpu) 1645static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
1647{ 1646{
1648 struct vcpu_svm *svm = to_svm(vcpu); 1647 struct vcpu_svm *svm = to_svm(vcpu);
1649 1648
@@ -1663,20 +1662,6 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1663 vcpu->guest_debug = 0; 1662 vcpu->guest_debug = 0;
1664} 1663}
1665 1664
1666static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1667{
1668 struct vcpu_svm *svm = to_svm(vcpu);
1669
1670 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1671 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1672 else
1673 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1674
1675 mark_dirty(svm->vmcb, VMCB_DR);
1676
1677 update_db_intercept(vcpu);
1678}
1679
1680static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1665static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1681{ 1666{
1682 if (sd->next_asid > sd->max_asid) { 1667 if (sd->next_asid > sd->max_asid) {
@@ -1748,7 +1733,7 @@ static int db_interception(struct vcpu_svm *svm)
1748 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1733 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1749 svm->vmcb->save.rflags &= 1734 svm->vmcb->save.rflags &=
1750 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1735 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1751 update_db_intercept(&svm->vcpu); 1736 update_db_bp_intercept(&svm->vcpu);
1752 } 1737 }
1753 1738
1754 if (svm->vcpu.guest_debug & 1739 if (svm->vcpu.guest_debug &
@@ -2063,7 +2048,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
2063 if (svm->nested.intercept & 1ULL) { 2048 if (svm->nested.intercept & 1ULL) {
2064 /* 2049 /*
2065 * The #vmexit can't be emulated here directly because this 2050 * The #vmexit can't be emulated here directly because this
2066 * code path runs with irqs and preemtion disabled. A 2051 * code path runs with irqs and preemption disabled. A
2067 * #vmexit emulation might sleep. Only signal request for 2052 * #vmexit emulation might sleep. Only signal request for
2068 * the #vmexit here. 2053 * the #vmexit here.
2069 */ 2054 */
@@ -2105,7 +2090,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2105 return kmap(page); 2090 return kmap(page);
2106 2091
2107error: 2092error:
2108 kvm_release_page_clean(page);
2109 kvm_inject_gp(&svm->vcpu, 0); 2093 kvm_inject_gp(&svm->vcpu, 0);
2110 2094
2111 return NULL; 2095 return NULL;
@@ -2409,7 +2393,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2409{ 2393{
2410 /* 2394 /*
2411 * This function merges the msr permission bitmaps of kvm and the 2395 * This function merges the msr permission bitmaps of kvm and the
2412 * nested vmcb. It is omptimized in that it only merges the parts where 2396 * nested vmcb. It is optimized in that it only merges the parts where
2413 * the kvm msr permission bitmap may contain zero bits 2397 * the kvm msr permission bitmap may contain zero bits
2414 */ 2398 */
2415 int i; 2399 int i;
@@ -3268,7 +3252,7 @@ static int pause_interception(struct vcpu_svm *svm)
3268 return 1; 3252 return 1;
3269} 3253}
3270 3254
3271static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3255static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3272 [SVM_EXIT_READ_CR0] = cr_interception, 3256 [SVM_EXIT_READ_CR0] = cr_interception,
3273 [SVM_EXIT_READ_CR3] = cr_interception, 3257 [SVM_EXIT_READ_CR3] = cr_interception,
3274 [SVM_EXIT_READ_CR4] = cr_interception, 3258 [SVM_EXIT_READ_CR4] = cr_interception,
@@ -3660,7 +3644,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
3660 */ 3644 */
3661 svm->nmi_singlestep = true; 3645 svm->nmi_singlestep = true;
3662 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3646 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3663 update_db_intercept(vcpu); 3647 update_db_bp_intercept(vcpu);
3664} 3648}
3665 3649
3666static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 3650static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3783,12 +3767,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3783 svm_complete_interrupts(svm); 3767 svm_complete_interrupts(svm);
3784} 3768}
3785 3769
3786#ifdef CONFIG_X86_64
3787#define R "r"
3788#else
3789#define R "e"
3790#endif
3791
3792static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3770static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3793{ 3771{
3794 struct vcpu_svm *svm = to_svm(vcpu); 3772 struct vcpu_svm *svm = to_svm(vcpu);
@@ -3815,13 +3793,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3815 local_irq_enable(); 3793 local_irq_enable();
3816 3794
3817 asm volatile ( 3795 asm volatile (
3818 "push %%"R"bp; \n\t" 3796 "push %%" _ASM_BP "; \n\t"
3819 "mov %c[rbx](%[svm]), %%"R"bx \n\t" 3797 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
3820 "mov %c[rcx](%[svm]), %%"R"cx \n\t" 3798 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
3821 "mov %c[rdx](%[svm]), %%"R"dx \n\t" 3799 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
3822 "mov %c[rsi](%[svm]), %%"R"si \n\t" 3800 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
3823 "mov %c[rdi](%[svm]), %%"R"di \n\t" 3801 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
3824 "mov %c[rbp](%[svm]), %%"R"bp \n\t" 3802 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
3825#ifdef CONFIG_X86_64 3803#ifdef CONFIG_X86_64
3826 "mov %c[r8](%[svm]), %%r8 \n\t" 3804 "mov %c[r8](%[svm]), %%r8 \n\t"
3827 "mov %c[r9](%[svm]), %%r9 \n\t" 3805 "mov %c[r9](%[svm]), %%r9 \n\t"
@@ -3834,20 +3812,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3834#endif 3812#endif
3835 3813
3836 /* Enter guest mode */ 3814 /* Enter guest mode */
3837 "push %%"R"ax \n\t" 3815 "push %%" _ASM_AX " \n\t"
3838 "mov %c[vmcb](%[svm]), %%"R"ax \n\t" 3816 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
3839 __ex(SVM_VMLOAD) "\n\t" 3817 __ex(SVM_VMLOAD) "\n\t"
3840 __ex(SVM_VMRUN) "\n\t" 3818 __ex(SVM_VMRUN) "\n\t"
3841 __ex(SVM_VMSAVE) "\n\t" 3819 __ex(SVM_VMSAVE) "\n\t"
3842 "pop %%"R"ax \n\t" 3820 "pop %%" _ASM_AX " \n\t"
3843 3821
3844 /* Save guest registers, load host registers */ 3822 /* Save guest registers, load host registers */
3845 "mov %%"R"bx, %c[rbx](%[svm]) \n\t" 3823 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
3846 "mov %%"R"cx, %c[rcx](%[svm]) \n\t" 3824 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
3847 "mov %%"R"dx, %c[rdx](%[svm]) \n\t" 3825 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
3848 "mov %%"R"si, %c[rsi](%[svm]) \n\t" 3826 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
3849 "mov %%"R"di, %c[rdi](%[svm]) \n\t" 3827 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
3850 "mov %%"R"bp, %c[rbp](%[svm]) \n\t" 3828 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
3851#ifdef CONFIG_X86_64 3829#ifdef CONFIG_X86_64
3852 "mov %%r8, %c[r8](%[svm]) \n\t" 3830 "mov %%r8, %c[r8](%[svm]) \n\t"
3853 "mov %%r9, %c[r9](%[svm]) \n\t" 3831 "mov %%r9, %c[r9](%[svm]) \n\t"
@@ -3858,7 +3836,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3858 "mov %%r14, %c[r14](%[svm]) \n\t" 3836 "mov %%r14, %c[r14](%[svm]) \n\t"
3859 "mov %%r15, %c[r15](%[svm]) \n\t" 3837 "mov %%r15, %c[r15](%[svm]) \n\t"
3860#endif 3838#endif
3861 "pop %%"R"bp" 3839 "pop %%" _ASM_BP
3862 : 3840 :
3863 : [svm]"a"(svm), 3841 : [svm]"a"(svm),
3864 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 3842 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -3879,9 +3857,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3879 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 3857 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3880#endif 3858#endif
3881 : "cc", "memory" 3859 : "cc", "memory"
3882 , R"bx", R"cx", R"dx", R"si", R"di"
3883#ifdef CONFIG_X86_64 3860#ifdef CONFIG_X86_64
3861 , "rbx", "rcx", "rdx", "rsi", "rdi"
3884 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 3862 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3863#else
3864 , "ebx", "ecx", "edx", "esi", "edi"
3885#endif 3865#endif
3886 ); 3866 );
3887 3867
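In the hunks above the locally defined R prefix macro is dropped in favour of the _ASM_* helpers from the x86 asm.h header, which expand to the full 32- or 64-bit register name. Roughly, and with the real macros' indirection layers stripped away, the effect is:

/* Simplified illustration of what the asm.h helpers boil down to. */
#ifdef CONFIG_X86_64
# define _ASM_AX  "rax"
# define _ASM_BP  "rbp"
# define _ASM_PTR ".quad"
#else
# define _ASM_AX  "eax"
# define _ASM_BP  "ebp"
# define _ASM_PTR ".long"
#endif

/* So "push %%" _ASM_BP becomes "push %%rbp" on 64-bit and "push %%ebp" on 32-bit. */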
@@ -3941,8 +3921,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3941 mark_all_clean(svm->vmcb); 3921 mark_all_clean(svm->vmcb);
3942} 3922}
3943 3923
3944#undef R
3945
3946static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 3924static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3947{ 3925{
3948 struct vcpu_svm *svm = to_svm(vcpu); 3926 struct vcpu_svm *svm = to_svm(vcpu);
@@ -4069,7 +4047,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4069#define POST_MEM(exit) { .exit_code = (exit), \ 4047#define POST_MEM(exit) { .exit_code = (exit), \
4070 .stage = X86_ICPT_POST_MEMACCESS, } 4048 .stage = X86_ICPT_POST_MEMACCESS, }
4071 4049
4072static struct __x86_intercept { 4050static const struct __x86_intercept {
4073 u32 exit_code; 4051 u32 exit_code;
4074 enum x86_intercept_stage stage; 4052 enum x86_intercept_stage stage;
4075} x86_intercept_map[] = { 4053} x86_intercept_map[] = {
@@ -4260,7 +4238,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4260 .vcpu_load = svm_vcpu_load, 4238 .vcpu_load = svm_vcpu_load,
4261 .vcpu_put = svm_vcpu_put, 4239 .vcpu_put = svm_vcpu_put,
4262 4240
4263 .set_guest_debug = svm_guest_debug, 4241 .update_db_bp_intercept = update_db_bp_intercept,
4264 .get_msr = svm_get_msr, 4242 .get_msr = svm_get_msr,
4265 .set_msr = svm_set_msr, 4243 .set_msr = svm_set_msr,
4266 .get_segment_base = svm_get_segment_base, 4244 .get_segment_base = svm_get_segment_base,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
deleted file mode 100644
index 6b85cc647f34..000000000000
--- a/arch/x86/kvm/timer.c
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * timer support
8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
13 */
14
15#include <linux/kvm_host.h>
16#include <linux/kvm.h>
17#include <linux/hrtimer.h>
18#include <linux/atomic.h>
19#include "kvm_timer.h"
20
21enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
22{
23 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
24 struct kvm_vcpu *vcpu = ktimer->vcpu;
25 wait_queue_head_t *q = &vcpu->wq;
26
27 /*
28 * There is a race window between reading and incrementing, but we do
29 * not care about potentially losing timer events in the !reinject
30 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
31 * in vcpu_enter_guest.
32 */
33 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
34 atomic_inc(&ktimer->pending);
35 /* FIXME: this code should not know anything about vcpus */
36 kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
37 }
38
39 if (waitqueue_active(q))
40 wake_up_interruptible(q);
41
42 if (ktimer->t_ops->is_periodic(ktimer)) {
43 hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
44 return HRTIMER_RESTART;
45 } else
46 return HRTIMER_NORESTART;
47}
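The deleted callback shows the usual hrtimer idiom: a periodic timer pushes its own expiry forward and returns HRTIMER_RESTART, while a one-shot timer returns HRTIMER_NORESTART. A stripped-down sketch of just that rearm decision (kernel context assumed, delivery details omitted):

/* Minimal restatement of the rearm logic from the removed kvm_timer_fn(). */
#include <linux/hrtimer.h>

static u64 my_period_ns;        /* assumed to be set when the timer is armed */
static bool my_timer_periodic;  /* assumed mode flag */

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
        /* ... deliver or record the tick here ... */

        if (my_timer_periodic) {
                hrtimer_add_expires_ns(t, my_period_ns);
                return HRTIMER_RESTART;
        }
        return HRTIMER_NORESTART;
}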
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 851aa7c3b890..ad6b1dd06f8b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO);
127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 127static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
128module_param(ple_window, int, S_IRUGO); 128module_param(ple_window, int, S_IRUGO);
129 129
130extern const ulong vmx_return;
131
130#define NR_AUTOLOAD_MSRS 8 132#define NR_AUTOLOAD_MSRS 8
131#define VMCS02_POOL_SIZE 1 133#define VMCS02_POOL_SIZE 1
132 134
@@ -405,16 +407,16 @@ struct vcpu_vmx {
405 struct { 407 struct {
406 int vm86_active; 408 int vm86_active;
407 ulong save_rflags; 409 ulong save_rflags;
410 struct kvm_segment segs[8];
411 } rmode;
412 struct {
413 u32 bitmask; /* 4 bits per segment (1 bit per field) */
408 struct kvm_save_segment { 414 struct kvm_save_segment {
409 u16 selector; 415 u16 selector;
410 unsigned long base; 416 unsigned long base;
411 u32 limit; 417 u32 limit;
412 u32 ar; 418 u32 ar;
413 } tr, es, ds, fs, gs; 419 } seg[8];
414 } rmode;
415 struct {
416 u32 bitmask; /* 4 bits per segment (1 bit per field) */
417 struct kvm_save_segment seg[8];
418 } segment_cache; 420 } segment_cache;
419 int vpid; 421 int vpid;
420 bool emulation_required; 422 bool emulation_required;
@@ -450,7 +452,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
450#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ 452#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
451 [number##_HIGH] = VMCS12_OFFSET(name)+4 453 [number##_HIGH] = VMCS12_OFFSET(name)+4
452 454
453static unsigned short vmcs_field_to_offset_table[] = { 455static const unsigned short vmcs_field_to_offset_table[] = {
454 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), 456 FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
455 FIELD(GUEST_ES_SELECTOR, guest_es_selector), 457 FIELD(GUEST_ES_SELECTOR, guest_es_selector),
456 FIELD(GUEST_CS_SELECTOR, guest_cs_selector), 458 FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
@@ -596,10 +598,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
596static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) 598static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
597{ 599{
598 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); 600 struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
599 if (is_error_page(page)) { 601 if (is_error_page(page))
600 kvm_release_page_clean(page);
601 return NULL; 602 return NULL;
602 } 603
603 return page; 604 return page;
604} 605}
605 606
@@ -667,7 +668,7 @@ static struct vmx_capability {
667 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 668 .ar_bytes = GUEST_##seg##_AR_BYTES, \
668 } 669 }
669 670
670static struct kvm_vmx_segment_field { 671static const struct kvm_vmx_segment_field {
671 unsigned selector; 672 unsigned selector;
672 unsigned base; 673 unsigned base;
673 unsigned limit; 674 unsigned limit;
@@ -1343,7 +1344,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1343 guest_efer = vmx->vcpu.arch.efer; 1344 guest_efer = vmx->vcpu.arch.efer;
1344 1345
1345 /* 1346 /*
1346 * NX is emulated; LMA and LME handled by hardware; SCE meaninless 1347 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
1347 * outside long mode 1348 * outside long mode
1348 */ 1349 */
1349 ignore_bits = EFER_NX | EFER_SCE; 1350 ignore_bits = EFER_NX | EFER_SCE;
@@ -1995,7 +1996,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
1995#endif 1996#endif
1996 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 1997 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
1997 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 1998 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
1998 CPU_BASED_RDPMC_EXITING | 1999 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
1999 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2000 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2000 /* 2001 /*
2001 * We can allow some features even when not supported by the 2002 * We can allow some features even when not supported by the
@@ -2291,16 +2292,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2291 } 2292 }
2292} 2293}
2293 2294
2294static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
2295{
2296 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
2297 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
2298 else
2299 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2300
2301 update_exception_bitmap(vcpu);
2302}
2303
2304static __init int cpu_has_kvm_support(void) 2295static __init int cpu_has_kvm_support(void)
2305{ 2296{
2306 return cpu_has_vmx(); 2297 return cpu_has_vmx();
@@ -2698,20 +2689,17 @@ static __exit void hardware_unsetup(void)
2698 free_kvm_area(); 2689 free_kvm_area();
2699} 2690}
2700 2691
2701static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) 2692static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
2702{ 2693{
2703 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2694 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2695 struct kvm_segment tmp = *save;
2704 2696
2705 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { 2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2706 vmcs_write16(sf->selector, save->selector); 2698 tmp.base = vmcs_readl(sf->base);
2707 vmcs_writel(sf->base, save->base); 2699 tmp.selector = vmcs_read16(sf->selector);
2708 vmcs_write32(sf->limit, save->limit); 2700 tmp.s = 1;
2709 vmcs_write32(sf->ar_bytes, save->ar);
2710 } else {
2711 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
2712 << AR_DPL_SHIFT;
2713 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
2714 } 2701 }
2702 vmx_set_segment(vcpu, &tmp, seg);
2715} 2703}
2716 2704
2717static void enter_pmode(struct kvm_vcpu *vcpu) 2705static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2724,10 +2712,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2724 2712
2725 vmx_segment_cache_clear(vmx); 2713 vmx_segment_cache_clear(vmx);
2726 2714
2727 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); 2715 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2728 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
2729 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
2730 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
2731 2716
2732 flags = vmcs_readl(GUEST_RFLAGS); 2717 flags = vmcs_readl(GUEST_RFLAGS);
2733 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2718 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2742,10 +2727,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
2742 if (emulate_invalid_guest_state) 2727 if (emulate_invalid_guest_state)
2743 return; 2728 return;
2744 2729
2745 fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); 2730 fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2746 fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); 2731 fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2747 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 2732 fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2748 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 2733 fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2749 2734
2750 vmx_segment_cache_clear(vmx); 2735 vmx_segment_cache_clear(vmx);
2751 2736
@@ -2773,14 +2758,10 @@ static gva_t rmode_tss_base(struct kvm *kvm)
2773 return kvm->arch.tss_addr; 2758 return kvm->arch.tss_addr;
2774} 2759}
2775 2760
2776static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 2761static void fix_rmode_seg(int seg, struct kvm_segment *save)
2777{ 2762{
2778 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2763 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2779 2764
2780 save->selector = vmcs_read16(sf->selector);
2781 save->base = vmcs_readl(sf->base);
2782 save->limit = vmcs_read32(sf->limit);
2783 save->ar = vmcs_read32(sf->ar_bytes);
2784 vmcs_write16(sf->selector, save->base >> 4); 2765 vmcs_write16(sf->selector, save->base >> 4);
2785 vmcs_write32(sf->base, save->base & 0xffff0); 2766 vmcs_write32(sf->base, save->base & 0xffff0);
2786 vmcs_write32(sf->limit, 0xffff); 2767 vmcs_write32(sf->limit, 0xffff);
@@ -2800,9 +2781,16 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2800 if (enable_unrestricted_guest) 2781 if (enable_unrestricted_guest)
2801 return; 2782 return;
2802 2783
2784 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2785 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2786 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2787 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2788 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2789
2803 vmx->emulation_required = 1; 2790 vmx->emulation_required = 1;
2804 vmx->rmode.vm86_active = 1; 2791 vmx->rmode.vm86_active = 1;
2805 2792
2793
2806 /* 2794 /*
2807 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2795 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2808 * vcpu. Call it here with phys address pointing 16M below 4G. 2796 * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2817,14 +2805,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
2817 2805
2818 vmx_segment_cache_clear(vmx); 2806 vmx_segment_cache_clear(vmx);
2819 2807
2820 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
2821 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
2822 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 2808 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
2823
2824 vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
2825 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 2809 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
2826
2827 vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
2828 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 2810 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2829 2811
2830 flags = vmcs_readl(GUEST_RFLAGS); 2812 flags = vmcs_readl(GUEST_RFLAGS);
@@ -3117,35 +3099,24 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3117 struct kvm_segment *var, int seg) 3099 struct kvm_segment *var, int seg)
3118{ 3100{
3119 struct vcpu_vmx *vmx = to_vmx(vcpu); 3101 struct vcpu_vmx *vmx = to_vmx(vcpu);
3120 struct kvm_save_segment *save;
3121 u32 ar; 3102 u32 ar;
3122 3103
3123 if (vmx->rmode.vm86_active 3104 if (vmx->rmode.vm86_active
3124 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES 3105 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3125 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS 3106 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3126 || seg == VCPU_SREG_GS) 3107 || seg == VCPU_SREG_GS)) {
3127 && !emulate_invalid_guest_state) { 3108 *var = vmx->rmode.segs[seg];
3128 switch (seg) {
3129 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
3130 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
3131 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
3132 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
3133 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
3134 default: BUG();
3135 }
3136 var->selector = save->selector;
3137 var->base = save->base;
3138 var->limit = save->limit;
3139 ar = save->ar;
3140 if (seg == VCPU_SREG_TR 3109 if (seg == VCPU_SREG_TR
3141 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3110 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3142 goto use_saved_rmode_seg; 3111 return;
3112 var->base = vmx_read_guest_seg_base(vmx, seg);
3113 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3114 return;
3143 } 3115 }
3144 var->base = vmx_read_guest_seg_base(vmx, seg); 3116 var->base = vmx_read_guest_seg_base(vmx, seg);
3145 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3117 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3146 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3118 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3147 ar = vmx_read_guest_seg_ar(vmx, seg); 3119 ar = vmx_read_guest_seg_ar(vmx, seg);
3148use_saved_rmode_seg:
3149 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 3120 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3150 ar = 0; 3121 ar = 0;
3151 var->type = ar & 15; 3122 var->type = ar & 15;
@@ -3227,23 +3198,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3227 struct kvm_segment *var, int seg) 3198 struct kvm_segment *var, int seg)
3228{ 3199{
3229 struct vcpu_vmx *vmx = to_vmx(vcpu); 3200 struct vcpu_vmx *vmx = to_vmx(vcpu);
3230 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3201 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3231 u32 ar; 3202 u32 ar;
3232 3203
3233 vmx_segment_cache_clear(vmx); 3204 vmx_segment_cache_clear(vmx);
3234 3205
3235 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3206 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
3236 vmcs_write16(sf->selector, var->selector); 3207 vmcs_write16(sf->selector, var->selector);
3237 vmx->rmode.tr.selector = var->selector; 3208 vmx->rmode.segs[VCPU_SREG_TR] = *var;
3238 vmx->rmode.tr.base = var->base;
3239 vmx->rmode.tr.limit = var->limit;
3240 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
3241 return; 3209 return;
3242 } 3210 }
3243 vmcs_writel(sf->base, var->base); 3211 vmcs_writel(sf->base, var->base);
3244 vmcs_write32(sf->limit, var->limit); 3212 vmcs_write32(sf->limit, var->limit);
3245 vmcs_write16(sf->selector, var->selector); 3213 vmcs_write16(sf->selector, var->selector);
3246 if (vmx->rmode.vm86_active && var->s) { 3214 if (vmx->rmode.vm86_active && var->s) {
3215 vmx->rmode.segs[seg] = *var;
3247 /* 3216 /*
3248 * Hack real-mode segments into vm86 compatibility. 3217 * Hack real-mode segments into vm86 compatibility.
3249 */ 3218 */
@@ -3258,7 +3227,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3258 * qemu binaries. 3227 * qemu binaries.
3259 * IA32 arch specifies that at the time of processor reset the 3228 * IA32 arch specifies that at the time of processor reset the
3260 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3229 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3261 * is setting it to 0 in the usedland code. This causes invalid guest 3230 * is setting it to 0 in the userland code. This causes invalid guest
3262 * state vmexit when "unrestricted guest" mode is turned on. 3231 * state vmexit when "unrestricted guest" mode is turned on.
3263 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3232 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3264 * tree. Newer qemu binaries with that qemu fix would not need this 3233 * tree. Newer qemu binaries with that qemu fix would not need this
@@ -3288,16 +3257,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3288 vmcs_readl(GUEST_CS_BASE) >> 4); 3257 vmcs_readl(GUEST_CS_BASE) >> 4);
3289 break; 3258 break;
3290 case VCPU_SREG_ES: 3259 case VCPU_SREG_ES:
3291 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
3292 break;
3293 case VCPU_SREG_DS: 3260 case VCPU_SREG_DS:
3294 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
3295 break;
3296 case VCPU_SREG_GS: 3261 case VCPU_SREG_GS:
3297 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
3298 break;
3299 case VCPU_SREG_FS: 3262 case VCPU_SREG_FS:
3300 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); 3263 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3301 break; 3264 break;
3302 case VCPU_SREG_SS: 3265 case VCPU_SREG_SS:
3303 vmcs_write16(GUEST_SS_SELECTOR, 3266 vmcs_write16(GUEST_SS_SELECTOR,
@@ -3351,9 +3314,9 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3351 3314
3352 if (var.base != (var.selector << 4)) 3315 if (var.base != (var.selector << 4))
3353 return false; 3316 return false;
3354 if (var.limit != 0xffff) 3317 if (var.limit < 0xffff)
3355 return false; 3318 return false;
3356 if (ar != 0xf3) 3319 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
3357 return false; 3320 return false;
3358 3321
3359 return true; 3322 return true;
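The tightened check above compares the access-rights value against 0xf3 (present, DPL 3, accessed writable data) while ignoring the DPL, granularity and default-size bits, and now accepts any limit of at least 0xffff. As a worked illustration of the bit arithmetic (the mask values follow the usual VMX AR-field layout and should be treated as assumptions here):

/* Illustration of the relaxed AR comparison; bit positions assumed. */
#define AR_DPL_SHIFT 5
#define AR_DB_MASK   (1u << 14)      /* default operation size */
#define AR_G_MASK    (1u << 15)      /* granularity */

static int rmode_ar_ok(unsigned int ar)
{
        /* Force DPL to 3 and ignore G/DB, then require the vm86 template 0xf3. */
        return ((ar | (3u << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) == 0xf3;
}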
@@ -3605,7 +3568,7 @@ out:
3605 3568
3606static void seg_setup(int seg) 3569static void seg_setup(int seg)
3607{ 3570{
3608 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3571 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3609 unsigned int ar; 3572 unsigned int ar;
3610 3573
3611 vmcs_write16(sf->selector, 0); 3574 vmcs_write16(sf->selector, 0);
@@ -3770,8 +3733,7 @@ static void vmx_set_constant_host_state(void)
3770 native_store_idt(&dt); 3733 native_store_idt(&dt);
3771 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 3734 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
3772 3735
3773 asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); 3736 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
3774 vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
3775 3737
3776 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 3738 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3777 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 3739 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
@@ -4005,8 +3967,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4005 kvm_rip_write(vcpu, 0); 3967 kvm_rip_write(vcpu, 0);
4006 kvm_register_write(vcpu, VCPU_REGS_RSP, 0); 3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
4007 3969
4008 vmcs_writel(GUEST_DR7, 0x400);
4009
4010 vmcs_writel(GUEST_GDTR_BASE, 0); 3970 vmcs_writel(GUEST_GDTR_BASE, 0);
4011 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4012 3972
@@ -4456,7 +4416,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4456 hypercall[2] = 0xc1; 4416 hypercall[2] = 0xc1;
4457} 4417}
4458 4418
4459/* called to set cr0 as approriate for a mov-to-cr0 exit. */ 4419/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4460static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4420static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4461{ 4421{
4462 if (to_vmx(vcpu)->nested.vmxon && 4422 if (to_vmx(vcpu)->nested.vmxon &&
@@ -5701,7 +5661,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
5701 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5661 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5702 * to be done to userspace and return 0. 5662 * to be done to userspace and return 0.
5703 */ 5663 */
5704static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5664static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5705 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 5665 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
5706 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5666 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
5707 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5667 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -6229,17 +6189,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
6229 msrs[i].host); 6189 msrs[i].host);
6230} 6190}
6231 6191
6232#ifdef CONFIG_X86_64
6233#define R "r"
6234#define Q "q"
6235#else
6236#define R "e"
6237#define Q "l"
6238#endif
6239
6240static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 6192static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6241{ 6193{
6242 struct vcpu_vmx *vmx = to_vmx(vcpu); 6194 struct vcpu_vmx *vmx = to_vmx(vcpu);
6195 unsigned long debugctlmsr;
6243 6196
6244 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { 6197 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
6245 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6198 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6279,34 +6232,35 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6279 vmx_set_interrupt_shadow(vcpu, 0); 6232 vmx_set_interrupt_shadow(vcpu, 0);
6280 6233
6281 atomic_switch_perf_msrs(vmx); 6234 atomic_switch_perf_msrs(vmx);
6235 debugctlmsr = get_debugctlmsr();
6282 6236
6283 vmx->__launched = vmx->loaded_vmcs->launched; 6237 vmx->__launched = vmx->loaded_vmcs->launched;
6284 asm( 6238 asm(
6285 /* Store host registers */ 6239 /* Store host registers */
6286 "push %%"R"dx; push %%"R"bp;" 6240 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
6287 "push %%"R"cx \n\t" /* placeholder for guest rcx */ 6241 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
6288 "push %%"R"cx \n\t" 6242 "push %%" _ASM_CX " \n\t"
6289 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 6243 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6290 "je 1f \n\t" 6244 "je 1f \n\t"
6291 "mov %%"R"sp, %c[host_rsp](%0) \n\t" 6245 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
6292 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 6246 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
6293 "1: \n\t" 6247 "1: \n\t"
6294 /* Reload cr2 if changed */ 6248 /* Reload cr2 if changed */
6295 "mov %c[cr2](%0), %%"R"ax \n\t" 6249 "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
6296 "mov %%cr2, %%"R"dx \n\t" 6250 "mov %%cr2, %%" _ASM_DX " \n\t"
6297 "cmp %%"R"ax, %%"R"dx \n\t" 6251 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
6298 "je 2f \n\t" 6252 "je 2f \n\t"
6299 "mov %%"R"ax, %%cr2 \n\t" 6253 "mov %%" _ASM_AX", %%cr2 \n\t"
6300 "2: \n\t" 6254 "2: \n\t"
6301 /* Check if vmlaunch of vmresume is needed */ 6255 /* Check if vmlaunch of vmresume is needed */
6302 "cmpl $0, %c[launched](%0) \n\t" 6256 "cmpl $0, %c[launched](%0) \n\t"
6303 /* Load guest registers. Don't clobber flags. */ 6257 /* Load guest registers. Don't clobber flags. */
6304 "mov %c[rax](%0), %%"R"ax \n\t" 6258 "mov %c[rax](%0), %%" _ASM_AX " \n\t"
6305 "mov %c[rbx](%0), %%"R"bx \n\t" 6259 "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
6306 "mov %c[rdx](%0), %%"R"dx \n\t" 6260 "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
6307 "mov %c[rsi](%0), %%"R"si \n\t" 6261 "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
6308 "mov %c[rdi](%0), %%"R"di \n\t" 6262 "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
6309 "mov %c[rbp](%0), %%"R"bp \n\t" 6263 "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
6310#ifdef CONFIG_X86_64 6264#ifdef CONFIG_X86_64
6311 "mov %c[r8](%0), %%r8 \n\t" 6265 "mov %c[r8](%0), %%r8 \n\t"
6312 "mov %c[r9](%0), %%r9 \n\t" 6266 "mov %c[r9](%0), %%r9 \n\t"
@@ -6317,24 +6271,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6317 "mov %c[r14](%0), %%r14 \n\t" 6271 "mov %c[r14](%0), %%r14 \n\t"
6318 "mov %c[r15](%0), %%r15 \n\t" 6272 "mov %c[r15](%0), %%r15 \n\t"
6319#endif 6273#endif
6320 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ 6274 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
6321 6275
6322 /* Enter guest mode */ 6276 /* Enter guest mode */
6323 "jne .Llaunched \n\t" 6277 "jne 1f \n\t"
6324 __ex(ASM_VMX_VMLAUNCH) "\n\t" 6278 __ex(ASM_VMX_VMLAUNCH) "\n\t"
6325 "jmp .Lkvm_vmx_return \n\t" 6279 "jmp 2f \n\t"
6326 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 6280 "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
6327 ".Lkvm_vmx_return: " 6281 "2: "
6328 /* Save guest registers, load host registers, keep flags */ 6282 /* Save guest registers, load host registers, keep flags */
6329 "mov %0, %c[wordsize](%%"R"sp) \n\t" 6283 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
6330 "pop %0 \n\t" 6284 "pop %0 \n\t"
6331 "mov %%"R"ax, %c[rax](%0) \n\t" 6285 "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
6332 "mov %%"R"bx, %c[rbx](%0) \n\t" 6286 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
6333 "pop"Q" %c[rcx](%0) \n\t" 6287 __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
6334 "mov %%"R"dx, %c[rdx](%0) \n\t" 6288 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
6335 "mov %%"R"si, %c[rsi](%0) \n\t" 6289 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
6336 "mov %%"R"di, %c[rdi](%0) \n\t" 6290 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
6337 "mov %%"R"bp, %c[rbp](%0) \n\t" 6291 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
6338#ifdef CONFIG_X86_64 6292#ifdef CONFIG_X86_64
6339 "mov %%r8, %c[r8](%0) \n\t" 6293 "mov %%r8, %c[r8](%0) \n\t"
6340 "mov %%r9, %c[r9](%0) \n\t" 6294 "mov %%r9, %c[r9](%0) \n\t"
@@ -6345,11 +6299,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6345 "mov %%r14, %c[r14](%0) \n\t" 6299 "mov %%r14, %c[r14](%0) \n\t"
6346 "mov %%r15, %c[r15](%0) \n\t" 6300 "mov %%r15, %c[r15](%0) \n\t"
6347#endif 6301#endif
6348 "mov %%cr2, %%"R"ax \n\t" 6302 "mov %%cr2, %%" _ASM_AX " \n\t"
6349 "mov %%"R"ax, %c[cr2](%0) \n\t" 6303 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
6350 6304
6351 "pop %%"R"bp; pop %%"R"dx \n\t" 6305 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
6352 "setbe %c[fail](%0) \n\t" 6306 "setbe %c[fail](%0) \n\t"
6307 ".pushsection .rodata \n\t"
6308 ".global vmx_return \n\t"
6309 "vmx_return: " _ASM_PTR " 2b \n\t"
6310 ".popsection"
6353 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 6311 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
6354 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 6312 [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
6355 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 6313 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
@@ -6374,12 +6332,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6374 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 6332 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
6375 [wordsize]"i"(sizeof(ulong)) 6333 [wordsize]"i"(sizeof(ulong))
6376 : "cc", "memory" 6334 : "cc", "memory"
6377 , R"ax", R"bx", R"di", R"si"
6378#ifdef CONFIG_X86_64 6335#ifdef CONFIG_X86_64
6336 , "rax", "rbx", "rdi", "rsi"
6379 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 6337 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
6338#else
6339 , "eax", "ebx", "edi", "esi"
6380#endif 6340#endif
6381 ); 6341 );
6382 6342
6343 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6344 if (debugctlmsr)
6345 update_debugctlmsr(debugctlmsr);
6346
6383#ifndef CONFIG_X86_64 6347#ifndef CONFIG_X86_64
6384 /* 6348 /*
6385 * The sysexit path does not restore ds/es, so we must set them to 6349 * The sysexit path does not restore ds/es, so we must set them to
@@ -6424,9 +6388,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6424 vmx_complete_interrupts(vmx); 6388 vmx_complete_interrupts(vmx);
6425} 6389}
6426 6390
6427#undef R
6428#undef Q
6429
6430static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6391static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
6431{ 6392{
6432 struct vcpu_vmx *vmx = to_vmx(vcpu); 6393 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7281,7 +7242,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7281 .vcpu_load = vmx_vcpu_load, 7242 .vcpu_load = vmx_vcpu_load,
7282 .vcpu_put = vmx_vcpu_put, 7243 .vcpu_put = vmx_vcpu_put,
7283 7244
7284 .set_guest_debug = set_guest_debug, 7245 .update_db_bp_intercept = update_exception_bitmap,
7285 .get_msr = vmx_get_msr, 7246 .get_msr = vmx_get_msr,
7286 .set_msr = vmx_set_msr, 7247 .set_msr = vmx_set_msr,
7287 .get_segment_base = vmx_get_segment_base, 7248 .get_segment_base = vmx_get_segment_base,
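The vmx_vcpu_run() hunks above replace the named .Lkvm_vmx_return label with a numeric local label whose address is published from the inline assembly as a read-only global, vmx_return, which vmx_set_constant_host_state() then writes into HOST_RIP. A minimal userspace sketch of that publishing technique (symbol name is illustrative; building non-PIE, e.g. with -no-pie, keeps the absolute .quad reference simple):

/* Sketch: export the address of a local asm label as a global constant. */
#include <stdio.h>

extern const unsigned long demo_return;

static void emit_label(void)
{
        asm volatile("jmp 2f \n\t"
                     "2: \n\t"
                     ".pushsection .rodata \n\t"
                     ".global demo_return \n\t"
                     "demo_return: .quad 2b \n\t"
                     ".popsection");
}

int main(void)
{
        emit_label();
        printf("label lives at %#lx\n", demo_return);
        return 0;
}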
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f09552572fa..1eefebe5d727 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -246,20 +246,14 @@ static void drop_user_return_notifiers(void *ignore)
246 246
247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 247u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
248{ 248{
249 if (irqchip_in_kernel(vcpu->kvm)) 249 return vcpu->arch.apic_base;
250 return vcpu->arch.apic_base;
251 else
252 return vcpu->arch.apic_base;
253} 250}
254EXPORT_SYMBOL_GPL(kvm_get_apic_base); 251EXPORT_SYMBOL_GPL(kvm_get_apic_base);
255 252
256void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 253void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
257{ 254{
258 /* TODO: reserve bits check */ 255 /* TODO: reserve bits check */
259 if (irqchip_in_kernel(vcpu->kvm)) 256 kvm_lapic_set_base(vcpu, data);
260 kvm_lapic_set_base(vcpu, data);
261 else
262 vcpu->arch.apic_base = data;
263} 257}
264EXPORT_SYMBOL_GPL(kvm_set_apic_base); 258EXPORT_SYMBOL_GPL(kvm_set_apic_base);
265 259
@@ -698,6 +692,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
698} 692}
699EXPORT_SYMBOL_GPL(kvm_get_cr8); 693EXPORT_SYMBOL_GPL(kvm_get_cr8);
700 694
695static void kvm_update_dr7(struct kvm_vcpu *vcpu)
696{
697 unsigned long dr7;
698
699 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
700 dr7 = vcpu->arch.guest_debug_dr7;
701 else
702 dr7 = vcpu->arch.dr7;
703 kvm_x86_ops->set_dr7(vcpu, dr7);
704 vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
705}
706
701static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 707static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
702{ 708{
703 switch (dr) { 709 switch (dr) {
@@ -723,10 +729,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
723 if (val & 0xffffffff00000000ULL) 729 if (val & 0xffffffff00000000ULL)
724 return -1; /* #GP */ 730 return -1; /* #GP */
725 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 731 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
726 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 732 kvm_update_dr7(vcpu);
727 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
728 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
729 }
730 break; 733 break;
731 } 734 }
732 735
@@ -823,7 +826,7 @@ static u32 msrs_to_save[] = {
823 826
824static unsigned num_msrs_to_save; 827static unsigned num_msrs_to_save;
825 828
826static u32 emulated_msrs[] = { 829static const u32 emulated_msrs[] = {
827 MSR_IA32_TSCDEADLINE, 830 MSR_IA32_TSCDEADLINE,
828 MSR_IA32_MISC_ENABLE, 831 MSR_IA32_MISC_ENABLE,
829 MSR_IA32_MCG_STATUS, 832 MSR_IA32_MCG_STATUS,
@@ -1097,7 +1100,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1097 * For each generation, we track the original measured 1100 * For each generation, we track the original measured
1098 * nanosecond time, offset, and write, so if TSCs are in 1101 * nanosecond time, offset, and write, so if TSCs are in
1099 * sync, we can match exact offset, and if not, we can match 1102 * sync, we can match exact offset, and if not, we can match
1100 * exact software computaion in compute_guest_tsc() 1103 * exact software computation in compute_guest_tsc()
1101 * 1104 *
1102 * These values are tracked in kvm->arch.cur_xxx variables. 1105 * These values are tracked in kvm->arch.cur_xxx variables.
1103 */ 1106 */
@@ -1140,6 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1140 unsigned long this_tsc_khz; 1143 unsigned long this_tsc_khz;
1141 s64 kernel_ns, max_kernel_ns; 1144 s64 kernel_ns, max_kernel_ns;
1142 u64 tsc_timestamp; 1145 u64 tsc_timestamp;
1146 u8 pvclock_flags;
1143 1147
1144 /* Keep irq disabled to prevent changes to the clock */ 1148 /* Keep irq disabled to prevent changes to the clock */
1145 local_irq_save(flags); 1149 local_irq_save(flags);
@@ -1221,7 +1225,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1221 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1222 vcpu->last_kernel_ns = kernel_ns; 1226 vcpu->last_kernel_ns = kernel_ns;
1223 vcpu->last_guest_tsc = tsc_timestamp; 1227 vcpu->last_guest_tsc = tsc_timestamp;
1224 vcpu->hv_clock.flags = 0; 1228
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1225 1236
1226 /* 1237 /*
1227 * The interface expects us to write an even number signaling that the 1238 * The interface expects us to write an even number signaling that the
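The flags handling above feeds into the update protocol the trailing comment refers to: the version field is bumped to an odd value before the structure is touched and back to an even value afterwards, so a guest reader retries until it observes a stable, even version. A compact sketch of both sides of that protocol (field and helper names are illustrative, and real code also needs memory barriers around these accesses):

/* Seqcount-style pvclock update protocol, illustrative names only. */
#include <stdint.h>

struct demo_clock {
        volatile uint32_t version;      /* odd while an update is in flight */
        volatile uint64_t system_time;
        volatile uint32_t flags;
};

static void writer_update(struct demo_clock *c, uint64_t now, uint32_t flags)
{
        c->version++;                   /* becomes odd: update in progress */
        c->system_time = now;
        c->flags = flags;
        c->version++;                   /* becomes even: contents are stable */
}

static uint64_t reader_snapshot(const struct demo_clock *c)
{
        uint32_t v;
        uint64_t t;

        do {
                v = c->version;
                t = c->system_time;
        } while ((v & 1) || v != c->version);
        return t;
}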
@@ -1504,7 +1515,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1504{ 1515{
1505 gpa_t gpa = data & ~0x3f; 1516 gpa_t gpa = data & ~0x3f;
1506 1517
1507 /* Bits 2:5 are resrved, Should be zero */ 1518 /* Bits 2:5 are reserved, Should be zero */
1508 if (data & 0x3c) 1519 if (data & 0x3c)
1509 return 1; 1520 return 1;
1510 1521
@@ -1639,10 +1650,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1639 vcpu->arch.time_page = 1650 vcpu->arch.time_page =
1640 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 1651 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1641 1652
1642 if (is_error_page(vcpu->arch.time_page)) { 1653 if (is_error_page(vcpu->arch.time_page))
1643 kvm_release_page_clean(vcpu->arch.time_page);
1644 vcpu->arch.time_page = NULL; 1654 vcpu->arch.time_page = NULL;
1645 } 1655
1646 break; 1656 break;
1647 } 1657 }
1648 case MSR_KVM_ASYNC_PF_EN: 1658 case MSR_KVM_ASYNC_PF_EN:
@@ -1727,7 +1737,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1727 * Ignore all writes to this no longer documented MSR. 1737 * Ignore all writes to this no longer documented MSR.
1728 * Writes are only relevant for old K7 processors, 1738 * Writes are only relevant for old K7 processors,
1729 * all pre-dating SVM, but a recommended workaround from 1739 * all pre-dating SVM, but a recommended workaround from
1730 * AMD for these chips. It is possible to speicify the 1740 * AMD for these chips. It is possible to specify the
1731 * affected processor models on the command line, hence 1741 * affected processor models on the command line, hence
1732 * the need to ignore the workaround. 1742 * the need to ignore the workaround.
1733 */ 1743 */
@@ -2177,6 +2187,8 @@ int kvm_dev_ioctl_check_extension(long ext)
2177 case KVM_CAP_GET_TSC_KHZ: 2187 case KVM_CAP_GET_TSC_KHZ:
2178 case KVM_CAP_PCI_2_3: 2188 case KVM_CAP_PCI_2_3:
2179 case KVM_CAP_KVMCLOCK_CTRL: 2189 case KVM_CAP_KVMCLOCK_CTRL:
2190 case KVM_CAP_READONLY_MEM:
2191 case KVM_CAP_IRQFD_RESAMPLE:
2180 r = 1; 2192 r = 1;
2181 break; 2193 break;
2182 case KVM_CAP_COALESCED_MMIO: 2194 case KVM_CAP_COALESCED_MMIO:
@@ -2358,8 +2370,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2358static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2370static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2359 struct kvm_lapic_state *s) 2371 struct kvm_lapic_state *s)
2360{ 2372{
2361 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2373 kvm_apic_post_state_restore(vcpu, s);
2362 kvm_apic_post_state_restore(vcpu);
2363 update_cr8_intercept(vcpu); 2374 update_cr8_intercept(vcpu);
2364 2375
2365 return 0; 2376 return 0;
@@ -2368,7 +2379,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2368static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, 2379static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2369 struct kvm_interrupt *irq) 2380 struct kvm_interrupt *irq)
2370{ 2381{
2371 if (irq->irq < 0 || irq->irq >= 256) 2382 if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
2372 return -EINVAL; 2383 return -EINVAL;
2373 if (irqchip_in_kernel(vcpu->kvm)) 2384 if (irqchip_in_kernel(vcpu->kvm))
2374 return -ENXIO; 2385 return -ENXIO;
@@ -2635,11 +2646,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2635 */ 2646 */
2636static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) 2647static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
2637{ 2648{
2638 struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
2639 if (!vcpu->arch.time_page) 2649 if (!vcpu->arch.time_page)
2640 return -EINVAL; 2650 return -EINVAL;
2641 src->flags |= PVCLOCK_GUEST_STOPPED; 2651 vcpu->arch.pvclock_set_guest_stopped_request = true;
2642 mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
2643 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2652 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2644 return 0; 2653 return 0;
2645} 2654}
@@ -3090,7 +3099,7 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3090 if (!kvm->arch.vpit) 3099 if (!kvm->arch.vpit)
3091 return -ENXIO; 3100 return -ENXIO;
3092 mutex_lock(&kvm->arch.vpit->pit_state.lock); 3101 mutex_lock(&kvm->arch.vpit->pit_state.lock);
3093 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 3102 kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3094 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3103 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3095 return 0; 3104 return 0;
3096} 3105}
@@ -3173,6 +3182,16 @@ out:
3173 return r; 3182 return r;
3174} 3183}
3175 3184
3185int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
3186{
3187 if (!irqchip_in_kernel(kvm))
3188 return -ENXIO;
3189
3190 irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3191 irq_event->irq, irq_event->level);
3192 return 0;
3193}
3194
3176long kvm_arch_vm_ioctl(struct file *filp, 3195long kvm_arch_vm_ioctl(struct file *filp,
3177 unsigned int ioctl, unsigned long arg) 3196 unsigned int ioctl, unsigned long arg)
3178{ 3197{
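With the common kvm_vm_ioctl_irq_line() helper added above, the arch ioctl switch no longer open-codes the KVM_IRQ_LINE / KVM_IRQ_LINE_STATUS handling removed in the next hunk. From userspace the call looks roughly like this (sketch; error handling and the surrounding VM setup are omitted):

/* Userspace sketch: assert then deassert a GSI on an in-kernel-irqchip VM. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int pulse_irq(int vm_fd, unsigned int gsi)
{
        struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

        if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
                return -1;
        irq.level = 0;
        return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}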
@@ -3279,29 +3298,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3279 create_pit_unlock: 3298 create_pit_unlock:
3280 mutex_unlock(&kvm->slots_lock); 3299 mutex_unlock(&kvm->slots_lock);
3281 break; 3300 break;
3282 case KVM_IRQ_LINE_STATUS:
3283 case KVM_IRQ_LINE: {
3284 struct kvm_irq_level irq_event;
3285
3286 r = -EFAULT;
3287 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3288 goto out;
3289 r = -ENXIO;
3290 if (irqchip_in_kernel(kvm)) {
3291 __s32 status;
3292 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3293 irq_event.irq, irq_event.level);
3294 if (ioctl == KVM_IRQ_LINE_STATUS) {
3295 r = -EFAULT;
3296 irq_event.status = status;
3297 if (copy_to_user(argp, &irq_event,
3298 sizeof irq_event))
3299 goto out;
3300 }
3301 r = 0;
3302 }
3303 break;
3304 }
3305 case KVM_GET_IRQCHIP: { 3301 case KVM_GET_IRQCHIP: {
3306 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 3302 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3307 struct kvm_irqchip *chip; 3303 struct kvm_irqchip *chip;
@@ -3689,20 +3685,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
3689 gpa_t *gpa, struct x86_exception *exception, 3685 gpa_t *gpa, struct x86_exception *exception,
3690 bool write) 3686 bool write)
3691{ 3687{
3692 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3688 u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
3689 | (write ? PFERR_WRITE_MASK : 0);
3693 3690
3694 if (vcpu_match_mmio_gva(vcpu, gva) && 3691 if (vcpu_match_mmio_gva(vcpu, gva)
3695 check_write_user_access(vcpu, write, access, 3692 && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
3696 vcpu->arch.access)) {
3697 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | 3693 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
3698 (gva & (PAGE_SIZE - 1)); 3694 (gva & (PAGE_SIZE - 1));
3699 trace_vcpu_match_mmio(gva, *gpa, write, false); 3695 trace_vcpu_match_mmio(gva, *gpa, write, false);
3700 return 1; 3696 return 1;
3701 } 3697 }
3702 3698
3703 if (write)
3704 access |= PFERR_WRITE_MASK;
3705
3706 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); 3699 *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3707 3700
3708 if (*gpa == UNMAPPED_GVA) 3701 if (*gpa == UNMAPPED_GVA)
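The rewritten fast path above folds the write bit into the access mask up front and reuses the cached MMIO translation only when permission_fault() reports no violation. A rough illustration of the mask arithmetic (the bit values follow the x86 page-fault error code; treat the exact macro values as an assumption):

/* Illustrative only: PFERR_* bit positions assumed from the x86 #PF error code. */
#define PFERR_WRITE_MASK (1U << 1)      /* access was a write */
#define PFERR_USER_MASK  (1U << 2)      /* access happened at CPL 3 */

static unsigned int mmio_access_mask(int cpl, int write)
{
        return (cpl == 3 ? PFERR_USER_MASK : 0) |
               (write ? PFERR_WRITE_MASK : 0);
}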
@@ -3790,14 +3783,14 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
3790 return X86EMUL_CONTINUE; 3783 return X86EMUL_CONTINUE;
3791} 3784}
3792 3785
3793static struct read_write_emulator_ops read_emultor = { 3786static const struct read_write_emulator_ops read_emultor = {
3794 .read_write_prepare = read_prepare, 3787 .read_write_prepare = read_prepare,
3795 .read_write_emulate = read_emulate, 3788 .read_write_emulate = read_emulate,
3796 .read_write_mmio = vcpu_mmio_read, 3789 .read_write_mmio = vcpu_mmio_read,
3797 .read_write_exit_mmio = read_exit_mmio, 3790 .read_write_exit_mmio = read_exit_mmio,
3798}; 3791};
3799 3792
3800static struct read_write_emulator_ops write_emultor = { 3793static const struct read_write_emulator_ops write_emultor = {
3801 .read_write_emulate = write_emulate, 3794 .read_write_emulate = write_emulate,
3802 .read_write_mmio = write_mmio, 3795 .read_write_mmio = write_mmio,
3803 .read_write_exit_mmio = write_exit_mmio, 3796 .read_write_exit_mmio = write_exit_mmio,
@@ -3808,7 +3801,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
3808 unsigned int bytes, 3801 unsigned int bytes,
3809 struct x86_exception *exception, 3802 struct x86_exception *exception,
3810 struct kvm_vcpu *vcpu, 3803 struct kvm_vcpu *vcpu,
3811 struct read_write_emulator_ops *ops) 3804 const struct read_write_emulator_ops *ops)
3812{ 3805{
3813 gpa_t gpa; 3806 gpa_t gpa;
3814 int handled, ret; 3807 int handled, ret;
@@ -3857,7 +3850,7 @@ mmio:
3857int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, 3850int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
3858 void *val, unsigned int bytes, 3851 void *val, unsigned int bytes,
3859 struct x86_exception *exception, 3852 struct x86_exception *exception,
3860 struct read_write_emulator_ops *ops) 3853 const struct read_write_emulator_ops *ops)
3861{ 3854{
3862 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 3855 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3863 gpa_t gpa; 3856 gpa_t gpa;
@@ -3962,10 +3955,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
3962 goto emul_write; 3955 goto emul_write;
3963 3956
3964 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3957 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3965 if (is_error_page(page)) { 3958 if (is_error_page(page))
3966 kvm_release_page_clean(page);
3967 goto emul_write; 3959 goto emul_write;
3968 }
3969 3960
3970 kaddr = kmap_atomic(page); 3961 kaddr = kmap_atomic(page);
3971 kaddr += offset_in_page(gpa); 3962 kaddr += offset_in_page(gpa);
@@ -4332,7 +4323,19 @@ static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4332 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); 4323 kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4333} 4324}
4334 4325
4335static struct x86_emulate_ops emulate_ops = { 4326static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4327{
4328 return kvm_register_read(emul_to_vcpu(ctxt), reg);
4329}
4330
4331static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4332{
4333 kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4334}
4335
4336static const struct x86_emulate_ops emulate_ops = {
4337 .read_gpr = emulator_read_gpr,
4338 .write_gpr = emulator_write_gpr,
4336 .read_std = kvm_read_guest_virt_system, 4339 .read_std = kvm_read_guest_virt_system,
4337 .write_std = kvm_write_guest_virt_system, 4340 .write_std = kvm_write_guest_virt_system,
4338 .fetch = kvm_fetch_guest_virt, 4341 .fetch = kvm_fetch_guest_virt,
@@ -4367,14 +4370,6 @@ static struct x86_emulate_ops emulate_ops = {
4367 .get_cpuid = emulator_get_cpuid, 4370 .get_cpuid = emulator_get_cpuid,
4368}; 4371};
4369 4372
4370static void cache_all_regs(struct kvm_vcpu *vcpu)
4371{
4372 kvm_register_read(vcpu, VCPU_REGS_RAX);
4373 kvm_register_read(vcpu, VCPU_REGS_RSP);
4374 kvm_register_read(vcpu, VCPU_REGS_RIP);
4375 vcpu->arch.regs_dirty = ~0;
4376}
4377
4378static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 4373static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4379{ 4374{
4380 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 4375 u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@ -4401,12 +4396,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4401 kvm_queue_exception(vcpu, ctxt->exception.vector); 4396 kvm_queue_exception(vcpu, ctxt->exception.vector);
4402} 4397}
4403 4398
4404static void init_decode_cache(struct x86_emulate_ctxt *ctxt, 4399static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4405 const unsigned long *regs)
4406{ 4400{
4407 memset(&ctxt->twobyte, 0, 4401 memset(&ctxt->twobyte, 0,
4408 (void *)&ctxt->regs - (void *)&ctxt->twobyte); 4402 (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
4409 memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4410 4403
4411 ctxt->fetch.start = 0; 4404 ctxt->fetch.start = 0;
4412 ctxt->fetch.end = 0; 4405 ctxt->fetch.end = 0;
@@ -4421,14 +4414,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4421 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4414 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4422 int cs_db, cs_l; 4415 int cs_db, cs_l;
4423 4416
4424 /*
4425 * TODO: fix emulate.c to use guest_read/write_register
4426 * instead of direct ->regs accesses, can save hundred cycles
4427 * on Intel for instructions that don't read/change RSP, for
4428 * for example.
4429 */
4430 cache_all_regs(vcpu);
4431
4432 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4417 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4433 4418
4434 ctxt->eflags = kvm_get_rflags(vcpu); 4419 ctxt->eflags = kvm_get_rflags(vcpu);
@@ -4440,7 +4425,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4440 X86EMUL_MODE_PROT16; 4425 X86EMUL_MODE_PROT16;
4441 ctxt->guest_mode = is_guest_mode(vcpu); 4426 ctxt->guest_mode = is_guest_mode(vcpu);
4442 4427
4443 init_decode_cache(ctxt, vcpu->arch.regs); 4428 init_decode_cache(ctxt);
4444 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4429 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4445} 4430}
4446 4431
@@ -4460,7 +4445,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4460 return EMULATE_FAIL; 4445 return EMULATE_FAIL;
4461 4446
4462 ctxt->eip = ctxt->_eip; 4447 ctxt->eip = ctxt->_eip;
4463 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4464 kvm_rip_write(vcpu, ctxt->eip); 4448 kvm_rip_write(vcpu, ctxt->eip);
4465 kvm_set_rflags(vcpu, ctxt->eflags); 4449 kvm_set_rflags(vcpu, ctxt->eflags);
4466 4450
@@ -4493,13 +4477,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4493static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4477static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4494{ 4478{
4495 gpa_t gpa; 4479 gpa_t gpa;
4480 pfn_t pfn;
4496 4481
4497 if (tdp_enabled) 4482 if (tdp_enabled)
4498 return false; 4483 return false;
4499 4484
4500 /* 4485 /*
4501 * if emulation was due to access to shadowed page table 4486 * if emulation was due to access to shadowed page table
4502 * and it failed try to unshadow page and re-entetr the 4487 * and it failed try to unshadow page and re-enter the
4503 * guest to let CPU execute the instruction. 4488 * guest to let CPU execute the instruction.
4504 */ 4489 */
4505 if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 4490 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4510,8 +4495,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4510 if (gpa == UNMAPPED_GVA) 4495 if (gpa == UNMAPPED_GVA)
4511 return true; /* let cpu generate fault */ 4496 return true; /* let cpu generate fault */
4512 4497
4513 if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) 4498 /*
4499 * Do not retry the unhandleable instruction if it faults on the
4500 * readonly host memory, otherwise it will goto a infinite loop:
4501 * retry instruction -> write #PF -> emulation fail -> retry
4502 * instruction -> ...
4503 */
4504 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4505 if (!is_error_pfn(pfn)) {
4506 kvm_release_pfn_clean(pfn);
4514 return true; 4507 return true;
4508 }
4515 4509
4516 return false; 4510 return false;
4517} 4511}
@@ -4560,6 +4554,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4560 return true; 4554 return true;
4561} 4555}
4562 4556
4557static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
4558static int complete_emulated_pio(struct kvm_vcpu *vcpu);
4559
4563int x86_emulate_instruction(struct kvm_vcpu *vcpu, 4560int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4564 unsigned long cr2, 4561 unsigned long cr2,
4565 int emulation_type, 4562 int emulation_type,
@@ -4608,7 +4605,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4608 changes registers values during IO operation */ 4605 changes registers values during IO operation */
4609 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { 4606 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4610 vcpu->arch.emulate_regs_need_sync_from_vcpu = false; 4607 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4611 memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); 4608 emulator_invalidate_register_cache(ctxt);
4612 } 4609 }
4613 4610
4614restart: 4611restart:
@@ -4630,13 +4627,16 @@ restart:
4630 } else if (vcpu->arch.pio.count) { 4627 } else if (vcpu->arch.pio.count) {
4631 if (!vcpu->arch.pio.in) 4628 if (!vcpu->arch.pio.in)
4632 vcpu->arch.pio.count = 0; 4629 vcpu->arch.pio.count = 0;
4633 else 4630 else {
4634 writeback = false; 4631 writeback = false;
4632 vcpu->arch.complete_userspace_io = complete_emulated_pio;
4633 }
4635 r = EMULATE_DO_MMIO; 4634 r = EMULATE_DO_MMIO;
4636 } else if (vcpu->mmio_needed) { 4635 } else if (vcpu->mmio_needed) {
4637 if (!vcpu->mmio_is_write) 4636 if (!vcpu->mmio_is_write)
4638 writeback = false; 4637 writeback = false;
4639 r = EMULATE_DO_MMIO; 4638 r = EMULATE_DO_MMIO;
4639 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
4640 } else if (r == EMULATION_RESTART) 4640 } else if (r == EMULATION_RESTART)
4641 goto restart; 4641 goto restart;
4642 else 4642 else
@@ -4646,7 +4646,6 @@ restart:
4646 toggle_interruptibility(vcpu, ctxt->interruptibility); 4646 toggle_interruptibility(vcpu, ctxt->interruptibility);
4647 kvm_set_rflags(vcpu, ctxt->eflags); 4647 kvm_set_rflags(vcpu, ctxt->eflags);
4648 kvm_make_request(KVM_REQ_EVENT, vcpu); 4648 kvm_make_request(KVM_REQ_EVENT, vcpu);
4649 memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4650 vcpu->arch.emulate_regs_need_sync_to_vcpu = false; 4649 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4651 kvm_rip_write(vcpu, ctxt->eip); 4650 kvm_rip_write(vcpu, ctxt->eip);
4652 } else 4651 } else
@@ -4929,6 +4928,7 @@ int kvm_arch_init(void *opaque)
4929 if (cpu_has_xsave) 4928 if (cpu_has_xsave)
4930 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4929 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4931 4930
4931 kvm_lapic_init();
4932 return 0; 4932 return 0;
4933 4933
4934out: 4934out:
@@ -5499,6 +5499,24 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+	int r;
+	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+	if (r != EMULATE_DONE)
+		return 0;
+	return 1;
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+	BUG_ON(!vcpu->arch.pio.count);
+
+	return complete_emulated_io(vcpu);
+}
+
 /*
  * Implements the following, as a state machine:
  *
@@ -5515,47 +5533,37 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
  *      copy data
  *      exit
  */
-static int complete_mmio(struct kvm_vcpu *vcpu)
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 {
 	struct kvm_run *run = vcpu->run;
 	struct kvm_mmio_fragment *frag;
-	int r;
 
-	if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
-		return 1;
+	BUG_ON(!vcpu->mmio_needed);
 
-	if (vcpu->mmio_needed) {
-		/* Complete previous fragment */
-		frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
-		if (!vcpu->mmio_is_write)
-			memcpy(frag->data, run->mmio.data, frag->len);
-		if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
-			vcpu->mmio_needed = 0;
-			if (vcpu->mmio_is_write)
-				return 1;
-			vcpu->mmio_read_completed = 1;
-			goto done;
-		}
-		/* Initiate next fragment */
-		++frag;
-		run->exit_reason = KVM_EXIT_MMIO;
-		run->mmio.phys_addr = frag->gpa;
+	/* Complete previous fragment */
+	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+	if (!vcpu->mmio_is_write)
+		memcpy(frag->data, run->mmio.data, frag->len);
+	if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+		vcpu->mmio_needed = 0;
 		if (vcpu->mmio_is_write)
-			memcpy(run->mmio.data, frag->data, frag->len);
-		run->mmio.len = frag->len;
-		run->mmio.is_write = vcpu->mmio_is_write;
-		return 0;
-
-	}
-done:
-	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
-	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-	if (r != EMULATE_DONE)
-		return 0;
-	return 1;
+			return 1;
+		vcpu->mmio_read_completed = 1;
+		return complete_emulated_io(vcpu);
+	}
+	/* Initiate next fragment */
+	++frag;
+	run->exit_reason = KVM_EXIT_MMIO;
+	run->mmio.phys_addr = frag->gpa;
+	if (vcpu->mmio_is_write)
+		memcpy(run->mmio.data, frag->data, frag->len);
+	run->mmio.len = frag->len;
+	run->mmio.is_write = vcpu->mmio_is_write;
+	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+	return 0;
 }
 
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
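
The state machine sketched in the comment above splits one large MMIO access into bounded fragments and takes one userspace exit per fragment, copying the data for a read before moving to the next fragment. The stand-alone C sketch below models only that loop; struct frag, FRAG_MAX, and the printf standing in for the userspace exit are invented for illustration and are not KVM's internal types.

/* Toy model of the read-side fragment loop: one "exit" per fragment,
 * then copy the data userspace produced before advancing. */
#include <stdio.h>
#include <string.h>

#define FRAG_MAX 8

struct frag { unsigned long gpa; unsigned len; unsigned char data[FRAG_MAX]; };

int main(void)
{
	unsigned char backing[20];
	struct frag frags[4];
	unsigned nr = 0, cur = 0, len = sizeof(backing);
	unsigned long gpa = 0x1000;

	/* split a 20-byte access into 8 + 8 + 4 byte fragments */
	for (unsigned off = 0; off < len; off += FRAG_MAX, nr++) {
		frags[nr].gpa = gpa + off;
		frags[nr].len = len - off < FRAG_MAX ? len - off : FRAG_MAX;
	}

	/* "for each fragment: write gpa,len; exit; copy data" */
	while (cur < nr) {
		struct frag *f = &frags[cur];
		printf("exit to userspace: gpa=%#lx len=%u\n", f->gpa, f->len);
		memset(f->data, 0xab, f->len);           /* userspace fills it in */
		memcpy(backing + (f->gpa - gpa), f->data, f->len);
		cur++;                                   /* initiate next fragment */
	}
	return 0;
}
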
@@ -5582,9 +5590,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	r = complete_mmio(vcpu);
-	if (r <= 0)
-		goto out;
+	if (unlikely(vcpu->arch.complete_userspace_io)) {
+		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+		vcpu->arch.complete_userspace_io = NULL;
+		r = cui(vcpu);
+		if (r <= 0)
+			goto out;
+	} else
+		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
 	r = __vcpu_run(vcpu);
 
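
For context, the hunk above replaces the unconditional complete_mmio() call with a one-shot callback (vcpu->arch.complete_userspace_io) that is consumed and cleared on the next KVM_RUN. The userspace sketch below models just that consume-and-clear pattern; the toy_* names are invented for illustration and are not kernel code.

/* Minimal model of a one-shot completion callback: stashed before exiting
 * to userspace, consumed exactly once on the next re-entry. */
#include <stdio.h>
#include <stddef.h>

struct toy_vcpu {
	int pio_count;                               /* pending PIO state */
	int (*complete_userspace_io)(struct toy_vcpu *);
};

static int toy_complete_pio(struct toy_vcpu *vcpu)
{
	printf("completing PIO, count=%d\n", vcpu->pio_count);
	vcpu->pio_count = 0;
	return 1;                                    /* 1: keep running, <= 0: bail out */
}

/* Re-entry path: consume and clear, mirroring the hunk above. */
static int toy_run(struct toy_vcpu *vcpu)
{
	if (vcpu->complete_userspace_io) {
		int (*cui)(struct toy_vcpu *) = vcpu->complete_userspace_io;
		vcpu->complete_userspace_io = NULL;  /* one-shot */
		if (cui(vcpu) <= 0)
			return -1;
	}
	return 0;                                    /* would enter the guest here */
}

int main(void)
{
	struct toy_vcpu vcpu = { .pio_count = 1,
				 .complete_userspace_io = toy_complete_pio };
	return (toy_run(&vcpu) == 0 && toy_run(&vcpu) == 0) ? 0 : 1;
}
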
@@ -5602,12 +5615,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 		/*
 		 * We are here if userspace calls get_regs() in the middle of
 		 * instruction emulation. Registers state needs to be copied
-		 * back from emulation context to vcpu. Usrapace shouldn't do
+		 * back from emulation context to vcpu. Userspace shouldn't do
 		 * that usually, but some bad designed PV devices (vmware
 		 * backdoor interface) need this to work
 		 */
-		struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+		emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 	}
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5747,7 +5759,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 	if (ret)
 		return EMULATE_FAIL;
 
-	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 	kvm_rip_write(vcpu, ctxt->eip);
 	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5799,7 +5810,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+	max_bits = KVM_NR_INTERRUPTS;
 	pending_vec = find_first_bit(
 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
 	if (pending_vec < max_bits) {
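
The change above bounds the pending-vector scan by the number of interrupt vectors (KVM_NR_INTERRUPTS) rather than the raw bitmap size in bits. A plain-C model of that scan, using a hand-rolled bit test instead of the kernel's find_first_bit(), might look like this:

/* Illustrative stand-in, not the kernel bitops API. */
#include <stdio.h>
#include <limits.h>

#define NR_INTERRUPTS 256
#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static void set_pending(unsigned long *map, unsigned vec)
{
	map[vec / BITS_PER_LONG] |= 1UL << (vec % BITS_PER_LONG);
}

static unsigned first_pending(const unsigned long *map, unsigned max_bits)
{
	for (unsigned i = 0; i < max_bits; i++)
		if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
			return i;
	return max_bits;                             /* nothing pending */
}

int main(void)
{
	unsigned long interrupt_bitmap[NR_INTERRUPTS / BITS_PER_LONG + 1] = { 0 };

	set_pending(interrupt_bitmap, 33);           /* pretend vector 33 is queued */
	unsigned vec = first_pending(interrupt_bitmap, NR_INTERRUPTS);
	if (vec < NR_INTERRUPTS)
		printf("queue pending interrupt vector %u\n", vec);
	return 0;
}
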
@@ -5859,13 +5870,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
 		for (i = 0; i < KVM_NR_DB_REGS; ++i)
 			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
-		vcpu->arch.switch_db_regs =
-			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
 	} else {
 		for (i = 0; i < KVM_NR_DB_REGS; i++)
 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
 	}
+	kvm_update_dr7(vcpu);
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@ -5877,7 +5887,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	 */
 	kvm_set_rflags(vcpu, rflags);
 
-	kvm_x86_ops->set_guest_debug(vcpu, dbg);
+	kvm_x86_ops->update_db_bp_intercept(vcpu);
 
 	r = 0;
 
@@ -6023,7 +6033,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	int r;
 
 	vcpu->arch.mtrr_state.have_fixed = 1;
-	vcpu_load(vcpu);
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
 	r = kvm_arch_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
@@ -6034,9 +6046,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+	int r;
 	vcpu->arch.apf.msr_val = 0;
 
-	vcpu_load(vcpu);
+	r = vcpu_load(vcpu);
+	BUG_ON(r);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
@@ -6050,10 +6064,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;
 
-	vcpu->arch.switch_db_regs = 0;
 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = DR6_FIXED_1;
 	vcpu->arch.dr7 = DR7_FIXED_1;
+	kvm_update_dr7(vcpu);
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_val = 0;
@@ -6132,7 +6146,7 @@ int kvm_arch_hardware_enable(void *garbage)
 	 * as we reset last_host_tsc on all VCPUs to stop this from being
 	 * called multiple times (one for each physical CPU bringup).
 	 *
-	 * Platforms with unnreliable TSCs don't have to deal with this, they
+	 * Platforms with unreliable TSCs don't have to deal with this, they
 	 * will be compensated by the logic in vcpu_load, which sets the TSC to
 	 * catchup mode. This will catchup all VCPUs to real time, but cannot
 	 * guarantee that they stay in perfect synchronization.
@@ -6185,6 +6199,8 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
 }
 
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	struct page *page;
@@ -6217,7 +6233,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		r = kvm_create_lapic(vcpu);
 		if (r < 0)
 			goto fail_mmu_destroy;
-	}
+	} else
+		static_key_slow_inc(&kvm_no_apic_vcpu);
 
 	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
 				       GFP_KERNEL);
@@ -6257,6 +6274,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
+	if (!irqchip_in_kernel(vcpu->kvm))
+		static_key_slow_dec(&kvm_no_apic_vcpu);
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6269,15 +6288,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+		&kvm->arch.irq_sources_bitmap);
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+	mutex_init(&kvm->arch.apic_map_lock);
 
 	return 0;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
-	vcpu_load(vcpu);
+	int r;
+	r = vcpu_load(vcpu);
+	BUG_ON(r);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 }
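
The hunk above reserves bit 1 of irq_sources_bitmap for the new irqfd resampler, alongside the existing userspace source at bit 0. The toy allocator below illustrates why pre-setting those bits matters: dynamically requested source IDs are handed out from the first free bit, so they now start at 2. It is an illustrative stand-in under that assumption, not the kernel's kvm_request_irq_source_id().

#include <stdio.h>

#define USERSPACE_IRQ_SOURCE_ID       0
#define IRQFD_RESAMPLE_IRQ_SOURCE_ID  1
#define BITS_PER_WORD (8 * sizeof(unsigned long))

static int request_irq_source_id(unsigned long *bitmap)
{
	for (unsigned i = 0; i < BITS_PER_WORD; i++) {
		if (!(*bitmap & (1UL << i))) {
			*bitmap |= 1UL << i;         /* claim the first free bit */
			return (int)i;
		}
	}
	return -1;                                   /* bitmap exhausted */
}

int main(void)
{
	unsigned long irq_sources_bitmap = 0;

	/* mirrors the two set_bit() calls in the hunk above */
	irq_sources_bitmap |= 1UL << USERSPACE_IRQ_SOURCE_ID;
	irq_sources_bitmap |= 1UL << IRQFD_RESAMPLE_IRQ_SOURCE_ID;

	printf("next free source id: %d\n", request_irq_source_id(&irq_sources_bitmap));
	return 0;
}
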
@@ -6321,6 +6346,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		put_page(kvm->arch.apic_access_page);
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
+	kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 }
 
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
@@ -6328,10 +6354,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
 	int i;
 
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
-			kvm_kvfree(free->arch.lpage_info[i]);
-			free->arch.lpage_info[i] = NULL;
+	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+			kvm_kvfree(free->arch.rmap[i]);
+			free->arch.rmap[i] = NULL;
+		}
+		if (i == 0)
+			continue;
+
+		if (!dont || free->arch.lpage_info[i - 1] !=
+			     dont->arch.lpage_info[i - 1]) {
+			kvm_kvfree(free->arch.lpage_info[i - 1]);
+			free->arch.lpage_info[i - 1] = NULL;
 		}
 	}
 }
@@ -6340,23 +6374,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
 	int i;
 
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		unsigned long ugfn;
 		int lpages;
-		int level = i + 2;
+		int level = i + 1;
 
 		lpages = gfn_to_index(slot->base_gfn + npages - 1,
 				      slot->base_gfn, level) + 1;
 
-		slot->arch.lpage_info[i] =
-			kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
-		if (!slot->arch.lpage_info[i])
+		slot->arch.rmap[i] =
+			kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+		if (!slot->arch.rmap[i])
+			goto out_free;
+		if (i == 0)
+			continue;
+
+		slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+				      sizeof(*slot->arch.lpage_info[i - 1]));
+		if (!slot->arch.lpage_info[i - 1])
 			goto out_free;
 
 		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->arch.lpage_info[i][0].write_count = 1;
+			slot->arch.lpage_info[i - 1][0].write_count = 1;
 		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+			slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
 		ugfn = slot->userspace_addr >> PAGE_SHIFT;
 		/*
 		 * If the gfn and userspace address are not aligned wrt each
@@ -6368,16 +6409,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 			unsigned long j;
 
 			for (j = 0; j < lpages; ++j)
-				slot->arch.lpage_info[i][j].write_count = 1;
+				slot->arch.lpage_info[i - 1][j].write_count = 1;
 		}
 	}
 
 	return 0;
 
 out_free:
-	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-		kvm_kvfree(slot->arch.lpage_info[i]);
-		slot->arch.lpage_info[i] = NULL;
+	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+		kvm_kvfree(slot->arch.rmap[i]);
+		slot->arch.rmap[i] = NULL;
+		if (i == 0)
+			continue;
+
+		kvm_kvfree(slot->arch.lpage_info[i - 1]);
+		slot->arch.lpage_info[i - 1] = NULL;
 	}
 	return -ENOMEM;
 }
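
The memslot hunks above split the old lpage_info[] scheme into rmap[0..KVM_NR_PAGE_SIZES-1] for every page size plus lpage_info[level - 2] for the huge-page levels, each sized per the number of pages of that level spanned by the slot. The small program below reproduces only the gfn_to_index()-style arithmetic that sizes those arrays; the 9-bits-per-level shift and the names are simplifying assumptions for illustration, not the kernel definitions.

#include <stdio.h>

#define PAGE_SIZES 3                 /* 4K, 2M, 1G */

static unsigned long level_index(unsigned long gfn, unsigned long base, int level)
{
	int shift = (level - 1) * 9;     /* 512 entries per paging level */
	return (gfn >> shift) - (base >> shift);
}

int main(void)
{
	unsigned long base_gfn = 0x100, npages = 4096;   /* a 16M slot */

	for (int i = 0; i < PAGE_SIZES; i++) {
		int level = i + 1;
		unsigned long lpages =
			level_index(base_gfn + npages - 1, base_gfn, level) + 1;
		printf("level %d: %lu %s entries\n", level, lpages,
		       i == 0 ? "rmap" : "lpage_info");
	}
	return 0;
}
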
@@ -6396,10 +6442,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
-	 *x86 needs to hanlde !user_alloc case.
+	 *x86 needs to handle !user_alloc case.
 	 */
 	if (!user_alloc) {
-		if (npages && !old.rmap) {
+		if (npages && !old.npages) {
 			unsigned long userspace_addr;
 
 			userspace_addr = vm_mmap(NULL, 0,
@@ -6427,7 +6473,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
-	if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+	if (!user_alloc && !old.user_alloc && old.npages && !npages) {
 		int ret;
 
 		ret = vm_munmap(old.userspace_addr,
@@ -6446,14 +6492,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * If memory slot is created, or moved, we need to clear all
+	 * mmio sptes.
+	 */
+	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+		kvm_mmu_zap_all(kvm);
+		kvm_reload_remote_mmus(kvm);
+	}
 }
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
 	kvm_mmu_zap_all(kvm);
 	kvm_reload_remote_mmus(kvm);
 }
 
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	kvm_arch_flush_shadow_all(kvm);
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3d1134ddb885..2b5219c12ac8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -124,4 +124,5 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 
 extern u64 host_xcr0;
 
+extern struct static_key kvm_no_apic_vcpu;
 #endif