author    Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-02-24 16:07:18 -0500
commit    89f883372fa60f604d136924baf3e89ff1870e9e (patch)
tree      cb69b0a14957945ba00d3d392bf9ccbbef56f3b8 /arch/x86
parent    9e2d59ad580d590134285f361a0e80f0e98c0207 (diff)
parent    6b73a96065e89dc9fa75ba4f78b1aa3a3bbd0470 (diff)
Merge tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
 "KVM updates for the 3.9 merge window, including x86 real mode
  emulation fixes, stronger memory slot interface restrictions, mmu_lock
  spinlock hold time reduction, improved handling of large page faults
  on shadow, initial APICv HW acceleration support, s390 channel IO
  based virtio, amongst others"

* tag 'kvm-3.9-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  Revert "KVM: MMU: lazily drop large spte"
  x86: pvclock kvm: align allocation size to page size
  KVM: nVMX: Remove redundant get_vmcs12 from nested_vmx_exit_handled_msr
  x86 emulator: fix parity calculation for AAD instruction
  KVM: PPC: BookE: Handle alignment interrupts
  booke: Added DBCR4 SPR number
  KVM: PPC: booke: Allow multiple exception types
  KVM: PPC: booke: use vcpu reference from thread_struct
  KVM: Remove user_alloc from struct kvm_memory_slot
  KVM: VMX: disable apicv by default
  KVM: s390: Fix handling of iscs.
  KVM: MMU: cleanup __direct_map
  KVM: MMU: remove pt_access in mmu_set_spte
  KVM: MMU: cleanup mapping-level
  KVM: MMU: lazily drop large spte
  KVM: VMX: cleanup vmx_set_cr0().
  KVM: VMX: add missing exit names to VMX_EXIT_REASONS array
  KVM: VMX: disable SMEP feature when guest is in non-paging mode
  KVM: Remove duplicate text in api.txt
  Revert "KVM: MMU: split kvm_mmu_free_page"
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_host.h    26
-rw-r--r--  arch/x86/include/asm/kvm_para.h     2
-rw-r--r--  arch/x86/include/asm/vmx.h         18
-rw-r--r--  arch/x86/include/uapi/asm/vmx.h     9
-rw-r--r--  arch/x86/kernel/kvmclock.c         11
-rw-r--r--  arch/x86/kvm/emulate.c            673
-rw-r--r--  arch/x86/kvm/i8254.c                1
-rw-r--r--  arch/x86/kvm/i8259.c                2
-rw-r--r--  arch/x86/kvm/irq.c                 74
-rw-r--r--  arch/x86/kvm/lapic.c              140
-rw-r--r--  arch/x86/kvm/lapic.h               34
-rw-r--r--  arch/x86/kvm/mmu.c                168
-rw-r--r--  arch/x86/kvm/mmutrace.h             6
-rw-r--r--  arch/x86/kvm/paging_tmpl.h        106
-rw-r--r--  arch/x86/kvm/svm.c                 24
-rw-r--r--  arch/x86/kvm/vmx.c                714
-rw-r--r--  arch/x86/kvm/x86.c                168
17 files changed, 1411 insertions, 765 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dc87b65e9c3a..635a74d22409 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,10 +33,10 @@
33 33
34#define KVM_MAX_VCPUS 254 34#define KVM_MAX_VCPUS 254
35#define KVM_SOFT_MAX_VCPUS 160 35#define KVM_SOFT_MAX_VCPUS 160
36#define KVM_MEMORY_SLOTS 32 36#define KVM_USER_MEM_SLOTS 125
37/* memory slots that does not exposed to userspace */ 37/* memory slots that are not exposed to userspace */
38#define KVM_PRIVATE_MEM_SLOTS 4 38#define KVM_PRIVATE_MEM_SLOTS 3
39#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 39#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
40 40
41#define KVM_MMIO_SIZE 16 41#define KVM_MMIO_SIZE 16
42 42
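
The hunk above swaps the old KVM_MEMORY_SLOTS (32 user slots) for KVM_USER_MEM_SLOTS (125) and shrinks the private pool from 4 to 3, keeping the combined KVM_MEM_SLOTS_NUM at 128. A standalone sketch of that arithmetic; the constants are copied from the hunk, everything else is illustrative:

#include <stdio.h>

/* Values taken from the kvm_host.h hunk above. */
#define KVM_USER_MEM_SLOTS    125  /* slots userspace may create    */
#define KVM_PRIVATE_MEM_SLOTS   3  /* slots reserved for KVM itself */
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)

int main(void)
{
	/* Private slots sit above the user range; the vmx.h hunk below
	 * re-bases the TSS/APIC/identity-map slots on KVM_USER_MEM_SLOTS. */
	printf("user slots      : 0 .. %d\n", KVM_USER_MEM_SLOTS - 1);
	printf("private slots   : %d .. %d\n", KVM_USER_MEM_SLOTS,
	       KVM_MEM_SLOTS_NUM - 1);
	printf("total slot count: %d\n", KVM_MEM_SLOTS_NUM);  /* 128 */
	return 0;
}
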
@@ -219,11 +219,6 @@ struct kvm_mmu_page {
219 u64 *spt; 219 u64 *spt;
220 /* hold the gfn of each spte inside spt */ 220 /* hold the gfn of each spte inside spt */
221 gfn_t *gfns; 221 gfn_t *gfns;
222 /*
223 * One bit set per slot which has memory
224 * in this shadow page.
225 */
226 DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
227 bool unsync; 222 bool unsync;
228 int root_count; /* Currently serving as active root */ 223 int root_count; /* Currently serving as active root */
229 unsigned int unsync_children; 224 unsigned int unsync_children;
@@ -502,6 +497,13 @@ struct kvm_vcpu_arch {
502 u64 msr_val; 497 u64 msr_val;
503 struct gfn_to_hva_cache data; 498 struct gfn_to_hva_cache data;
504 } pv_eoi; 499 } pv_eoi;
500
501 /*
502 * Indicate whether the access faults on its page table in guest
503 * which is set when fix page fault and used to detect unhandeable
504 * instruction.
505 */
506 bool write_fault_to_shadow_pgtable;
505}; 507};
506 508
507struct kvm_lpage_info { 509struct kvm_lpage_info {
@@ -697,6 +699,11 @@ struct kvm_x86_ops {
697 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 699 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
698 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 700 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
699 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 701 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
702 int (*vm_has_apicv)(struct kvm *kvm);
703 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
704 void (*hwapic_isr_update)(struct kvm *kvm, int isr);
705 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
706 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
700 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 707 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
701 int (*get_tdp_level)(void); 708 int (*get_tdp_level)(void);
702 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 709 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
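
The new kvm_x86_ops hooks above let the vendor module report and drive hardware APIC virtualization (the initial APICv support named in the merge message). A hedged sketch of how a caller might gate the accelerated path on vm_has_apicv(); the struct and helper names below are illustrative stand-ins, not the kernel's:

#include <stdio.h>
#include <stdint.h>

struct vm;                                  /* opaque, illustrative */

/* Shape mirrors the hooks added above; implementations are stubs. */
struct x86_virt_ops {
	int  (*vm_has_apicv)(struct vm *vm);
	void (*hwapic_irr_update)(struct vm *vm, int max_irr);
	void (*load_eoi_exitmap)(struct vm *vm, uint64_t *eoi_exit_bitmap);
};

static int  stub_has_apicv(struct vm *vm)        { (void)vm; return 0; }
static void stub_irr(struct vm *vm, int irr)     { (void)vm; printf("hw IRR <= %d\n", irr); }
static void stub_eoi(struct vm *vm, uint64_t *b) { (void)vm; (void)b; }

static const struct x86_virt_ops ops = {
	.vm_has_apicv      = stub_has_apicv,
	.hwapic_irr_update = stub_irr,
	.load_eoi_exitmap  = stub_eoi,
};

/* Use the hardware path only when the vendor code advertises APICv;
 * otherwise fall back to software LAPIC emulation. */
static void update_irr(struct vm *vm, int max_irr)
{
	if (ops.vm_has_apicv(vm))
		ops.hwapic_irr_update(vm, max_irr);
	else
		printf("software LAPIC: record IRR %d\n", max_irr);
}

int main(void)
{
	update_irr(NULL, 0x31);
	return 0;
}
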
@@ -991,6 +998,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva);
991int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); 998int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
992void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 999void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
993int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 1000int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
1001int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
994int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 1002int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
995int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 1003int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
996int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 1004int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65231e173baf..695399f2d5eb 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -27,7 +27,7 @@ static inline bool kvm_check_and_clear_guest_paused(void)
27 * 27 *
28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. 28 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
29 * The hypercall number should be placed in rax and the return value will be 29 * The hypercall number should be placed in rax and the return value will be
30 * placed in rax. No other registers will be clobbered unless explicited 30 * placed in rax. No other registers will be clobbered unless explicitly
31 * noted by the particular hypercall. 31 * noted by the particular hypercall.
32 */ 32 */
33 33
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 235b49fa554b..b6fbf860e398 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -57,9 +57,12 @@
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 58#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
59#define SECONDARY_EXEC_RDTSCP 0x00000008 59#define SECONDARY_EXEC_RDTSCP 0x00000008
60#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
60#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 61#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
61#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 62#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
62#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 63#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
64#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
65#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
63#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 66#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
64#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 67#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
65 68
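
The new x2APIC-mode, APIC-register-virtualization, and virtual-interrupt-delivery bits are enabled through the secondary processor-based controls like the other SECONDARY_EXEC_* flags, and this series gates its APICv support on the latter two being available. A minimal sketch of testing those bits in a 32-bit controls word; the bit values are the ones defined above, the sample value is made up:

#include <stdio.h>
#include <stdint.h>

/* Bit values copied from the vmx.h hunk above. */
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_APIC_REGISTER_VIRT     0x00000100
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY  0x00000200

int main(void)
{
	/* Hypothetical value a hypervisor might have programmed. */
	uint32_t secondary_ctls = SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;

	int apicv = (secondary_ctls & SECONDARY_EXEC_APIC_REGISTER_VIRT) &&
		    (secondary_ctls & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

	printf("APIC register virt + virtual intr delivery: %s\n",
	       apicv ? "yes" : "no");
	printf("x2APIC mode virtualized: %s\n",
	       (secondary_ctls & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE) ? "yes" : "no");
	return 0;
}
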
@@ -97,6 +100,7 @@ enum vmcs_field {
97 GUEST_GS_SELECTOR = 0x0000080a, 100 GUEST_GS_SELECTOR = 0x0000080a,
98 GUEST_LDTR_SELECTOR = 0x0000080c, 101 GUEST_LDTR_SELECTOR = 0x0000080c,
99 GUEST_TR_SELECTOR = 0x0000080e, 102 GUEST_TR_SELECTOR = 0x0000080e,
103 GUEST_INTR_STATUS = 0x00000810,
100 HOST_ES_SELECTOR = 0x00000c00, 104 HOST_ES_SELECTOR = 0x00000c00,
101 HOST_CS_SELECTOR = 0x00000c02, 105 HOST_CS_SELECTOR = 0x00000c02,
102 HOST_SS_SELECTOR = 0x00000c04, 106 HOST_SS_SELECTOR = 0x00000c04,
@@ -124,6 +128,14 @@ enum vmcs_field {
124 APIC_ACCESS_ADDR_HIGH = 0x00002015, 128 APIC_ACCESS_ADDR_HIGH = 0x00002015,
125 EPT_POINTER = 0x0000201a, 129 EPT_POINTER = 0x0000201a,
126 EPT_POINTER_HIGH = 0x0000201b, 130 EPT_POINTER_HIGH = 0x0000201b,
131 EOI_EXIT_BITMAP0 = 0x0000201c,
132 EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
133 EOI_EXIT_BITMAP1 = 0x0000201e,
134 EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
135 EOI_EXIT_BITMAP2 = 0x00002020,
136 EOI_EXIT_BITMAP2_HIGH = 0x00002021,
137 EOI_EXIT_BITMAP3 = 0x00002022,
138 EOI_EXIT_BITMAP3_HIGH = 0x00002023,
127 GUEST_PHYSICAL_ADDRESS = 0x00002400, 139 GUEST_PHYSICAL_ADDRESS = 0x00002400,
128 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, 140 GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
129 VMCS_LINK_POINTER = 0x00002800, 141 VMCS_LINK_POINTER = 0x00002800,
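
The four EOI-exit bitmaps above form one 256-bit map, one bit per interrupt vector, stored as four 64-bit VMCS fields (each with a _HIGH half for 32-bit hosts). A small sketch of the vector-to-field/bit mapping; the helper below is illustrative, not a kernel function:

#include <stdio.h>
#include <stdint.h>

/* Field encodings copied from the vmcs_field hunk above. */
static const uint32_t eoi_exit_bitmap_field[4] = {
	0x0000201c,	/* EOI_EXIT_BITMAP0 */
	0x0000201e,	/* EOI_EXIT_BITMAP1 */
	0x00002020,	/* EOI_EXIT_BITMAP2 */
	0x00002022,	/* EOI_EXIT_BITMAP3 */
};

/* Which VMCS field and which bit hold the EOI-exit flag for a vector. */
static void locate(uint8_t vector)
{
	unsigned int field_idx = vector / 64;
	unsigned int bit       = vector % 64;

	printf("vector 0x%02x -> field 0x%08x, bit %u\n",
	       vector, (unsigned)eoi_exit_bitmap_field[field_idx], bit);
}

int main(void)
{
	locate(0x20);	/* low vector:  bitmap 0 */
	locate(0xec);	/* high vector: bitmap 3 */
	return 0;
}
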
@@ -346,9 +358,9 @@ enum vmcs_field {
346 358
347#define AR_RESERVD_MASK 0xfffe0f00 359#define AR_RESERVD_MASK 0xfffe0f00
348 360
349#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) 361#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
350#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) 362#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
351#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) 363#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
352 364
353#define VMX_NR_VPIDS (1 << 16) 365#define VMX_NR_VPIDS (1 << 16)
354#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 366#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 979d03bce135..2871fccfee68 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -62,10 +62,12 @@
62#define EXIT_REASON_MCE_DURING_VMENTRY 41 62#define EXIT_REASON_MCE_DURING_VMENTRY 41
63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 63#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
64#define EXIT_REASON_APIC_ACCESS 44 64#define EXIT_REASON_APIC_ACCESS 44
65#define EXIT_REASON_EOI_INDUCED 45
65#define EXIT_REASON_EPT_VIOLATION 48 66#define EXIT_REASON_EPT_VIOLATION 48
66#define EXIT_REASON_EPT_MISCONFIG 49 67#define EXIT_REASON_EPT_MISCONFIG 49
67#define EXIT_REASON_WBINVD 54 68#define EXIT_REASON_WBINVD 54
68#define EXIT_REASON_XSETBV 55 69#define EXIT_REASON_XSETBV 55
70#define EXIT_REASON_APIC_WRITE 56
69#define EXIT_REASON_INVPCID 58 71#define EXIT_REASON_INVPCID 58
70 72
71#define VMX_EXIT_REASONS \ 73#define VMX_EXIT_REASONS \
@@ -103,7 +105,12 @@
103 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ 105 { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
104 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ 106 { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
105 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ 107 { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
106 { EXIT_REASON_WBINVD, "WBINVD" } 108 { EXIT_REASON_WBINVD, "WBINVD" }, \
109 { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
110 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
111 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
112 { EXIT_REASON_INVD, "INVD" }, \
113 { EXIT_REASON_INVPCID, "INVPCID" }
107 114
108 115
109#endif /* _UAPIVMX_H */ 116#endif /* _UAPIVMX_H */
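
VMX_EXIT_REASONS is a reason-to-name table used when printing trace events, and the hunk fills in names that were missing (per "KVM: VMX: add missing exit names to VMX_EXIT_REASONS array" in the shortlog). A toy lookup over the same kind of table; only a few entries are listed here for brevity:

#include <stdio.h>
#include <stddef.h>

struct exit_reason_name { unsigned int reason; const char *name; };

/* Entries in the style of VMX_EXIT_REASONS; values match the #defines above. */
static const struct exit_reason_name names[] = {
	{ 44, "APIC_ACCESS" },
	{ 45, "EOI_INDUCED" },
	{ 56, "APIC_WRITE"  },
	{ 58, "INVPCID"     },
};

static const char *exit_name(unsigned int reason)
{
	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		if (names[i].reason == reason)
			return names[i].name;
	return "UNKNOWN";
}

int main(void)
{
	printf("exit 45 = %s\n", exit_name(45));	/* EOI_INDUCED */
	printf("exit 56 = %s\n", exit_name(56));	/* APIC_WRITE  */
	return 0;
}
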
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 9f966dc0b9e4..0732f0089a3d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -218,6 +218,9 @@ static void kvm_shutdown(void)
218void __init kvmclock_init(void) 218void __init kvmclock_init(void)
219{ 219{
220 unsigned long mem; 220 unsigned long mem;
221 int size;
222
223 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
221 224
222 if (!kvm_para_available()) 225 if (!kvm_para_available())
223 return; 226 return;
@@ -231,16 +234,14 @@ void __init kvmclock_init(void)
231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 234 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
232 msr_kvm_system_time, msr_kvm_wall_clock); 235 msr_kvm_system_time, msr_kvm_wall_clock);
233 236
234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, 237 mem = memblock_alloc(size, PAGE_SIZE);
235 PAGE_SIZE);
236 if (!mem) 238 if (!mem)
237 return; 239 return;
238 hv_clock = __va(mem); 240 hv_clock = __va(mem);
239 241
240 if (kvm_register_clock("boot clock")) { 242 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL; 243 hv_clock = NULL;
242 memblock_free(mem, 244 memblock_free(mem, size);
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
244 return; 245 return;
245 } 246 }
246 pv_time_ops.sched_clock = kvm_clock_read; 247 pv_time_ops.sched_clock = kvm_clock_read;
@@ -275,7 +276,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
275 struct pvclock_vcpu_time_info *vcpu_time; 276 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size; 277 unsigned int size;
277 278
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; 279 size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
279 280
280 preempt_disable(); 281 preempt_disable();
281 cpu = smp_processor_id(); 282 cpu = smp_processor_id();
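
The kvmclock change ("x86: pvclock kvm: align allocation size to page size") makes the memblock allocation and the later vsyscall mapping agree on one page-aligned size. A quick illustration of the PAGE_ALIGN rounding involved; PAGE_SIZE and the sizes below are stand-in values, not taken from a real configuration:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
/* Round up to the next page boundary, like the kernel's PAGE_ALIGN(). */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* Stand-ins for sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS. */
	size_t already_aligned = 4 * PAGE_SIZE;		/* 16384 */
	size_t needs_rounding  = 4 * PAGE_SIZE + 100;	/* 16484 */

	printf("PAGE_ALIGN(%zu) = %lu\n", already_aligned,
	       (unsigned long)PAGE_ALIGN(already_aligned));	/* 16384 */
	printf("PAGE_ALIGN(%zu) = %lu\n", needs_rounding,
	       (unsigned long)PAGE_ALIGN(needs_rounding));	/* 20480 */
	return 0;
}
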
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a27e76371108..a335cc6cde72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -24,6 +24,7 @@
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include <linux/module.h> 25#include <linux/module.h>
26#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
27#include <linux/stringify.h>
27 28
28#include "x86.h" 29#include "x86.h"
29#include "tss.h" 30#include "tss.h"
@@ -43,7 +44,7 @@
43#define OpCL 9ull /* CL register (for shifts) */ 44#define OpCL 9ull /* CL register (for shifts) */
44#define OpImmByte 10ull /* 8-bit sign extended immediate */ 45#define OpImmByte 10ull /* 8-bit sign extended immediate */
45#define OpOne 11ull /* Implied 1 */ 46#define OpOne 11ull /* Implied 1 */
46#define OpImm 12ull /* Sign extended immediate */ 47#define OpImm 12ull /* Sign extended up to 32-bit immediate */
47#define OpMem16 13ull /* Memory operand (16-bit). */ 48#define OpMem16 13ull /* Memory operand (16-bit). */
48#define OpMem32 14ull /* Memory operand (32-bit). */ 49#define OpMem32 14ull /* Memory operand (32-bit). */
49#define OpImmU 15ull /* Immediate operand, zero extended */ 50#define OpImmU 15ull /* Immediate operand, zero extended */
@@ -58,6 +59,7 @@
58#define OpFS 24ull /* FS */ 59#define OpFS 24ull /* FS */
59#define OpGS 25ull /* GS */ 60#define OpGS 25ull /* GS */
60#define OpMem8 26ull /* 8-bit zero extended memory operand */ 61#define OpMem8 26ull /* 8-bit zero extended memory operand */
62#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
61 63
62#define OpBits 5 /* Width of operand field */ 64#define OpBits 5 /* Width of operand field */
63#define OpMask ((1ull << OpBits) - 1) 65#define OpMask ((1ull << OpBits) - 1)
@@ -101,6 +103,7 @@
101#define SrcMemFAddr (OpMemFAddr << SrcShift) 103#define SrcMemFAddr (OpMemFAddr << SrcShift)
102#define SrcAcc (OpAcc << SrcShift) 104#define SrcAcc (OpAcc << SrcShift)
103#define SrcImmU16 (OpImmU16 << SrcShift) 105#define SrcImmU16 (OpImmU16 << SrcShift)
106#define SrcImm64 (OpImm64 << SrcShift)
104#define SrcDX (OpDX << SrcShift) 107#define SrcDX (OpDX << SrcShift)
105#define SrcMem8 (OpMem8 << SrcShift) 108#define SrcMem8 (OpMem8 << SrcShift)
106#define SrcMask (OpMask << SrcShift) 109#define SrcMask (OpMask << SrcShift)
@@ -113,6 +116,7 @@
113#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ 116#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
114#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ 117#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
115#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ 118#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
119#define Escape (5<<15) /* Escape to coprocessor instruction */
116#define Sse (1<<18) /* SSE Vector instruction */ 120#define Sse (1<<18) /* SSE Vector instruction */
117/* Generic ModRM decode. */ 121/* Generic ModRM decode. */
118#define ModRM (1<<19) 122#define ModRM (1<<19)
@@ -146,6 +150,8 @@
146#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ 150#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
147#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ 151#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
148#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ 152#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
153#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
154#define NoWrite ((u64)1 << 45) /* No writeback */
149 155
150#define X2(x...) x, x 156#define X2(x...) x, x
151#define X3(x...) X2(x), x 157#define X3(x...) X2(x), x
@@ -156,6 +162,27 @@
156#define X8(x...) X4(x), X4(x) 162#define X8(x...) X4(x), X4(x)
157#define X16(x...) X8(x), X8(x) 163#define X16(x...) X8(x), X8(x)
158 164
165#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
166#define FASTOP_SIZE 8
167
168/*
169 * fastop functions have a special calling convention:
170 *
171 * dst: [rdx]:rax (in/out)
172 * src: rbx (in/out)
173 * src2: rcx (in)
174 * flags: rflags (in/out)
175 *
176 * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
177 * different operand sizes can be reached by calculation, rather than a jump
178 * table (which would be bigger than the code).
179 *
180 * fastop functions are declared as taking a never-defined fastop parameter,
181 * so they can't be called from C directly.
182 */
183
184struct fastop;
185
159struct opcode { 186struct opcode {
160 u64 flags : 56; 187 u64 flags : 56;
161 u64 intercept : 8; 188 u64 intercept : 8;
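
The comment above is the key to the fastop dispatch added later in this patch: every per-size stub is padded to FASTOP_SIZE (8) bytes, so the emulator reaches the right one by adding log2(operand bytes) * FASTOP_SIZE to the group's entry point instead of branching. The actual pointer math lives in fastop() further down; the sketch below shows only the offset calculation and is not kernel code:

#include <stdio.h>

#define FASTOP_SIZE 8	/* each stub is padded to this many bytes */

/* Index of the lowest set bit; operand sizes are powers of two (1/2/4/8),
 * so this is log2(bytes), mirroring __ffs() in the kernel code. */
static unsigned int ffs_log2(unsigned int bytes)
{
	unsigned int i = 0;
	while (!(bytes & 1)) {
		bytes >>= 1;
		i++;
	}
	return i;
}

int main(void)
{
	static const unsigned int sizes[] = { 1, 2, 4, 8 };
	unsigned int i;

	/* Byte op lands on the first stub, word on the second, and so on. */
	for (i = 0; i < 4; i++)
		printf("%u-byte operand -> entry offset %u\n",
		       sizes[i], ffs_log2(sizes[i]) * FASTOP_SIZE);
	return 0;
}
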
@@ -164,6 +191,8 @@ struct opcode {
164 const struct opcode *group; 191 const struct opcode *group;
165 const struct group_dual *gdual; 192 const struct group_dual *gdual;
166 const struct gprefix *gprefix; 193 const struct gprefix *gprefix;
194 const struct escape *esc;
195 void (*fastop)(struct fastop *fake);
167 } u; 196 } u;
168 int (*check_perm)(struct x86_emulate_ctxt *ctxt); 197 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
169}; 198};
@@ -180,6 +209,11 @@ struct gprefix {
180 struct opcode pfx_f3; 209 struct opcode pfx_f3;
181}; 210};
182 211
212struct escape {
213 struct opcode op[8];
214 struct opcode high[64];
215};
216
183/* EFLAGS bit definitions. */ 217/* EFLAGS bit definitions. */
184#define EFLG_ID (1<<21) 218#define EFLG_ID (1<<21)
185#define EFLG_VIP (1<<20) 219#define EFLG_VIP (1<<20)
@@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
407 } \ 441 } \
408 } while (0) 442 } while (0)
409 443
444static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
445
446#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
447#define FOP_RET "ret \n\t"
448
449#define FOP_START(op) \
450 extern void em_##op(struct fastop *fake); \
451 asm(".pushsection .text, \"ax\" \n\t" \
452 ".global em_" #op " \n\t" \
453 FOP_ALIGN \
454 "em_" #op ": \n\t"
455
456#define FOP_END \
457 ".popsection")
458
459#define FOPNOP() FOP_ALIGN FOP_RET
460
461#define FOP1E(op, dst) \
462 FOP_ALIGN #op " %" #dst " \n\t" FOP_RET
463
464#define FASTOP1(op) \
465 FOP_START(op) \
466 FOP1E(op##b, al) \
467 FOP1E(op##w, ax) \
468 FOP1E(op##l, eax) \
469 ON64(FOP1E(op##q, rax)) \
470 FOP_END
471
472#define FOP2E(op, dst, src) \
473 FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
474
475#define FASTOP2(op) \
476 FOP_START(op) \
477 FOP2E(op##b, al, bl) \
478 FOP2E(op##w, ax, bx) \
479 FOP2E(op##l, eax, ebx) \
480 ON64(FOP2E(op##q, rax, rbx)) \
481 FOP_END
482
483/* 2 operand, word only */
484#define FASTOP2W(op) \
485 FOP_START(op) \
486 FOPNOP() \
487 FOP2E(op##w, ax, bx) \
488 FOP2E(op##l, eax, ebx) \
489 ON64(FOP2E(op##q, rax, rbx)) \
490 FOP_END
491
492/* 2 operand, src is CL */
493#define FASTOP2CL(op) \
494 FOP_START(op) \
495 FOP2E(op##b, al, cl) \
496 FOP2E(op##w, ax, cl) \
497 FOP2E(op##l, eax, cl) \
498 ON64(FOP2E(op##q, rax, cl)) \
499 FOP_END
500
501#define FOP3E(op, dst, src, src2) \
502 FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
503
504/* 3-operand, word-only, src2=cl */
505#define FASTOP3WCL(op) \
506 FOP_START(op) \
507 FOPNOP() \
508 FOP3E(op##w, ax, bx, cl) \
509 FOP3E(op##l, eax, ebx, cl) \
510 ON64(FOP3E(op##q, rax, rbx, cl)) \
511 FOP_END
512
513/* Special case for SETcc - 1 instruction per cc */
514#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
515
516FOP_START(setcc)
517FOP_SETCC(seto)
518FOP_SETCC(setno)
519FOP_SETCC(setc)
520FOP_SETCC(setnc)
521FOP_SETCC(setz)
522FOP_SETCC(setnz)
523FOP_SETCC(setbe)
524FOP_SETCC(setnbe)
525FOP_SETCC(sets)
526FOP_SETCC(setns)
527FOP_SETCC(setp)
528FOP_SETCC(setnp)
529FOP_SETCC(setl)
530FOP_SETCC(setnl)
531FOP_SETCC(setle)
532FOP_SETCC(setnle)
533FOP_END;
534
410#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ 535#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
411 do { \ 536 do { \
412 unsigned long _tmp; \ 537 unsigned long _tmp; \
@@ -663,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
663 ulong la; 788 ulong la;
664 u32 lim; 789 u32 lim;
665 u16 sel; 790 u16 sel;
666 unsigned cpl, rpl; 791 unsigned cpl;
667 792
668 la = seg_base(ctxt, addr.seg) + addr.ea; 793 la = seg_base(ctxt, addr.seg) + addr.ea;
669 switch (ctxt->mode) { 794 switch (ctxt->mode) {
@@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
697 goto bad; 822 goto bad;
698 } 823 }
699 cpl = ctxt->ops->cpl(ctxt); 824 cpl = ctxt->ops->cpl(ctxt);
700 if (ctxt->mode == X86EMUL_MODE_REAL)
701 rpl = 0;
702 else
703 rpl = sel & 3;
704 cpl = max(cpl, rpl);
705 if (!(desc.type & 8)) { 825 if (!(desc.type & 8)) {
706 /* data segment */ 826 /* data segment */
707 if (cpl > desc.dpl) 827 if (cpl > desc.dpl)
@@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
852 return rc; 972 return rc;
853} 973}
854 974
855static int test_cc(unsigned int condition, unsigned int flags) 975FASTOP2(add);
856{ 976FASTOP2(or);
857 int rc = 0; 977FASTOP2(adc);
858 978FASTOP2(sbb);
859 switch ((condition & 15) >> 1) { 979FASTOP2(and);
860 case 0: /* o */ 980FASTOP2(sub);
861 rc |= (flags & EFLG_OF); 981FASTOP2(xor);
862 break; 982FASTOP2(cmp);
863 case 1: /* b/c/nae */ 983FASTOP2(test);
864 rc |= (flags & EFLG_CF); 984
865 break; 985FASTOP3WCL(shld);
866 case 2: /* z/e */ 986FASTOP3WCL(shrd);
867 rc |= (flags & EFLG_ZF); 987
868 break; 988FASTOP2W(imul);
869 case 3: /* be/na */ 989
870 rc |= (flags & (EFLG_CF|EFLG_ZF)); 990FASTOP1(not);
871 break; 991FASTOP1(neg);
872 case 4: /* s */ 992FASTOP1(inc);
873 rc |= (flags & EFLG_SF); 993FASTOP1(dec);
874 break; 994
875 case 5: /* p/pe */ 995FASTOP2CL(rol);
876 rc |= (flags & EFLG_PF); 996FASTOP2CL(ror);
877 break; 997FASTOP2CL(rcl);
878 case 7: /* le/ng */ 998FASTOP2CL(rcr);
879 rc |= (flags & EFLG_ZF); 999FASTOP2CL(shl);
880 /* fall through */ 1000FASTOP2CL(shr);
881 case 6: /* l/nge */ 1001FASTOP2CL(sar);
882 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); 1002
883 break; 1003FASTOP2W(bsf);
884 } 1004FASTOP2W(bsr);
885 1005FASTOP2W(bt);
886 /* Odd condition identifiers (lsb == 1) have inverted sense. */ 1006FASTOP2W(bts);
887 return (!!rc ^ (condition & 1)); 1007FASTOP2W(btr);
1008FASTOP2W(btc);
1009
1010static u8 test_cc(unsigned int condition, unsigned long flags)
1011{
1012 u8 rc;
1013 void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1014
1015 flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1016 asm("push %[flags]; popf; call *%[fastop]"
1017 : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1018 return rc;
888} 1019}
889 1020
890static void fetch_register_operand(struct operand *op) 1021static void fetch_register_operand(struct operand *op)
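
test_cc() above now jumps into a table of SETcc stubs (each aligned to 4 bytes, indexed by the low nibble of the opcode) and lets the CPU evaluate the condition. The deleted C version spelled out the same x86 condition-code semantics; here is a compact standalone rendering of that logic, kept only as a reference for what the stub table computes:

#include <stdio.h>
#include <stdbool.h>

/* EFLAGS bits, matching the EFLG_* definitions in emulate.c. */
#define EFLG_CF (1u << 0)
#define EFLG_PF (1u << 2)
#define EFLG_ZF (1u << 6)
#define EFLG_SF (1u << 7)
#define EFLG_OF (1u << 11)

/* x86 condition codes: the top three bits of the nibble pick a predicate,
 * the low bit inverts it -- the same structure as the removed switch in
 * test_cc() and as the new SETcc stub table. */
static bool test_cc(unsigned int cc, unsigned long flags)
{
	bool r;

	switch ((cc & 15) >> 1) {
	case 0: r = flags & EFLG_OF; break;			/* o  */
	case 1: r = flags & EFLG_CF; break;			/* b  */
	case 2: r = flags & EFLG_ZF; break;			/* z  */
	case 3: r = flags & (EFLG_CF | EFLG_ZF); break;		/* be */
	case 4: r = flags & EFLG_SF; break;			/* s  */
	case 5: r = flags & EFLG_PF; break;			/* p  */
	case 6: r = !(flags & EFLG_SF) != !(flags & EFLG_OF); break;	/* l  */
	default: r = (flags & EFLG_ZF) ||
		     (!(flags & EFLG_SF) != !(flags & EFLG_OF)); break;	/* le */
	}
	return r ^ (cc & 1);	/* odd condition codes are the negated forms */
}

int main(void)
{
	printf("jz  with ZF=1: %d\n", test_cc(0x4, EFLG_ZF));	/* 1 */
	printf("jnz with ZF=1: %d\n", test_cc(0x5, EFLG_ZF));	/* 0 */
	return 0;
}
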
@@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
994 ctxt->ops->put_fpu(ctxt); 1125 ctxt->ops->put_fpu(ctxt);
995} 1126}
996 1127
1128static int em_fninit(struct x86_emulate_ctxt *ctxt)
1129{
1130 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1131 return emulate_nm(ctxt);
1132
1133 ctxt->ops->get_fpu(ctxt);
1134 asm volatile("fninit");
1135 ctxt->ops->put_fpu(ctxt);
1136 return X86EMUL_CONTINUE;
1137}
1138
1139static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
1140{
1141 u16 fcw;
1142
1143 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1144 return emulate_nm(ctxt);
1145
1146 ctxt->ops->get_fpu(ctxt);
1147 asm volatile("fnstcw %0": "+m"(fcw));
1148 ctxt->ops->put_fpu(ctxt);
1149
1150 /* force 2 byte destination */
1151 ctxt->dst.bytes = 2;
1152 ctxt->dst.val = fcw;
1153
1154 return X86EMUL_CONTINUE;
1155}
1156
1157static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
1158{
1159 u16 fsw;
1160
1161 if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
1162 return emulate_nm(ctxt);
1163
1164 ctxt->ops->get_fpu(ctxt);
1165 asm volatile("fnstsw %0": "+m"(fsw));
1166 ctxt->ops->put_fpu(ctxt);
1167
1168 /* force 2 byte destination */
1169 ctxt->dst.bytes = 2;
1170 ctxt->dst.val = fsw;
1171
1172 return X86EMUL_CONTINUE;
1173}
1174
997static void decode_register_operand(struct x86_emulate_ctxt *ctxt, 1175static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
998 struct operand *op) 1176 struct operand *op)
999{ 1177{
@@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
1534{ 1712{
1535 int rc; 1713 int rc;
1536 1714
1715 if (ctxt->d & NoWrite)
1716 return X86EMUL_CONTINUE;
1717
1537 switch (ctxt->dst.type) { 1718 switch (ctxt->dst.type) {
1538 case OP_REG: 1719 case OP_REG:
1539 write_register_operand(&ctxt->dst); 1720 write_register_operand(&ctxt->dst);
@@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1918 return X86EMUL_CONTINUE; 2099 return X86EMUL_CONTINUE;
1919} 2100}
1920 2101
1921static int em_grp2(struct x86_emulate_ctxt *ctxt)
1922{
1923 switch (ctxt->modrm_reg) {
1924 case 0: /* rol */
1925 emulate_2op_SrcB(ctxt, "rol");
1926 break;
1927 case 1: /* ror */
1928 emulate_2op_SrcB(ctxt, "ror");
1929 break;
1930 case 2: /* rcl */
1931 emulate_2op_SrcB(ctxt, "rcl");
1932 break;
1933 case 3: /* rcr */
1934 emulate_2op_SrcB(ctxt, "rcr");
1935 break;
1936 case 4: /* sal/shl */
1937 case 6: /* sal/shl */
1938 emulate_2op_SrcB(ctxt, "sal");
1939 break;
1940 case 5: /* shr */
1941 emulate_2op_SrcB(ctxt, "shr");
1942 break;
1943 case 7: /* sar */
1944 emulate_2op_SrcB(ctxt, "sar");
1945 break;
1946 }
1947 return X86EMUL_CONTINUE;
1948}
1949
1950static int em_not(struct x86_emulate_ctxt *ctxt)
1951{
1952 ctxt->dst.val = ~ctxt->dst.val;
1953 return X86EMUL_CONTINUE;
1954}
1955
1956static int em_neg(struct x86_emulate_ctxt *ctxt)
1957{
1958 emulate_1op(ctxt, "neg");
1959 return X86EMUL_CONTINUE;
1960}
1961
1962static int em_mul_ex(struct x86_emulate_ctxt *ctxt) 2102static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
1963{ 2103{
1964 u8 ex = 0; 2104 u8 ex = 0;
@@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
2000 int rc = X86EMUL_CONTINUE; 2140 int rc = X86EMUL_CONTINUE;
2001 2141
2002 switch (ctxt->modrm_reg) { 2142 switch (ctxt->modrm_reg) {
2003 case 0: /* inc */
2004 emulate_1op(ctxt, "inc");
2005 break;
2006 case 1: /* dec */
2007 emulate_1op(ctxt, "dec");
2008 break;
2009 case 2: /* call near abs */ { 2143 case 2: /* call near abs */ {
2010 long int old_eip; 2144 long int old_eip;
2011 old_eip = ctxt->_eip; 2145 old_eip = ctxt->_eip;
@@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
2075 /* Save real source value, then compare EAX against destination. */ 2209 /* Save real source value, then compare EAX against destination. */
2076 ctxt->src.orig_val = ctxt->src.val; 2210 ctxt->src.orig_val = ctxt->src.val;
2077 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); 2211 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
2078 emulate_2op_SrcV(ctxt, "cmp"); 2212 fastop(ctxt, em_cmp);
2079 2213
2080 if (ctxt->eflags & EFLG_ZF) { 2214 if (ctxt->eflags & EFLG_ZF) {
2081 /* Success: write back to memory. */ 2215 /* Success: write back to memory. */
@@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2843 ctxt->src.type = OP_IMM; 2977 ctxt->src.type = OP_IMM;
2844 ctxt->src.val = 0; 2978 ctxt->src.val = 0;
2845 ctxt->src.bytes = 1; 2979 ctxt->src.bytes = 1;
2846 emulate_2op_SrcV(ctxt, "or"); 2980 fastop(ctxt, em_or);
2847 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); 2981 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2848 if (cf) 2982 if (cf)
2849 ctxt->eflags |= X86_EFLAGS_CF; 2983 ctxt->eflags |= X86_EFLAGS_CF;
@@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
2852 return X86EMUL_CONTINUE; 2986 return X86EMUL_CONTINUE;
2853} 2987}
2854 2988
2989static int em_aad(struct x86_emulate_ctxt *ctxt)
2990{
2991 u8 al = ctxt->dst.val & 0xff;
2992 u8 ah = (ctxt->dst.val >> 8) & 0xff;
2993
2994 al = (al + (ah * ctxt->src.val)) & 0xff;
2995
2996 ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
2997
2998 /* Set PF, ZF, SF */
2999 ctxt->src.type = OP_IMM;
3000 ctxt->src.val = 0;
3001 ctxt->src.bytes = 1;
3002 fastop(ctxt, em_or);
3003
3004 return X86EMUL_CONTINUE;
3005}
3006
2855static int em_call(struct x86_emulate_ctxt *ctxt) 3007static int em_call(struct x86_emulate_ctxt *ctxt)
2856{ 3008{
2857 long rel = ctxt->src.val; 3009 long rel = ctxt->src.val;
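
em_aad() above implements AAD (ASCII adjust before division): AL = AL + AH * imm (imm is 10 for the one-byte D5 0A encoding), AH becomes 0, and PF/ZF/SF are refreshed from the result, which the hunk does by pushing the value through em_or against zero -- the parity fix named in the shortlog. A plain-C restatement of that arithmetic, purely illustrative:

#include <stdio.h>
#include <stdint.h>

/* AAD: fold AH into AL in the given base and clear AH.
 * Returns the new 16-bit AX value. */
static uint16_t aad(uint16_t ax, uint8_t base)
{
	uint8_t al = ax & 0xff;
	uint8_t ah = ax >> 8;

	al = (uint8_t)(al + ah * base);
	return al;			/* AH becomes zero */
}

/* Hardware computes PF over the low 8 bits of the result;
 * an even number of set bits means PF = 1. */
static int parity8(uint8_t v)
{
	v ^= v >> 4;
	v ^= v >> 2;
	v ^= v >> 1;
	return !(v & 1);
}

int main(void)
{
	uint16_t ax = 0x0205;		/* AH=2, AL=5 -> 2*10 + 5 = 25 */
	uint16_t r  = aad(ax, 10);

	printf("AAD(0x%04x) = 0x%04x\n", ax, r);	/* 0x0019 */
	printf("PF=%d ZF=%d SF=%d\n",
	       parity8(r & 0xff), (r & 0xff) == 0, (r & 0x80) != 0);
	return 0;
}
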
@@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2900 return X86EMUL_CONTINUE; 3052 return X86EMUL_CONTINUE;
2901} 3053}
2902 3054
2903static int em_add(struct x86_emulate_ctxt *ctxt)
2904{
2905 emulate_2op_SrcV(ctxt, "add");
2906 return X86EMUL_CONTINUE;
2907}
2908
2909static int em_or(struct x86_emulate_ctxt *ctxt)
2910{
2911 emulate_2op_SrcV(ctxt, "or");
2912 return X86EMUL_CONTINUE;
2913}
2914
2915static int em_adc(struct x86_emulate_ctxt *ctxt)
2916{
2917 emulate_2op_SrcV(ctxt, "adc");
2918 return X86EMUL_CONTINUE;
2919}
2920
2921static int em_sbb(struct x86_emulate_ctxt *ctxt)
2922{
2923 emulate_2op_SrcV(ctxt, "sbb");
2924 return X86EMUL_CONTINUE;
2925}
2926
2927static int em_and(struct x86_emulate_ctxt *ctxt)
2928{
2929 emulate_2op_SrcV(ctxt, "and");
2930 return X86EMUL_CONTINUE;
2931}
2932
2933static int em_sub(struct x86_emulate_ctxt *ctxt)
2934{
2935 emulate_2op_SrcV(ctxt, "sub");
2936 return X86EMUL_CONTINUE;
2937}
2938
2939static int em_xor(struct x86_emulate_ctxt *ctxt)
2940{
2941 emulate_2op_SrcV(ctxt, "xor");
2942 return X86EMUL_CONTINUE;
2943}
2944
2945static int em_cmp(struct x86_emulate_ctxt *ctxt)
2946{
2947 emulate_2op_SrcV(ctxt, "cmp");
2948 /* Disable writeback. */
2949 ctxt->dst.type = OP_NONE;
2950 return X86EMUL_CONTINUE;
2951}
2952
2953static int em_test(struct x86_emulate_ctxt *ctxt)
2954{
2955 emulate_2op_SrcV(ctxt, "test");
2956 /* Disable writeback. */
2957 ctxt->dst.type = OP_NONE;
2958 return X86EMUL_CONTINUE;
2959}
2960
2961static int em_xchg(struct x86_emulate_ctxt *ctxt) 3055static int em_xchg(struct x86_emulate_ctxt *ctxt)
2962{ 3056{
2963 /* Write back the register source. */ 3057 /* Write back the register source. */
@@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
2970 return X86EMUL_CONTINUE; 3064 return X86EMUL_CONTINUE;
2971} 3065}
2972 3066
2973static int em_imul(struct x86_emulate_ctxt *ctxt)
2974{
2975 emulate_2op_SrcV_nobyte(ctxt, "imul");
2976 return X86EMUL_CONTINUE;
2977}
2978
2979static int em_imul_3op(struct x86_emulate_ctxt *ctxt) 3067static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2980{ 3068{
2981 ctxt->dst.val = ctxt->src2.val; 3069 ctxt->dst.val = ctxt->src2.val;
2982 return em_imul(ctxt); 3070 return fastop(ctxt, em_imul);
2983} 3071}
2984 3072
2985static int em_cwd(struct x86_emulate_ctxt *ctxt) 3073static int em_cwd(struct x86_emulate_ctxt *ctxt)
@@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
3300 return X86EMUL_CONTINUE; 3388 return X86EMUL_CONTINUE;
3301} 3389}
3302 3390
3303static int em_bt(struct x86_emulate_ctxt *ctxt)
3304{
3305 /* Disable writeback. */
3306 ctxt->dst.type = OP_NONE;
3307 /* only subword offset */
3308 ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
3309
3310 emulate_2op_SrcV_nobyte(ctxt, "bt");
3311 return X86EMUL_CONTINUE;
3312}
3313
3314static int em_bts(struct x86_emulate_ctxt *ctxt)
3315{
3316 emulate_2op_SrcV_nobyte(ctxt, "bts");
3317 return X86EMUL_CONTINUE;
3318}
3319
3320static int em_btr(struct x86_emulate_ctxt *ctxt)
3321{
3322 emulate_2op_SrcV_nobyte(ctxt, "btr");
3323 return X86EMUL_CONTINUE;
3324}
3325
3326static int em_btc(struct x86_emulate_ctxt *ctxt)
3327{
3328 emulate_2op_SrcV_nobyte(ctxt, "btc");
3329 return X86EMUL_CONTINUE;
3330}
3331
3332static int em_bsf(struct x86_emulate_ctxt *ctxt)
3333{
3334 emulate_2op_SrcV_nobyte(ctxt, "bsf");
3335 return X86EMUL_CONTINUE;
3336}
3337
3338static int em_bsr(struct x86_emulate_ctxt *ctxt)
3339{
3340 emulate_2op_SrcV_nobyte(ctxt, "bsr");
3341 return X86EMUL_CONTINUE;
3342}
3343
3344static int em_cpuid(struct x86_emulate_ctxt *ctxt) 3391static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3345{ 3392{
3346 u32 eax, ebx, ecx, edx; 3393 u32 eax, ebx, ecx, edx;
@@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3572#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } 3619#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3573#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } 3620#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
3574#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } 3621#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
3622#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
3575#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3623#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3624#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
3576#define II(_f, _e, _i) \ 3625#define II(_f, _e, _i) \
3577 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } 3626 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3578#define IIP(_f, _e, _i, _p) \ 3627#define IIP(_f, _e, _i, _p) \
@@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
3583#define D2bv(_f) D((_f) | ByteOp), D(_f) 3632#define D2bv(_f) D((_f) | ByteOp), D(_f)
3584#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) 3633#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3585#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3634#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3635#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
3586#define I2bvIP(_f, _e, _i, _p) \ 3636#define I2bvIP(_f, _e, _i, _p) \
3587 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) 3637 IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
3588 3638
3589#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ 3639#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3590 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ 3640 F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3591 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) 3641 F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3592 3642
3593static const struct opcode group7_rm1[] = { 3643static const struct opcode group7_rm1[] = {
3594 DI(SrcNone | Priv, monitor), 3644 DI(SrcNone | Priv, monitor),
@@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = {
3614}; 3664};
3615 3665
3616static const struct opcode group1[] = { 3666static const struct opcode group1[] = {
3617 I(Lock, em_add), 3667 F(Lock, em_add),
3618 I(Lock | PageTable, em_or), 3668 F(Lock | PageTable, em_or),
3619 I(Lock, em_adc), 3669 F(Lock, em_adc),
3620 I(Lock, em_sbb), 3670 F(Lock, em_sbb),
3621 I(Lock | PageTable, em_and), 3671 F(Lock | PageTable, em_and),
3622 I(Lock, em_sub), 3672 F(Lock, em_sub),
3623 I(Lock, em_xor), 3673 F(Lock, em_xor),
3624 I(0, em_cmp), 3674 F(NoWrite, em_cmp),
3625}; 3675};
3626 3676
3627static const struct opcode group1A[] = { 3677static const struct opcode group1A[] = {
3628 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, 3678 I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
3629}; 3679};
3630 3680
3681static const struct opcode group2[] = {
3682 F(DstMem | ModRM, em_rol),
3683 F(DstMem | ModRM, em_ror),
3684 F(DstMem | ModRM, em_rcl),
3685 F(DstMem | ModRM, em_rcr),
3686 F(DstMem | ModRM, em_shl),
3687 F(DstMem | ModRM, em_shr),
3688 F(DstMem | ModRM, em_shl),
3689 F(DstMem | ModRM, em_sar),
3690};
3691
3631static const struct opcode group3[] = { 3692static const struct opcode group3[] = {
3632 I(DstMem | SrcImm, em_test), 3693 F(DstMem | SrcImm | NoWrite, em_test),
3633 I(DstMem | SrcImm, em_test), 3694 F(DstMem | SrcImm | NoWrite, em_test),
3634 I(DstMem | SrcNone | Lock, em_not), 3695 F(DstMem | SrcNone | Lock, em_not),
3635 I(DstMem | SrcNone | Lock, em_neg), 3696 F(DstMem | SrcNone | Lock, em_neg),
3636 I(SrcMem, em_mul_ex), 3697 I(SrcMem, em_mul_ex),
3637 I(SrcMem, em_imul_ex), 3698 I(SrcMem, em_imul_ex),
3638 I(SrcMem, em_div_ex), 3699 I(SrcMem, em_div_ex),
@@ -3640,14 +3701,14 @@ static const struct opcode group3[] = {
3640}; 3701};
3641 3702
3642static const struct opcode group4[] = { 3703static const struct opcode group4[] = {
3643 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3704 F(ByteOp | DstMem | SrcNone | Lock, em_inc),
3644 I(ByteOp | DstMem | SrcNone | Lock, em_grp45), 3705 F(ByteOp | DstMem | SrcNone | Lock, em_dec),
3645 N, N, N, N, N, N, 3706 N, N, N, N, N, N,
3646}; 3707};
3647 3708
3648static const struct opcode group5[] = { 3709static const struct opcode group5[] = {
3649 I(DstMem | SrcNone | Lock, em_grp45), 3710 F(DstMem | SrcNone | Lock, em_inc),
3650 I(DstMem | SrcNone | Lock, em_grp45), 3711 F(DstMem | SrcNone | Lock, em_dec),
3651 I(SrcMem | Stack, em_grp45), 3712 I(SrcMem | Stack, em_grp45),
3652 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), 3713 I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
3653 I(SrcMem | Stack, em_grp45), 3714 I(SrcMem | Stack, em_grp45),
@@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { {
3682 3743
3683static const struct opcode group8[] = { 3744static const struct opcode group8[] = {
3684 N, N, N, N, 3745 N, N, N, N,
3685 I(DstMem | SrcImmByte, em_bt), 3746 F(DstMem | SrcImmByte | NoWrite, em_bt),
3686 I(DstMem | SrcImmByte | Lock | PageTable, em_bts), 3747 F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
3687 I(DstMem | SrcImmByte | Lock, em_btr), 3748 F(DstMem | SrcImmByte | Lock, em_btr),
3688 I(DstMem | SrcImmByte | Lock | PageTable, em_btc), 3749 F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
3689}; 3750};
3690 3751
3691static const struct group_dual group9 = { { 3752static const struct group_dual group9 = { {
@@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = {
3707 I(0, em_mov), N, N, N, 3768 I(0, em_mov), N, N, N,
3708}; 3769};
3709 3770
3771static const struct escape escape_d9 = { {
3772 N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
3773}, {
3774 /* 0xC0 - 0xC7 */
3775 N, N, N, N, N, N, N, N,
3776 /* 0xC8 - 0xCF */
3777 N, N, N, N, N, N, N, N,
3778 /* 0xD0 - 0xC7 */
3779 N, N, N, N, N, N, N, N,
3780 /* 0xD8 - 0xDF */
3781 N, N, N, N, N, N, N, N,
3782 /* 0xE0 - 0xE7 */
3783 N, N, N, N, N, N, N, N,
3784 /* 0xE8 - 0xEF */
3785 N, N, N, N, N, N, N, N,
3786 /* 0xF0 - 0xF7 */
3787 N, N, N, N, N, N, N, N,
3788 /* 0xF8 - 0xFF */
3789 N, N, N, N, N, N, N, N,
3790} };
3791
3792static const struct escape escape_db = { {
3793 N, N, N, N, N, N, N, N,
3794}, {
3795 /* 0xC0 - 0xC7 */
3796 N, N, N, N, N, N, N, N,
3797 /* 0xC8 - 0xCF */
3798 N, N, N, N, N, N, N, N,
3799 /* 0xD0 - 0xC7 */
3800 N, N, N, N, N, N, N, N,
3801 /* 0xD8 - 0xDF */
3802 N, N, N, N, N, N, N, N,
3803 /* 0xE0 - 0xE7 */
3804 N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
3805 /* 0xE8 - 0xEF */
3806 N, N, N, N, N, N, N, N,
3807 /* 0xF0 - 0xF7 */
3808 N, N, N, N, N, N, N, N,
3809 /* 0xF8 - 0xFF */
3810 N, N, N, N, N, N, N, N,
3811} };
3812
3813static const struct escape escape_dd = { {
3814 N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
3815}, {
3816 /* 0xC0 - 0xC7 */
3817 N, N, N, N, N, N, N, N,
3818 /* 0xC8 - 0xCF */
3819 N, N, N, N, N, N, N, N,
3820 /* 0xD0 - 0xC7 */
3821 N, N, N, N, N, N, N, N,
3822 /* 0xD8 - 0xDF */
3823 N, N, N, N, N, N, N, N,
3824 /* 0xE0 - 0xE7 */
3825 N, N, N, N, N, N, N, N,
3826 /* 0xE8 - 0xEF */
3827 N, N, N, N, N, N, N, N,
3828 /* 0xF0 - 0xF7 */
3829 N, N, N, N, N, N, N, N,
3830 /* 0xF8 - 0xFF */
3831 N, N, N, N, N, N, N, N,
3832} };
3833
3710static const struct opcode opcode_table[256] = { 3834static const struct opcode opcode_table[256] = {
3711 /* 0x00 - 0x07 */ 3835 /* 0x00 - 0x07 */
3712 I6ALU(Lock, em_add), 3836 F6ALU(Lock, em_add),
3713 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), 3837 I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
3714 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), 3838 I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
3715 /* 0x08 - 0x0F */ 3839 /* 0x08 - 0x0F */
3716 I6ALU(Lock | PageTable, em_or), 3840 F6ALU(Lock | PageTable, em_or),
3717 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), 3841 I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
3718 N, 3842 N,
3719 /* 0x10 - 0x17 */ 3843 /* 0x10 - 0x17 */
3720 I6ALU(Lock, em_adc), 3844 F6ALU(Lock, em_adc),
3721 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), 3845 I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
3722 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), 3846 I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
3723 /* 0x18 - 0x1F */ 3847 /* 0x18 - 0x1F */
3724 I6ALU(Lock, em_sbb), 3848 F6ALU(Lock, em_sbb),
3725 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), 3849 I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
3726 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), 3850 I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
3727 /* 0x20 - 0x27 */ 3851 /* 0x20 - 0x27 */
3728 I6ALU(Lock | PageTable, em_and), N, N, 3852 F6ALU(Lock | PageTable, em_and), N, N,
3729 /* 0x28 - 0x2F */ 3853 /* 0x28 - 0x2F */
3730 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), 3854 F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3731 /* 0x30 - 0x37 */ 3855 /* 0x30 - 0x37 */
3732 I6ALU(Lock, em_xor), N, N, 3856 F6ALU(Lock, em_xor), N, N,
3733 /* 0x38 - 0x3F */ 3857 /* 0x38 - 0x3F */
3734 I6ALU(0, em_cmp), N, N, 3858 F6ALU(NoWrite, em_cmp), N, N,
3735 /* 0x40 - 0x4F */ 3859 /* 0x40 - 0x4F */
3736 X16(D(DstReg)), 3860 X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
3737 /* 0x50 - 0x57 */ 3861 /* 0x50 - 0x57 */
3738 X8(I(SrcReg | Stack, em_push)), 3862 X8(I(SrcReg | Stack, em_push)),
3739 /* 0x58 - 0x5F */ 3863 /* 0x58 - 0x5F */
@@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = {
3757 G(DstMem | SrcImm, group1), 3881 G(DstMem | SrcImm, group1),
3758 G(ByteOp | DstMem | SrcImm | No64, group1), 3882 G(ByteOp | DstMem | SrcImm | No64, group1),
3759 G(DstMem | SrcImmByte, group1), 3883 G(DstMem | SrcImmByte, group1),
3760 I2bv(DstMem | SrcReg | ModRM, em_test), 3884 F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
3761 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), 3885 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
3762 /* 0x88 - 0x8F */ 3886 /* 0x88 - 0x8F */
3763 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), 3887 I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
@@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = {
3777 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3901 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3778 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3902 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
3779 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3903 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3780 I2bv(SrcSI | DstDI | String, em_cmp), 3904 F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
3781 /* 0xA8 - 0xAF */ 3905 /* 0xA8 - 0xAF */
3782 I2bv(DstAcc | SrcImm, em_test), 3906 F2bv(DstAcc | SrcImm | NoWrite, em_test),
3783 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3907 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3784 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3908 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3785 I2bv(SrcAcc | DstDI | String, em_cmp), 3909 F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
3786 /* 0xB0 - 0xB7 */ 3910 /* 0xB0 - 0xB7 */
3787 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), 3911 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
3788 /* 0xB8 - 0xBF */ 3912 /* 0xB8 - 0xBF */
3789 X8(I(DstReg | SrcImm | Mov, em_mov)), 3913 X8(I(DstReg | SrcImm64 | Mov, em_mov)),
3790 /* 0xC0 - 0xC7 */ 3914 /* 0xC0 - 0xC7 */
3791 D2bv(DstMem | SrcImmByte | ModRM), 3915 G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
3792 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), 3916 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3793 I(ImplicitOps | Stack, em_ret), 3917 I(ImplicitOps | Stack, em_ret),
3794 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), 3918 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
@@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = {
3800 D(ImplicitOps), DI(SrcImmByte, intn), 3924 D(ImplicitOps), DI(SrcImmByte, intn),
3801 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), 3925 D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
3802 /* 0xD0 - 0xD7 */ 3926 /* 0xD0 - 0xD7 */
3803 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3927 G(Src2One | ByteOp, group2), G(Src2One, group2),
3804 N, N, N, N, 3928 G(Src2CL | ByteOp, group2), G(Src2CL, group2),
3929 N, I(DstAcc | SrcImmByte | No64, em_aad), N, N,
3805 /* 0xD8 - 0xDF */ 3930 /* 0xD8 - 0xDF */
3806 N, N, N, N, N, N, N, N, 3931 N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
3807 /* 0xE0 - 0xE7 */ 3932 /* 0xE0 - 0xE7 */
3808 X3(I(SrcImmByte, em_loop)), 3933 X3(I(SrcImmByte, em_loop)),
3809 I(SrcImmByte, em_jcxz), 3934 I(SrcImmByte, em_jcxz),
@@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = {
3870 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3995 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3871 /* 0xA0 - 0xA7 */ 3996 /* 0xA0 - 0xA7 */
3872 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), 3997 I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
3873 II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), 3998 II(ImplicitOps, em_cpuid, cpuid),
3874 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3999 F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
3875 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 4000 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
4001 F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
3876 /* 0xA8 - 0xAF */ 4002 /* 0xA8 - 0xAF */
3877 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), 4003 I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
3878 DI(ImplicitOps, rsm), 4004 DI(ImplicitOps, rsm),
3879 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4005 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
3880 D(DstMem | SrcReg | Src2ImmByte | ModRM), 4006 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
3881 D(DstMem | SrcReg | Src2CL | ModRM), 4007 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
3882 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 4008 D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
3883 /* 0xB0 - 0xB7 */ 4009 /* 0xB0 - 0xB7 */
3884 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), 4010 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
3885 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), 4011 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
3886 I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), 4012 F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
3887 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), 4013 I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
3888 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), 4014 I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
3889 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4015 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3890 /* 0xB8 - 0xBF */ 4016 /* 0xB8 - 0xBF */
3891 N, N, 4017 N, N,
3892 G(BitOp, group8), 4018 G(BitOp, group8),
3893 I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), 4019 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
3894 I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), 4020 F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
3895 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), 4021 D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3896 /* 0xC0 - 0xC7 */ 4022 /* 0xC0 - 0xC7 */
3897 D2bv(DstMem | SrcReg | ModRM | Lock), 4023 D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3950 case 4: 4076 case 4:
3951 op->val = insn_fetch(s32, ctxt); 4077 op->val = insn_fetch(s32, ctxt);
3952 break; 4078 break;
4079 case 8:
4080 op->val = insn_fetch(s64, ctxt);
4081 break;
3953 } 4082 }
3954 if (!sign_extension) { 4083 if (!sign_extension) {
3955 switch (op->bytes) { 4084 switch (op->bytes) {
@@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4028 case OpImm: 4157 case OpImm:
4029 rc = decode_imm(ctxt, op, imm_size(ctxt), true); 4158 rc = decode_imm(ctxt, op, imm_size(ctxt), true);
4030 break; 4159 break;
4160 case OpImm64:
4161 rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
4162 break;
4031 case OpMem8: 4163 case OpMem8:
4032 ctxt->memop.bytes = 1; 4164 ctxt->memop.bytes = 1;
4033 goto mem_common; 4165 goto mem_common;
@@ -4222,6 +4354,12 @@ done_prefixes:
4222 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; 4354 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
4223 } 4355 }
4224 break; 4356 break;
4357 case Escape:
4358 if (ctxt->modrm > 0xbf)
4359 opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
4360 else
4361 opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
4362 break;
4225 default: 4363 default:
4226 return EMULATION_FAILED; 4364 return EMULATION_FAILED;
4227 } 4365 }
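
The new Escape case splits an x87 escape opcode on its ModRM byte: values of 0xc0 and above index the 64-entry high[] table directly, while anything below is a memory form and only the reg field (bits 5:3) selects among op[0..7] -- which is how escape_d9/escape_db/escape_dd above are laid out. A small sketch of that selection; the printed slot names refer to the tables above, the code itself is illustrative:

#include <stdio.h>

/* Mirror of the decode rule added above: which slot of a struct escape
 * a given ModRM byte selects. */
struct escape_index { int high; int slot; };

static struct escape_index escape_decode(unsigned char modrm)
{
	struct escape_index idx;

	if (modrm > 0xbf) {	/* register form: full byte indexes high[] */
		idx.high = 1;
		idx.slot = modrm - 0xc0;
	} else {		/* memory form: only the reg field matters */
		idx.high = 0;
		idx.slot = (modrm >> 3) & 7;
	}
	return idx;
}

int main(void)
{
	/* D9 /7 memory form -> op[7] (em_fnstcw in escape_d9 above). */
	struct escape_index a = escape_decode(0x3d);
	/* DB E3 (FNINIT) -> high[0x23] (em_fninit in escape_db above). */
	struct escape_index b = escape_decode(0xe3);

	printf("modrm 0x3d -> %s[0x%02x]\n", a.high ? "high" : "op", a.slot);
	printf("modrm 0xe3 -> %s[0x%02x]\n", b.high ? "high" : "op", b.slot);
	return 0;
}
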
@@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
4354 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); 4492 read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
4355} 4493}
4356 4494
4495static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
4496{
4497 ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
4498 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
4499 asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
4500 : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
4501 : "c"(ctxt->src2.val), [fastop]"S"(fop));
4502 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
4503 return X86EMUL_CONTINUE;
4504}
4357 4505
4358int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 4506int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
4359{ 4507{
@@ -4483,6 +4631,13 @@ special_insn:
4483 } 4631 }
4484 4632
4485 if (ctxt->execute) { 4633 if (ctxt->execute) {
4634 if (ctxt->d & Fastop) {
4635 void (*fop)(struct fastop *) = (void *)ctxt->execute;
4636 rc = fastop(ctxt, fop);
4637 if (rc != X86EMUL_CONTINUE)
4638 goto done;
4639 goto writeback;
4640 }
4486 rc = ctxt->execute(ctxt); 4641 rc = ctxt->execute(ctxt);
4487 if (rc != X86EMUL_CONTINUE) 4642 if (rc != X86EMUL_CONTINUE)
4488 goto done; 4643 goto done;
@@ -4493,12 +4648,6 @@ special_insn:
4493 goto twobyte_insn; 4648 goto twobyte_insn;
4494 4649
4495 switch (ctxt->b) { 4650 switch (ctxt->b) {
4496 case 0x40 ... 0x47: /* inc r16/r32 */
4497 emulate_1op(ctxt, "inc");
4498 break;
4499 case 0x48 ... 0x4f: /* dec r16/r32 */
4500 emulate_1op(ctxt, "dec");
4501 break;
4502 case 0x63: /* movsxd */ 4651 case 0x63: /* movsxd */
4503 if (ctxt->mode != X86EMUL_MODE_PROT64) 4652 if (ctxt->mode != X86EMUL_MODE_PROT64)
4504 goto cannot_emulate; 4653 goto cannot_emulate;
@@ -4523,9 +4672,6 @@ special_insn:
4523 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; 4672 case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
4524 } 4673 }
4525 break; 4674 break;
4526 case 0xc0 ... 0xc1:
4527 rc = em_grp2(ctxt);
4528 break;
4529 case 0xcc: /* int3 */ 4675 case 0xcc: /* int3 */
4530 rc = emulate_int(ctxt, 3); 4676 rc = emulate_int(ctxt, 3);
4531 break; 4677 break;
@@ -4536,13 +4682,6 @@ special_insn:
4536 if (ctxt->eflags & EFLG_OF) 4682 if (ctxt->eflags & EFLG_OF)
4537 rc = emulate_int(ctxt, 4); 4683 rc = emulate_int(ctxt, 4);
4538 break; 4684 break;
4539 case 0xd0 ... 0xd1: /* Grp2 */
4540 rc = em_grp2(ctxt);
4541 break;
4542 case 0xd2 ... 0xd3: /* Grp2 */
4543 ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX);
4544 rc = em_grp2(ctxt);
4545 break;
4546 case 0xe9: /* jmp rel */ 4685 case 0xe9: /* jmp rel */
4547 case 0xeb: /* jmp rel short */ 4686 case 0xeb: /* jmp rel short */
4548 jmp_rel(ctxt, ctxt->src.val); 4687 jmp_rel(ctxt, ctxt->src.val);
@@ -4661,14 +4800,6 @@ twobyte_insn:
4661 case 0x90 ... 0x9f: /* setcc r/m8 */ 4800 case 0x90 ... 0x9f: /* setcc r/m8 */
4662 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 4801 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
4663 break; 4802 break;
4664 case 0xa4: /* shld imm8, r, r/m */
4665 case 0xa5: /* shld cl, r, r/m */
4666 emulate_2op_cl(ctxt, "shld");
4667 break;
4668 case 0xac: /* shrd imm8, r, r/m */
4669 case 0xad: /* shrd cl, r, r/m */
4670 emulate_2op_cl(ctxt, "shrd");
4671 break;
4672 case 0xae: /* clflush */ 4803 case 0xae: /* clflush */
4673 break; 4804 break;
4674 case 0xb6 ... 0xb7: /* movzx */ 4805 case 0xb6 ... 0xb7: /* movzx */
@@ -4682,7 +4813,7 @@ twobyte_insn:
4682 (s16) ctxt->src.val; 4813 (s16) ctxt->src.val;
4683 break; 4814 break;
4684 case 0xc0 ... 0xc1: /* xadd */ 4815 case 0xc0 ... 0xc1: /* xadd */
4685 emulate_2op_SrcV(ctxt, "add"); 4816 fastop(ctxt, em_add);
4686 /* Write back the register source. */ 4817 /* Write back the register source. */
4687 ctxt->src.val = ctxt->dst.orig_val; 4818 ctxt->src.val = ctxt->dst.orig_val;
4688 write_register_operand(&ctxt->src); 4819 write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 11300d2fa714..c1d30b2fc9bb 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm)
122 */ 122 */
123 remaining = hrtimer_get_remaining(&ps->timer); 123 remaining = hrtimer_get_remaining(&ps->timer);
124 elapsed = ps->period - ktime_to_ns(remaining); 124 elapsed = ps->period - ktime_to_ns(remaining);
125 elapsed = mod_64(elapsed, ps->period);
126 125
127 return elapsed; 126 return elapsed;
128} 127}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 848206df0967..cc31f7c06d3d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
241 int irq, irq2, intno; 241 int irq, irq2, intno;
242 struct kvm_pic *s = pic_irqchip(kvm); 242 struct kvm_pic *s = pic_irqchip(kvm);
243 243
244 s->output = 0;
245
244 pic_lock(s); 246 pic_lock(s);
245 irq = pic_get_irq(&s->pics[0]); 247 irq = pic_get_irq(&s->pics[0]);
246 if (irq >= 0) { 248 if (irq >= 0) {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1618bd..484bc874688b 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);

 /*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+static int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+        if (kvm_apic_accept_pic_intr(v))
+                return pic_irqchip(v->kvm)->output;     /* PIC */
+        else
+                return 0;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+        if (!irqchip_in_kernel(v->kvm))
+                return v->arch.interrupt.pending;
+
+        if (kvm_cpu_has_extint(v))
+                return 1;
+
+        if (kvm_apic_vid_enabled(v->kvm))
+                return 0;
+
+        return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
  * check if there is pending interrupt without
  * intack.
  */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
-        struct kvm_pic *s;
-
         if (!irqchip_in_kernel(v->kvm))
                 return v->arch.interrupt.pending;

-        if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
-                if (kvm_apic_accept_pic_intr(v)) {
-                        s = pic_irqchip(v->kvm);        /* PIC */
-                        return s->output;
-                } else
-                        return 0;
-        }
-        return 1;
+        if (kvm_cpu_has_extint(v))
+                return 1;
+
+        return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);

 /*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+        if (kvm_cpu_has_extint(v))
+                return kvm_pic_read_irq(v->kvm); /* PIC */
+        return -1;
+}
+
+/*
  * Read pending interrupt vector and intack.
  */
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
-        struct kvm_pic *s;
         int vector;

         if (!irqchip_in_kernel(v->kvm))
                 return v->arch.interrupt.nr;

-        vector = kvm_get_apic_interrupt(v);     /* APIC */
-        if (vector == -1) {
-                if (kvm_apic_accept_pic_intr(v)) {
-                        s = pic_irqchip(v->kvm);
-                        s->output = 0;          /* PIC */
-                        vector = kvm_pic_read_irq(v->kvm);
-                }
-        }
-        return vector;
+        vector = kvm_cpu_get_extint(v);
+
+        if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+                return vector;                  /* PIC */
+
+        return kvm_get_apic_interrupt(v);       /* APIC */
 }
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);

 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f527f107..02b51dd4e4ad 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic)
         (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
          APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)

-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
-{
-        return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
-}
-
 static inline int kvm_apic_id(struct kvm_lapic *apic)
 {
         return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }

-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+                                struct kvm_lapic_irq *irq,
+                                u64 *eoi_exit_bitmap)
 {
-        u16 cid;
-        ldr >>= 32 - map->ldr_bits;
-        cid = (ldr >> map->cid_shift) & map->cid_mask;
+        struct kvm_lapic **dst;
+        struct kvm_apic_map *map;
+        unsigned long bitmap = 1;
+        int i;

-        BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+        rcu_read_lock();
+        map = rcu_dereference(vcpu->kvm->arch.apic_map);

-        return cid;
-}
+        if (unlikely(!map)) {
+                __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
+                goto out;
+        }

-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
-        ldr >>= (32 - map->ldr_bits);
-        return ldr & map->lid_mask;
+        if (irq->dest_mode == 0) { /* physical mode */
+                if (irq->delivery_mode == APIC_DM_LOWEST ||
+                                irq->dest_id == 0xff) {
+                        __set_bit(irq->vector,
+                                  (unsigned long *)eoi_exit_bitmap);
+                        goto out;
+                }
+                dst = &map->phys_map[irq->dest_id & 0xff];
+        } else {
+                u32 mda = irq->dest_id << (32 - map->ldr_bits);
+
+                dst = map->logical_map[apic_cluster_id(map, mda)];
+
+                bitmap = apic_logical_id(map, mda);
+        }
+
+        for_each_set_bit(i, &bitmap, 16) {
+                if (!dst[i])
+                        continue;
+                if (dst[i]->vcpu == vcpu) {
+                        __set_bit(irq->vector,
+                                        (unsigned long *)eoi_exit_bitmap);
+                        break;
+                }
+        }
+
+out:
+        rcu_read_unlock();
 }

 static void recalculate_apic_map(struct kvm *kvm)
@@ -230,6 +255,8 @@ out:

         if (old)
                 kfree_rcu(old, rcu);
+
+        kvm_ioapic_make_eoibitmap_request(kvm);
 }

 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 {
         int result;

+        /*
+         * Note that irr_pending is just a hint. It will be always
+         * true with virtual interrupt delivery enabled.
+         */
         if (!apic->irr_pending)
                 return -1;

@@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
 {
         int result;
+
+        /* Note that isr_count is always 1 with vid enabled */
         if (!apic->isr_count)
                 return -1;
         if (likely(apic->highest_isr_cache != -1))
@@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
         return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }

+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+        if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+            kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+                int trigger_mode;
+                if (apic_test_vector(vector, apic->regs + APIC_TMR))
+                        trigger_mode = IOAPIC_LEVEL_TRIG;
+                else
+                        trigger_mode = IOAPIC_EDGE_TRIG;
+                kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+        }
+}
+
 static int apic_set_eoi(struct kvm_lapic *apic)
 {
         int vector = apic_find_highest_isr(apic);
@@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic)
         apic_clear_isr(vector, apic);
         apic_update_ppr(apic);

-        if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
-            kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
-                int trigger_mode;
-                if (apic_test_vector(vector, apic->regs + APIC_TMR))
-                        trigger_mode = IOAPIC_LEVEL_TRIG;
-                else
-                        trigger_mode = IOAPIC_EDGE_TRIG;
-                kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
-        }
+        kvm_ioapic_send_eoi(apic, vector);
         kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
         return vector;
 }

+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+        struct kvm_lapic *apic = vcpu->arch.apic;
+
+        trace_kvm_eoi(apic, vector);
+
+        kvm_ioapic_send_eoi(apic, vector);
+        kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
 static void apic_send_ipi(struct kvm_lapic *apic)
 {
         u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR);
@@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);

+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+        u32 val = 0;
+
+        /* hw has done the conditional check and inst decode */
+        offset &= 0xff0;
+
+        apic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+        /* TODO: optimize to just emulate side effect w/o one more write */
+        apic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
         struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)

 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
+        u64 old_value = vcpu->arch.apic_base;
         struct kvm_lapic *apic = vcpu->arch.apic;

         if (!apic) {
@@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                 value &= ~MSR_IA32_APICBASE_BSP;

         vcpu->arch.apic_base = value;
-        if (apic_x2apic_mode(apic)) {
-                u32 id = kvm_apic_id(apic);
-                u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-                kvm_apic_set_ldr(apic, ldr);
+        if ((old_value ^ value) & X2APIC_ENABLE) {
+                if (value & X2APIC_ENABLE) {
+                        u32 id = kvm_apic_id(apic);
+                        u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+                        kvm_apic_set_ldr(apic, ldr);
+                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
+                } else
+                        kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
         }
+
         apic->base_address = apic->vcpu->arch.apic_base &
                              MSR_IA32_APICBASE_BASE;

@@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
                 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
                 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
         }
-        apic->irr_pending = false;
-        apic->isr_count = 0;
+        apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+        apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
         apic->highest_isr_cache = -1;
         update_divide_count(apic);
         atomic_set(&apic->lapic_timer.pending, 0);
@@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
         update_divide_count(apic);
         start_apic_timer(apic);
         apic->irr_pending = true;
-        apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+        apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+                                1 : count_vectors(apic->regs + APIC_ISR);
         apic->highest_isr_cache = -1;
+        kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
         kvm_make_request(KVM_REQ_EVENT, vcpu);
 }

diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e5ebf9f3571f..1676d34ddb4e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
 void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);

+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
@@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
         return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
 }

+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+        return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+{
+        return kvm_x86_ops->vm_has_apicv(kvm);
+}
+
+static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
+{
+        u16 cid;
+        ldr >>= 32 - map->ldr_bits;
+        cid = (ldr >> map->cid_shift) & map->cid_mask;
+
+        BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
+
+        return cid;
+}
+
+static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
+{
+        ldr >>= (32 - map->ldr_bits);
+        return ldr & map->lid_mask;
+}
+
+void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
+                                struct kvm_lapic_irq *irq,
+                                u64 *eoi_bitmap);
+
 #endif
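The apic_cluster_id()/apic_logical_id() helpers that move into lapic.h above are pure bit extraction. A worked, stand-alone sketch follows; the ldr_bits/cid_shift/mask values are assumptions chosen to mimic xAPIC cluster mode, since the real values are filled in by recalculate_apic_map() and may differ.

/* Toy model of the LDR decoding helpers; not the in-kernel struct. */
#include <stdint.h>
#include <stdio.h>

struct toy_apic_map {
        uint8_t  ldr_bits;
        uint8_t  cid_shift;
        uint32_t cid_mask;
        uint32_t lid_mask;
};

static uint16_t toy_cluster_id(const struct toy_apic_map *m, uint32_t ldr)
{
        ldr >>= 32 - m->ldr_bits;
        return (ldr >> m->cid_shift) & m->cid_mask;
}

static uint16_t toy_logical_id(const struct toy_apic_map *m, uint32_t ldr)
{
        ldr >>= 32 - m->ldr_bits;
        return ldr & m->lid_mask;
}

int main(void)
{
        /* assumed xAPIC cluster mode: LDR[31:28] = cluster, LDR[27:24] = member bits */
        struct toy_apic_map m = { .ldr_bits = 8, .cid_shift = 4,
                                  .cid_mask = 0xf, .lid_mask = 0xf };
        uint32_t ldr = 0x23000000;      /* cluster 2, member mask 0b0011 */

        printf("cluster %u, logical mask 0x%x\n",
               (unsigned)toy_cluster_id(&m, ldr),
               (unsigned)toy_logical_id(&m, ldr));
        return 0;
}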
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01d7c2ad05f5..4ed3edbe06bd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)

 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+        return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+                (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }

 static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
         if (host_level == PT_PAGE_TABLE_LEVEL)
                 return host_level;

-        max_level = kvm_x86_ops->get_lpage_level() < host_level ?
-                kvm_x86_ops->get_lpage_level() : host_level;
+        max_level = min(kvm_x86_ops->get_lpage_level(), host_level);

         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
                 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 }

 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-                                 int level, bool pt_protect)
+                                 bool pt_protect)
 {
         u64 *sptep;
         struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
         while (mask) {
                 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                       PT_PAGE_TABLE_LEVEL, slot);
-                __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+                __rmap_write_protect(kvm, rmapp, false);

                 /* clear the first set bit */
                 mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
         for (i = PT_PAGE_TABLE_LEVEL;
              i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                 rmapp = __gfn_to_rmap(gfn, i, slot);
-                write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+                write_protected |= __rmap_write_protect(kvm, rmapp, true);
         }

         return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }

-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
         ASSERT(is_empty_shadow_page(sp->spt));
         hlist_del(&sp->hash_link);
-        if (!sp->role.direct)
-                free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
         list_del(&sp->link);
         free_page((unsigned long)sp->spt);
+        if (!sp->role.direct)
+                free_page((unsigned long)sp->gfns);
         kmem_cache_free(mmu_page_header_cache, sp);
 }

@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-        bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
         sp->parent_ptes = 0;
         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1973,9 +1958,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
         u64 spte;

-        spte = __pa(sp->spt)
-                | PT_PRESENT_MASK | PT_ACCESSED_MASK
-                | PT_WRITABLE_MASK | PT_USER_MASK;
+        spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
         mmu_spte_set(sptep, spte);
 }

@@ -2126,7 +2111,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
         do {
                 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                 WARN_ON(!sp->role.invalid || sp->root_count);
-                kvm_mmu_isolate_page(sp);
                 kvm_mmu_free_page(sp);
         } while (!list_empty(invalid_list));
 }
@@ -2144,6 +2128,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
          * change the value
          */

+        spin_lock(&kvm->mmu_lock);
+
         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
                         !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,6 +2144,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
         }

         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+        spin_unlock(&kvm->mmu_lock);
 }

 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2183,14 +2171,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);

-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-        int slot = memslot_id(kvm, gfn);
-        struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-        __set_bit(slot, sp->slot_bitmap);
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2332,9 +2312,8 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                 if (s->role.level != PT_PAGE_TABLE_LEVEL)
                         return 1;

-                if (!need_unsync && !s->unsync) {
+                if (!s->unsync)
                         need_unsync = true;
-                }
         }
         if (need_unsync)
                 kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2321,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 }

 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                    unsigned pte_access, int user_fault,
-                    int write_fault, int level,
+                    unsigned pte_access, int level,
                     gfn_t gfn, pfn_t pfn, bool speculative,
                     bool can_unsync, bool host_writable)
 {
@@ -2378,20 +2356,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

         spte |= (u64)pfn << PAGE_SHIFT;

-        if ((pte_access & ACC_WRITE_MASK)
-            || (!vcpu->arch.mmu.direct_map && write_fault
-                && !is_write_protection(vcpu) && !user_fault)) {
+        if (pte_access & ACC_WRITE_MASK) {

                 /*
-                 * There are two cases:
-                 * - the one is other vcpu creates new sp in the window
-                 *   between mapping_level() and acquiring mmu-lock.
-                 * - the another case is the new sp is created by itself
-                 *   (page-fault path) when guest uses the target gfn as
-                 *   its page table.
-                 * Both of these cases can be fixed by allowing guest to
-                 *   retry the access, it will refault, then we can establish
-                 *   the mapping by using small page.
+                 * Other vcpu creates new sp in the window between
+                 * mapping_level() and acquiring mmu-lock. We can
+                 * allow guest to retry the access, the mapping can
+                 * be fixed if guest refault.
                  */
                 if (level > PT_PAGE_TABLE_LEVEL &&
                     has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2370,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

-                if (!vcpu->arch.mmu.direct_map
-                    && !(pte_access & ACC_WRITE_MASK)) {
-                        spte &= ~PT_USER_MASK;
-                        /*
-                         * If we converted a user page to a kernel page,
-                         * so that the kernel can write to it when cr0.wp=0,
-                         * then we should prevent the kernel from executing it
-                         * if SMEP is enabled.
-                         */
-                        if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                                spte |= PT64_NX_MASK;
-                }
-
                 /*
                  * Optimization: for pte sync, if spte was writable the hash
                  * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2399,15 @@ done:
 }

 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                         unsigned pt_access, unsigned pte_access,
-                         int user_fault, int write_fault,
-                         int *emulate, int level, gfn_t gfn,
-                         pfn_t pfn, bool speculative,
+                         unsigned pte_access, int write_fault, int *emulate,
+                         int level, gfn_t gfn, pfn_t pfn, bool speculative,
                          bool host_writable)
 {
         int was_rmapped = 0;
         int rmap_count;

-        pgprintk("%s: spte %llx access %x write_fault %d"
-                 " user_fault %d gfn %llx\n",
-                 __func__, *sptep, pt_access,
-                 write_fault, user_fault, gfn);
+        pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+                 *sptep, write_fault, gfn);

         if (is_rmap_spte(*sptep)) {
                 /*
@@ -2477,9 +2431,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 was_rmapped = 1;
         }

-        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                      level, gfn, pfn, speculative, true,
-                      host_writable)) {
+        if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+              true, host_writable)) {
                 if (write_fault)
                         *emulate = 1;
                 kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2450,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 ++vcpu->kvm->stat.lpages;

         if (is_shadow_present_pte(*sptep)) {
-                page_header_update_slot(vcpu->kvm, sptep, gfn);
                 if (!was_rmapped) {
                         rmap_count = rmap_add(vcpu, sptep, gfn);
                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2523,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                 return -1;

         for (i = 0; i < ret; i++, gfn++, start++)
-                mmu_set_spte(vcpu, start, ACC_ALL,
-                             access, 0, 0, NULL,
-                             sp->role.level, gfn,
-                             page_to_pfn(pages[i]), true, true);
+                mmu_set_spte(vcpu, start, access, 0, NULL,
+                             sp->role.level, gfn, page_to_pfn(pages[i]),
+                             true, true);

         return 0;
 }
@@ -2633,11 +2584,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,

         for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
                 if (iterator.level == level) {
-                        unsigned pte_access = ACC_ALL;
-
-                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                     0, write, &emulate,
-                                     level, gfn, pfn, prefault, map_writable);
+                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+                                     write, &emulate, level, gfn, pfn,
+                                     prefault, map_writable);
                         direct_pte_prefetch(vcpu, iterator.sptep);
                         ++vcpu->stat.pf_fixed;
                         break;
@@ -2652,11 +2601,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                               iterator.level - 1,
                                               1, ACC_ALL, iterator.sptep);

-                        mmu_spte_set(iterator.sptep,
-                                     __pa(sp->spt)
-                                     | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                     | shadow_user_mask | shadow_x_mask
-                                     | shadow_accessed_mask);
+                        link_shadow_page(iterator.sptep, sp);
                 }
         }
         return emulate;
@@ -3719,6 +3664,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
         else
                 r = paging32_init_context(vcpu, context);

+        vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
         vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
         vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
         vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3831,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
                 *gpa &= ~(gpa_t)7;
                 *bytes = 8;
-                r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+                r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
                 if (r)
                         gentry = 0;
                 new = (const u8 *)&gentry;
@@ -4039,7 +3985,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                         & mask.word) && rmap_can_add(vcpu))
                         mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-                if (!remote_flush && need_remote_flush(entry, *spte))
+                if (need_remote_flush(entry, *spte))
                         remote_flush = true;
                 ++spte;
         }
@@ -4198,26 +4144,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)

 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
-        struct kvm_mmu_page *sp;
-        bool flush = false;
+        struct kvm_memory_slot *memslot;
+        gfn_t last_gfn;
+        int i;

-        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-                int i;
-                u64 *pt;
+        memslot = id_to_memslot(kvm->memslots, slot);
+        last_gfn = memslot->base_gfn + memslot->npages - 1;

-                if (!test_bit(slot, sp->slot_bitmap))
-                        continue;
+        spin_lock(&kvm->mmu_lock);

-                pt = sp->spt;
-                for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                        if (!is_shadow_present_pte(pt[i]) ||
-                            !is_last_spte(pt[i], sp->role.level))
-                                continue;
+        for (i = PT_PAGE_TABLE_LEVEL;
+             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+                unsigned long *rmapp;
+                unsigned long last_index, index;

-                        spte_write_protect(kvm, &pt[i], &flush, false);
+                rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+                last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+                for (index = 0; index <= last_index; ++index, ++rmapp) {
+                        if (*rmapp)
+                                __rmap_write_protect(kvm, rmapp, false);
+
+                        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                                kvm_flush_remote_tlbs(kvm);
+                                cond_resched_lock(&kvm->mmu_lock);
+                        }
                 }
         }
+
         kvm_flush_remote_tlbs(kvm);
+        spin_unlock(&kvm->mmu_lock);
 }

 void kvm_mmu_zap_all(struct kvm *kvm)
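The rewritten spte_is_locklessly_modifiable() test near the top of the mmu.c hunk is a straight equivalence: comparing against the combined mask requires both software-writable bits, exactly like the old !(~spte & mask) form. A small user-space check, with made-up bit positions (the real SPTE_HOST_WRITEABLE/SPTE_MMU_WRITEABLE positions differ), illustrates this:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SPTE_HOST_WRITEABLE     (1ULL << 61)    /* illustrative positions only */
#define SPTE_MMU_WRITEABLE      (1ULL << 62)

static bool old_test(uint64_t spte)
{
        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

static bool new_test(uint64_t spte)
{
        return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
}

int main(void)
{
        uint64_t samples[] = { 0, SPTE_HOST_WRITEABLE, SPTE_MMU_WRITEABLE,
                               SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE, ~0ULL };
        unsigned i;

        /* Both forms agree for every sample spte value. */
        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                assert(old_test(samples[i]) == new_test(samples[i]));
        return 0;
}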
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index cd6e98333ba3..b8f6172f4174 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
         TP_ARGS(sp)
 );

-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
-        TP_PROTO(struct kvm_mmu_page *sp),
-
-        TP_ARGS(sp)
-);
-
 TRACE_EVENT(
         mark_mmio_spte,
         TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 891eb6d93b8b..105dd5bd550e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
         pt_element_t pte;
         pt_element_t __user *uninitialized_var(ptep_user);
         gfn_t table_gfn;
-        unsigned index, pt_access, pte_access, accessed_dirty, shift;
+        unsigned index, pt_access, pte_access, accessed_dirty;
         gpa_t pte_gpa;
         int offset;
         const int write_fault = access & PFERR_WRITE_MASK;
@@ -249,16 +249,12 @@ retry_walk:

         if (!write_fault)
                 protect_clean_gpte(&pte_access, pte);
-
+        else
                 /*
-                 * On a write fault, fold the dirty bit into accessed_dirty by shifting it one
-                 * place right.
-                 *
-                 * On a read fault, do nothing.
-                 */
-        shift = write_fault >> ilog2(PFERR_WRITE_MASK);
-        shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
-        accessed_dirty &= pte >> shift;
+                 * On a write fault, fold the dirty bit into accessed_dirty by
+                 * shifting it one place right.
+                 */
+                accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

         if (unlikely(!accessed_dirty)) {
                 ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
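The simplified walker code above folds the guest PTE's dirty bit into accessed_dirty on write faults by shifting it down onto the accessed bit, so accessed_dirty stays non-zero only when the PTE is already dirty. A stand-alone sketch follows; the constants mirror the x86 PTE bit layout and the helper name is local to this example.

#include <stdint.h>
#include <stdio.h>

#define PT_ACCESSED_SHIFT       5
#define PT_DIRTY_SHIFT          6
#define PT_ACCESSED_MASK        (1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK           (1ULL << PT_DIRTY_SHIFT)

/* Write-fault path: require the dirty bit, expressed via the accessed bit. */
static uint64_t fold_dirty_into_accessed(uint64_t accessed_dirty, uint64_t pte)
{
        return accessed_dirty & (pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT));
}

int main(void)
{
        uint64_t accessed_dirty = PT_ACCESSED_MASK;     /* walker saw A set */
        uint64_t pte_clean = PT_ACCESSED_MASK;          /* A=1, D=0 */
        uint64_t pte_dirty = PT_ACCESSED_MASK | PT_DIRTY_MASK;

        /* 0 means the A/D update path must run; non-zero means nothing to do. */
        printf("clean pte -> %#llx\n",
               (unsigned long long)fold_dirty_into_accessed(accessed_dirty, pte_clean));
        printf("dirty pte -> %#llx\n",
               (unsigned long long)fold_dirty_into_accessed(accessed_dirty, pte_dirty));
        return 0;
}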
@@ -330,8 +326,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
          * we call mmu_set_spte() with host_writable = true because
          * pte_prefetch_gfn_to_pfn always gets a writable pfn.
          */
-        mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-                     NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+        mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
+                     gfn, pfn, true, true);

         return true;
 }
@@ -405,7 +401,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  */
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                          struct guest_walker *gw,
-                         int user_fault, int write_fault, int hlevel,
+                         int write_fault, int hlevel,
                          pfn_t pfn, bool map_writable, bool prefault)
 {
         struct kvm_mmu_page *sp = NULL;
@@ -413,9 +409,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
         unsigned direct_access, access = gw->pt_access;
         int top_level, emulate = 0;

-        if (!is_present_gpte(gw->ptes[gw->level - 1]))
-                return 0;
-
         direct_access = gw->pte_access;

         top_level = vcpu->arch.mmu.root_level;
@@ -477,9 +470,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
         }

         clear_sp_write_flooding_count(it.sptep);
-        mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-                     user_fault, write_fault, &emulate, it.level,
-                     gw->gfn, pfn, prefault, map_writable);
+        mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
+                     it.level, gw->gfn, pfn, prefault, map_writable);
         FNAME(pte_prefetch)(vcpu, gw, it.sptep);

         return emulate;
@@ -491,6 +483,46 @@ out_gpte_changed:
         return 0;
 }

+/*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+                              struct guest_walker *walker, int user_fault,
+                              bool *write_fault_to_shadow_pgtable)
+{
+        int level;
+        gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+        bool self_changed = false;
+
+        if (!(walker->pte_access & ACC_WRITE_MASK ||
+              (!is_write_protection(vcpu) && !user_fault)))
+                return false;
+
+        for (level = walker->level; level <= walker->max_level; level++) {
+                gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+                self_changed |= !(gfn & mask);
+                *write_fault_to_shadow_pgtable |= !gfn;
+        }
+
+        return self_changed;
+}
+
 /*
  * Page fault handler.  There are several causes for a page fault:
  *   - there is no shadow pte for the guest pte
@@ -516,7 +548,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         int level = PT_PAGE_TABLE_LEVEL;
         int force_pt_level;
         unsigned long mmu_seq;
-        bool map_writable;
+        bool map_writable, is_self_change_mapping;

         pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

@@ -544,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                 return 0;
         }

+        vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+        is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+              &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
         if (walker.level >= PT_DIRECTORY_LEVEL)
-                force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+                force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
+                   || is_self_change_mapping;
         else
                 force_pt_level = 1;
         if (!force_pt_level) {
@@ -564,6 +602,26 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                                          walker.gfn, pfn, walker.pte_access, &r))
                 return r;

+        /*
+         * Do not change pte_access if the pfn is a mmio page, otherwise
+         * we will cache the incorrect access into mmio spte.
+         */
+        if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+             !is_write_protection(vcpu) && !user_fault &&
+              !is_noslot_pfn(pfn)) {
+                walker.pte_access |= ACC_WRITE_MASK;
+                walker.pte_access &= ~ACC_USER_MASK;
+
+                /*
+                 * If we converted a user page to a kernel page,
+                 * so that the kernel can write to it when cr0.wp=0,
+                 * then we should prevent the kernel from executing it
+                 * if SMEP is enabled.
+                 */
+                if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+                        walker.pte_access &= ~ACC_EXEC_MASK;
+        }
+
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                 goto out_unlock;
@@ -572,7 +630,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
         kvm_mmu_free_some_pages(vcpu);
         if (!force_pt_level)
                 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
-        r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+        r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
                          level, pfn, map_writable, prefault);
         ++vcpu->stat.pf_fixed;
         kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
@@ -747,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

                 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

-                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+                set_spte(vcpu, &sp->spt[i], pte_access,
                          PT_PAGE_TABLE_LEVEL, gfn,
                          spte_to_pfn(sp->spt[i]), true, false,
                          host_writable);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d29d3cd1c156..e1b1ce21bc00 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3571,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }

+static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
+{
+        return;
+}
+
+static int svm_vm_has_apicv(struct kvm *kvm)
+{
+        return 0;
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+        return;
+}
+
+static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
+{
+        return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
         struct vcpu_svm *svm = to_svm(vcpu);
@@ -4290,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = {
         .enable_nmi_window = enable_nmi_window,
         .enable_irq_window = enable_irq_window,
         .update_cr8_intercept = update_cr8_intercept,
+        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
+        .vm_has_apicv = svm_vm_has_apicv,
+        .load_eoi_exitmap = svm_load_eoi_exitmap,
+        .hwapic_isr_update = svm_hwapic_isr_update,

         .set_tss_addr = svm_set_tss_addr,
         .get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1901e4..6667042714cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -84,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);

+static bool __read_mostly enable_apicv_reg_vid;
+
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -92,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);

-#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
-        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
-#define KVM_GUEST_CR0_MASK \
-        (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
-        (X86_CR0_WP | X86_CR0_NE)
+#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON \
         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 #define KVM_CR4_GUEST_OWNED_BITS \
@@ -624,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
                             struct kvm_segment *var, int seg);
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
                             struct kvm_segment *var, int seg);
+static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static u32 vmx_segment_access_rights(struct kvm_segment *var);

 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -638,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic;

 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -762,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 }

+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+        return vmcs_config.cpu_based_2nd_exec_ctrl &
+                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+        return vmcs_config.cpu_based_2nd_exec_ctrl &
+                SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+        return vmcs_config.cpu_based_2nd_exec_ctrl &
+                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
         return cpu_has_vmx_tpr_shadow() &&
@@ -1694,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
         __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
-        __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
         to_vmx(vcpu)->rflags = rflags;
         if (to_vmx(vcpu)->rmode.vm86_active) {
                 to_vmx(vcpu)->rmode.save_rflags = rflags;
@@ -1820,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
         vmx->guest_msrs[from] = tmp;
 }

+static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+        unsigned long *msr_bitmap;
+
+        if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+                if (is_long_mode(vcpu))
+                        msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+                else
+                        msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+        } else {
+                if (is_long_mode(vcpu))
+                        msr_bitmap = vmx_msr_bitmap_longmode;
+                else
+                        msr_bitmap = vmx_msr_bitmap_legacy;
+        }
+
+        vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+}
+
 /*
  * Set up the vmcs to automatically save and restore system
  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -1828,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
         int save_nmsrs, index;
-        unsigned long *msr_bitmap;

         save_nmsrs = 0;
 #ifdef CONFIG_X86_64
@@ -1860,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx)

         vmx->save_nmsrs = save_nmsrs;

-        if (cpu_has_vmx_msr_bitmap()) {
-                if (is_long_mode(&vmx->vcpu))
-                        msr_bitmap = vmx_msr_bitmap_longmode;
-                else
-                        msr_bitmap = vmx_msr_bitmap_legacy;
-
-                vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-        }
+        if (cpu_has_vmx_msr_bitmap())
+                vmx_set_msr_bitmap(&vmx->vcpu);
 }

 /*
@@ -2533,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
                 min2 = 0;
                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                        SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                         SECONDARY_EXEC_WBINVD_EXITING |
                         SECONDARY_EXEC_ENABLE_VPID |
                         SECONDARY_EXEC_ENABLE_EPT |
                         SECONDARY_EXEC_UNRESTRICTED_GUEST |
                         SECONDARY_EXEC_PAUSE_LOOP_EXITING |
                         SECONDARY_EXEC_RDTSCP |
-                        SECONDARY_EXEC_ENABLE_INVPCID;
+                        SECONDARY_EXEC_ENABLE_INVPCID |
+                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
                 if (adjust_vmx_controls(min2, opt2,
                                         MSR_IA32_VMX_PROCBASED_CTLS2,
                                         &_cpu_based_2nd_exec_control) < 0)
@@ -2550,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                         SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
+
+        if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+                _cpu_based_2nd_exec_control &= ~(
+                                SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
                 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
                    enabled */
@@ -2747,6 +2788,15 @@ static __init int hardware_setup(void)
         if (!cpu_has_vmx_ple())
                 ple_gap = 0;

+        if (!cpu_has_vmx_apic_register_virt() ||
+                                !cpu_has_vmx_virtual_intr_delivery())
+                enable_apicv_reg_vid = 0;
+
+        if (enable_apicv_reg_vid)
+                kvm_x86_ops->update_cr8_intercept = NULL;
+        else
+                kvm_x86_ops->hwapic_irr_update = NULL;
+
         if (nested)
                 nested_vmx_setup_ctls_msrs();

@@ -2758,18 +2808,28 @@ static __exit void hardware_unsetup(void)
         free_kvm_area();
 }

-static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
+static bool emulation_required(struct kvm_vcpu *vcpu)
 {
-        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-        struct kvm_segment tmp = *save;
+        return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}

-        if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
-                tmp.base = vmcs_readl(sf->base);
-                tmp.selector = vmcs_read16(sf->selector);
-                tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
-                tmp.s = 1;
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+                struct kvm_segment *save)
+{
+        if (!emulate_invalid_guest_state) {
+                /*
+                 * CS and SS RPL should be equal during guest entry according
+                 * to VMX spec, but in reality it is not always so. Since vcpu
+                 * is in the middle of the transition from real mode to
+                 * protected mode it is safe to assume that RPL 0 is a good
+                 * default value.
+                 */
+                if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+                        save->selector &= ~SELECTOR_RPL_MASK;
+                save->dpl = save->selector & SELECTOR_RPL_MASK;
+                save->s = 1;
         }
-        vmx_set_segment(vcpu, &tmp, seg);
+        vmx_set_segment(vcpu, save, seg);
 }

 static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2777,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
         unsigned long flags;
         struct vcpu_vmx *vmx = to_vmx(vcpu);

-        vmx->emulation_required = 1;
+        /*
+         * Update real mode segment cache. It may be not up-to-date if sement
+         * register was written while vcpu was in a guest mode.
+         */
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
         vmx->rmode.vm86_active = 0;

         vmx_segment_cache_clear(vmx);
@@ -2794,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu)

         update_exception_bitmap(vcpu);

-        if (emulate_invalid_guest_state)
-                return;
-
-        fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
-        fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
-        fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-        fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-
-        vmx_segment_cache_clear(vmx);
+        fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+        fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+        fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+        fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+        fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+        fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);

-        vmcs_write16(GUEST_SS_SELECTOR, 0);
-        vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
-
-        vmcs_write16(GUEST_CS_SELECTOR,
-                     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
-        vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
+        /* CPL is always 0 when CPU enters protected mode */
+        __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+        vmx->cpl = 0;
 }

 static gva_t rmode_tss_base(struct kvm *kvm)
@@ -2831,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
 {
         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+        struct kvm_segment var = *save;
+
+        var.dpl = 0x3;
+        if (seg == VCPU_SREG_CS)
+                var.type = 0x3;
+
+        if (!emulate_invalid_guest_state) {
+                var.selector = var.base >> 4;
+                var.base = var.base & 0xffff0;
+                var.limit = 0xffff;
+                var.g = 0;
+                var.db = 0;
+                var.present = 1;
+                var.s = 1;
+                var.l = 0;
+                var.unusable = 0;
+                var.type = 0x3;
+                var.avl = 0;
+                if (save->base & 0xf)
+                        printk_once(KERN_WARNING "kvm: segment base is not "
+                                        "paragraph aligned when entering "
+                                        "protected mode (seg=%d)", seg);
+        }

-        vmcs_write16(sf->selector, save->base >> 4);
-        vmcs_write32(sf->base, save->base & 0xffff0);
-        vmcs_write32(sf->limit, 0xffff);
-        vmcs_write32(sf->ar_bytes, 0xf3);
-        if (save->base & 0xf)
-                printk_once(KERN_WARNING "kvm: segment base is not paragraph"
-                            " aligned when entering protected mode (seg=%d)",
-                            seg);
+        vmcs_write16(sf->selector, var.selector);
+        vmcs_write32(sf->base, var.base);
+        vmcs_write32(sf->limit, var.limit);
+        vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
 }

 static void enter_rmode(struct kvm_vcpu *vcpu)
 {
         unsigned long flags;
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-        struct kvm_segment var;
-
-        if (enable_unrestricted_guest)
-                return;

         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+        vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

-        vmx->emulation_required = 1;
         vmx->rmode.vm86_active = 1;

-
         /*
          * Very old userspace does not call KVM_SET_TSS_ADDR before entering
          * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2888,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
         update_exception_bitmap(vcpu);

-        if (emulate_invalid_guest_state)
-                goto continue_rmode;
-
-        vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
-        vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
-
-        vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
-        vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
-
-        vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
-        vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
-
-        vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
-        vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
-
-        vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
+        fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+        fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+        fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+        fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+        fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+        fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2907 vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
2908
2909 vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
2910 vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
2911
2912continue_rmode:
2913 kvm_mmu_reset_context(vcpu); 2977 kvm_mmu_reset_context(vcpu);
2914} 2978}
2915 2979
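
A minimal standalone sketch of the transformation fix_rmode_seg() performs in the hunk above -- selector taken from base >> 4, paragraph-aligned base, limit forced to 0xffff, and DPL-3 read/write data access rights; the struct layout and helper name below are invented for illustration and the VMCS writes are left out:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* illustrative subset of the kvm_segment fields the fixup touches */
    struct seg {
        uint64_t base;
        uint32_t limit;
        uint16_t selector;
        uint8_t  type, dpl, s, present;
    };

    /* what fix_rmode_seg() does when invalid-state emulation is off */
    static void fix_rmode_seg_sketch(struct seg *v)
    {
        v->selector = v->base >> 4;   /* selector derived from the base  */
        v->base    &= 0xffff0;        /* paragraph-aligned base          */
        v->limit    = 0xffff;         /* 64 KiB real-mode limit          */
        v->type     = 0x3;            /* read/write data, accessed       */
        v->dpl      = 3;
        v->s        = 1;
        v->present  = 1;
    }

    int main(void)
    {
        struct seg cs = { .base = 0xf0000, .limit = 0xffffffff };

        fix_rmode_seg_sketch(&cs);
        /* the vm86 invariant later checked by rmode_segment_valid() */
        assert(cs.base == (uint64_t)cs.selector << 4);
        printf("sel=%#x base=%#llx limit=%#x\n",
               cs.selector, (unsigned long long)cs.base, cs.limit);
        return 0;
    }
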
@@ -3068,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3068 struct vcpu_vmx *vmx = to_vmx(vcpu); 3132 struct vcpu_vmx *vmx = to_vmx(vcpu);
3069 unsigned long hw_cr0; 3133 unsigned long hw_cr0;
3070 3134
3135 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3071 if (enable_unrestricted_guest) 3136 if (enable_unrestricted_guest)
3072 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) 3137 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3073 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3138 else {
3074 else 3139 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3075 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
3076 3140
3077 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3141 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3078 enter_pmode(vcpu); 3142 enter_pmode(vcpu);
3079 3143
3080 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3144 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3081 enter_rmode(vcpu); 3145 enter_rmode(vcpu);
3146 }
3082 3147
3083#ifdef CONFIG_X86_64 3148#ifdef CONFIG_X86_64
3084 if (vcpu->arch.efer & EFER_LME) { 3149 if (vcpu->arch.efer & EFER_LME) {
@@ -3098,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3098 vmcs_writel(CR0_READ_SHADOW, cr0); 3163 vmcs_writel(CR0_READ_SHADOW, cr0);
3099 vmcs_writel(GUEST_CR0, hw_cr0); 3164 vmcs_writel(GUEST_CR0, hw_cr0);
3100 vcpu->arch.cr0 = cr0; 3165 vcpu->arch.cr0 = cr0;
3101 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3166
 3167 /* depends on vcpu->arch.cr0 being set to its new value */
3168 vmx->emulation_required = emulation_required(vcpu);
3102} 3169}
3103 3170
3104static u64 construct_eptp(unsigned long root_hpa) 3171static u64 construct_eptp(unsigned long root_hpa)
@@ -3155,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3155 if (!is_paging(vcpu)) { 3222 if (!is_paging(vcpu)) {
3156 hw_cr4 &= ~X86_CR4_PAE; 3223 hw_cr4 &= ~X86_CR4_PAE;
3157 hw_cr4 |= X86_CR4_PSE; 3224 hw_cr4 |= X86_CR4_PSE;
3225 /*
3226 * SMEP is disabled if CPU is in non-paging mode in
3227 * hardware. However KVM always uses paging mode to
3228 * emulate guest non-paging mode with TDP.
3229 * To emulate this behavior, SMEP needs to be manually
3230 * disabled when guest switches to non-paging mode.
3231 */
3232 hw_cr4 &= ~X86_CR4_SMEP;
3158 } else if (!(cr4 & X86_CR4_PAE)) { 3233 } else if (!(cr4 & X86_CR4_PAE)) {
3159 hw_cr4 &= ~X86_CR4_PAE; 3234 hw_cr4 &= ~X86_CR4_PAE;
3160 } 3235 }
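
The comment in the hunk above explains why SMEP must be masked out of the CR4 value loaded into hardware while an EPT guest runs with paging off. Below is a minimal sketch of that fixup using the architectural CR4 bit positions; the KVM-internal always-on masks are deliberately left out:

    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR4_PSE  (1UL << 4)
    #define X86_CR4_PAE  (1UL << 5)
    #define X86_CR4_SMEP (1UL << 20)

    /* sketch of the non-paging adjustments made in vmx_set_cr4() */
    static unsigned long hw_cr4_for_unpaged_guest(unsigned long guest_cr4)
    {
        unsigned long hw_cr4 = guest_cr4;

        hw_cr4 &= ~X86_CR4_PAE;   /* hardware runs KVM's identity map ...   */
        hw_cr4 |=  X86_CR4_PSE;   /* ... which uses 32-bit PSE paging       */
        hw_cr4 &= ~X86_CR4_SMEP;  /* SMEP is undefined without guest paging */
        return hw_cr4;
    }

    int main(void)
    {
        unsigned long guest_cr4 = X86_CR4_PAE | X86_CR4_SMEP;

        printf("guest cr4=%#lx -> hw cr4=%#lx\n",
               guest_cr4, hw_cr4_for_unpaged_guest(guest_cr4));
        return 0;
    }
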
@@ -3171,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3171 struct vcpu_vmx *vmx = to_vmx(vcpu); 3246 struct vcpu_vmx *vmx = to_vmx(vcpu);
3172 u32 ar; 3247 u32 ar;
3173 3248
3174 if (vmx->rmode.vm86_active 3249 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3175 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
3176 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
3177 || seg == VCPU_SREG_GS)) {
3178 *var = vmx->rmode.segs[seg]; 3250 *var = vmx->rmode.segs[seg];
3179 if (seg == VCPU_SREG_TR 3251 if (seg == VCPU_SREG_TR
3180 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3252 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
@@ -3187,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
3187 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3259 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3188 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3260 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3189 ar = vmx_read_guest_seg_ar(vmx, seg); 3261 ar = vmx_read_guest_seg_ar(vmx, seg);
3190 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
3191 ar = 0;
3192 var->type = ar & 15; 3262 var->type = ar & 15;
3193 var->s = (ar >> 4) & 1; 3263 var->s = (ar >> 4) & 1;
3194 var->dpl = (ar >> 5) & 3; 3264 var->dpl = (ar >> 5) & 3;
@@ -3211,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3211 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3281 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3212} 3282}
3213 3283
3214static int __vmx_get_cpl(struct kvm_vcpu *vcpu) 3284static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3215{ 3285{
3286 struct vcpu_vmx *vmx = to_vmx(vcpu);
3287
3216 if (!is_protmode(vcpu)) 3288 if (!is_protmode(vcpu))
3217 return 0; 3289 return 0;
3218 3290
@@ -3220,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
3220 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ 3292 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
3221 return 3; 3293 return 3;
3222 3294
3223 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
3224}
3225
3226static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3227{
3228 struct vcpu_vmx *vmx = to_vmx(vcpu);
3229
3230 /*
3231 * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
3232 * fail; use the cache instead.
3233 */
3234 if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
3235 return vmx->cpl;
3236 }
3237
3238 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { 3295 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
3239 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); 3296 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3240 vmx->cpl = __vmx_get_cpl(vcpu); 3297 vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3;
3241 } 3298 }
3242 3299
3243 return vmx->cpl; 3300 return vmx->cpl;
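
With the emulation_required special case removed, vmx_get_cpl() reduces to one rule: CPL 0 outside protected mode, CPL 3 in virtual-8086 mode, otherwise the RPL bits of the cached CS selector. A tiny standalone sketch of that rule (parameter names are illustrative only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* sketch of the CPL rule cached by vmx_get_cpl() */
    static int cpl_sketch(bool protmode, bool vm86, uint16_t cs_selector)
    {
        if (!protmode)
            return 0;               /* real mode: always CPL 0        */
        if (vm86)
            return 3;               /* virtual-8086: always CPL 3     */
        return cs_selector & 3;     /* otherwise RPL of CS selector   */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               cpl_sketch(false, false, 0x0000),   /* 0 */
               cpl_sketch(true,  true,  0xf000),   /* 3 */
               cpl_sketch(true,  false, 0x001b));  /* 3 */
        return 0;
    }
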
@@ -3269,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3269{ 3326{
3270 struct vcpu_vmx *vmx = to_vmx(vcpu); 3327 struct vcpu_vmx *vmx = to_vmx(vcpu);
3271 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3328 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3272 u32 ar;
3273 3329
3274 vmx_segment_cache_clear(vmx); 3330 vmx_segment_cache_clear(vmx);
3331 if (seg == VCPU_SREG_CS)
3332 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3275 3333
3276 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 3334 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3277 vmcs_write16(sf->selector, var->selector); 3335 vmx->rmode.segs[seg] = *var;
3278 vmx->rmode.segs[VCPU_SREG_TR] = *var; 3336 if (seg == VCPU_SREG_TR)
3279 return; 3337 vmcs_write16(sf->selector, var->selector);
3338 else if (var->s)
3339 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3340 goto out;
3280 } 3341 }
3342
3281 vmcs_writel(sf->base, var->base); 3343 vmcs_writel(sf->base, var->base);
3282 vmcs_write32(sf->limit, var->limit); 3344 vmcs_write32(sf->limit, var->limit);
3283 vmcs_write16(sf->selector, var->selector); 3345 vmcs_write16(sf->selector, var->selector);
3284 if (vmx->rmode.vm86_active && var->s) {
3285 vmx->rmode.segs[seg] = *var;
3286 /*
3287 * Hack real-mode segments into vm86 compatibility.
3288 */
3289 if (var->base == 0xffff0000 && var->selector == 0xf000)
3290 vmcs_writel(sf->base, 0xf0000);
3291 ar = 0xf3;
3292 } else
3293 ar = vmx_segment_access_rights(var);
3294 3346
3295 /* 3347 /*
3296 * Fix the "Accessed" bit in AR field of segment registers for older 3348 * Fix the "Accessed" bit in AR field of segment registers for older
@@ -3304,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
3304 * kvm hack. 3356 * kvm hack.
3305 */ 3357 */
3306 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3358 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3307 ar |= 0x1; /* Accessed */ 3359 var->type |= 0x1; /* Accessed */
3308 3360
3309 vmcs_write32(sf->ar_bytes, ar); 3361 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3310 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
3311 3362
3312 /* 3363out:
3313 * Fix segments for real mode guest in hosts that don't have 3364 vmx->emulation_required |= emulation_required(vcpu);
3314 * "unrestricted_mode" or it was disabled.
3315 * This is done to allow migration of the guests from hosts with
3316 * unrestricted guest like Westmere to older host that don't have
3317 * unrestricted guest like Nehelem.
3318 */
3319 if (vmx->rmode.vm86_active) {
3320 switch (seg) {
3321 case VCPU_SREG_CS:
3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
3323 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
3324 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
3325 vmcs_writel(GUEST_CS_BASE, 0xf0000);
3326 vmcs_write16(GUEST_CS_SELECTOR,
3327 vmcs_readl(GUEST_CS_BASE) >> 4);
3328 break;
3329 case VCPU_SREG_ES:
3330 case VCPU_SREG_DS:
3331 case VCPU_SREG_GS:
3332 case VCPU_SREG_FS:
3333 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3334 break;
3335 case VCPU_SREG_SS:
3336 vmcs_write16(GUEST_SS_SELECTOR,
3337 vmcs_readl(GUEST_SS_BASE) >> 4);
3338 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
3339 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
3340 break;
3341 }
3342 }
3343} 3365}
3344 3366
3345static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3367static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3380,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3380 u32 ar; 3402 u32 ar;
3381 3403
3382 vmx_get_segment(vcpu, &var, seg); 3404 vmx_get_segment(vcpu, &var, seg);
3405 var.dpl = 0x3;
3406 if (seg == VCPU_SREG_CS)
3407 var.type = 0x3;
3383 ar = vmx_segment_access_rights(&var); 3408 ar = vmx_segment_access_rights(&var);
3384 3409
3385 if (var.base != (var.selector << 4)) 3410 if (var.base != (var.selector << 4))
3386 return false; 3411 return false;
3387 if (var.limit < 0xffff) 3412 if (var.limit != 0xffff)
3388 return false; 3413 return false;
3389 if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) 3414 if (ar != 0xf3)
3390 return false; 3415 return false;
3391 3416
3392 return true; 3417 return true;
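
rmode_segment_valid() now normalizes DPL (and, for CS, the type) before comparing, so a cached segment passes exactly when base == selector << 4, limit == 0xffff and the packed access rights equal 0xf3. A standalone sketch of the predicate; the struct is an invented subset of kvm_segment:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct seg {
        uint64_t base;
        uint32_t limit;
        uint16_t selector;
        uint32_t ar;        /* packed access rights */
    };

    /* sketch of the vm86-compatibility test in rmode_segment_valid() */
    static bool rmode_segment_valid_sketch(const struct seg *s)
    {
        return s->base == (uint64_t)s->selector << 4 &&
               s->limit == 0xffff &&
               s->ar == 0xf3;
    }

    int main(void)
    {
        struct seg ok  = { 0xf0000, 0xffff, 0xf000, 0xf3 };
        struct seg bad = { 0xf0000, 0xffff, 0x1234, 0xf3 };

        printf("%d %d\n", rmode_segment_valid_sketch(&ok),
                          rmode_segment_valid_sketch(&bad));
        return 0;
    }
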
@@ -3521,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3521 */ 3546 */
3522static bool guest_state_valid(struct kvm_vcpu *vcpu) 3547static bool guest_state_valid(struct kvm_vcpu *vcpu)
3523{ 3548{
3549 if (enable_unrestricted_guest)
3550 return true;
3551
3524 /* real mode guest state checks */ 3552 /* real mode guest state checks */
3525 if (!is_protmode(vcpu)) { 3553 if (!is_protmode(vcpu)) {
3526 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3554 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3644,12 +3672,9 @@ static void seg_setup(int seg)
3644 vmcs_write16(sf->selector, 0); 3672 vmcs_write16(sf->selector, 0);
3645 vmcs_writel(sf->base, 0); 3673 vmcs_writel(sf->base, 0);
3646 vmcs_write32(sf->limit, 0xffff); 3674 vmcs_write32(sf->limit, 0xffff);
3647 if (enable_unrestricted_guest) { 3675 ar = 0x93;
3648 ar = 0x93; 3676 if (seg == VCPU_SREG_CS)
3649 if (seg == VCPU_SREG_CS) 3677 ar |= 0x08; /* code segment */
3650 ar |= 0x08; /* code segment */
3651 } else
3652 ar = 0xf3;
3653 3678
3654 vmcs_write32(sf->ar_bytes, ar); 3679 vmcs_write32(sf->ar_bytes, ar);
3655} 3680}
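
seg_setup() now always programs access rights of 0x93, plus 0x08 for CS, instead of the old vm86 value 0xf3. Decoded with the same field layout vmx_get_segment() uses above (type in bits 3:0, S in bit 4, DPL in bits 6:5, present in bit 7), 0x93 is a present, accessed, read/write DPL-0 data segment and 0x9b the matching execute/read code segment. A small decoder as illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* decode the low byte of a VMX guest segment AR field */
    static void decode_ar(uint32_t ar)
    {
        printf("ar=%#04x type=%u s=%u dpl=%u present=%u\n",
               ar,
               ar & 0xf,          /* segment type           */
               (ar >> 4) & 1,     /* code/data (vs. system) */
               (ar >> 5) & 3,     /* descriptor privilege   */
               (ar >> 7) & 1);    /* present                */
    }

    int main(void)
    {
        decode_ar(0x93);   /* data, read/write, accessed, DPL 0   */
        decode_ar(0x9b);   /* code, execute/read, accessed, DPL 0 */
        decode_ar(0xf3);   /* old vm86 value: data, DPL 3         */
        return 0;
    }
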
@@ -3667,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
3667 kvm_userspace_mem.flags = 0; 3692 kvm_userspace_mem.flags = 0;
3668 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; 3693 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
3669 kvm_userspace_mem.memory_size = PAGE_SIZE; 3694 kvm_userspace_mem.memory_size = PAGE_SIZE;
3670 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3695 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3671 if (r) 3696 if (r)
3672 goto out; 3697 goto out;
3673 3698
@@ -3697,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
3697 kvm_userspace_mem.guest_phys_addr = 3722 kvm_userspace_mem.guest_phys_addr =
3698 kvm->arch.ept_identity_map_addr; 3723 kvm->arch.ept_identity_map_addr;
3699 kvm_userspace_mem.memory_size = PAGE_SIZE; 3724 kvm_userspace_mem.memory_size = PAGE_SIZE;
3700 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 3725 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
3701 if (r) 3726 if (r)
3702 goto out; 3727 goto out;
3703 3728
@@ -3739,7 +3764,10 @@ static void free_vpid(struct vcpu_vmx *vmx)
3739 spin_unlock(&vmx_vpid_lock); 3764 spin_unlock(&vmx_vpid_lock);
3740} 3765}
3741 3766
3742static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 3767#define MSR_TYPE_R 1
3768#define MSR_TYPE_W 2
3769static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3770 u32 msr, int type)
3743{ 3771{
3744 int f = sizeof(unsigned long); 3772 int f = sizeof(unsigned long);
3745 3773
@@ -3752,20 +3780,93 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
3752 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 3780 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3753 */ 3781 */
3754 if (msr <= 0x1fff) { 3782 if (msr <= 0x1fff) {
3755 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ 3783 if (type & MSR_TYPE_R)
3756 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ 3784 /* read-low */
3785 __clear_bit(msr, msr_bitmap + 0x000 / f);
3786
3787 if (type & MSR_TYPE_W)
3788 /* write-low */
3789 __clear_bit(msr, msr_bitmap + 0x800 / f);
3790
3757 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3791 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3758 msr &= 0x1fff; 3792 msr &= 0x1fff;
3759 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ 3793 if (type & MSR_TYPE_R)
3760 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ 3794 /* read-high */
3795 __clear_bit(msr, msr_bitmap + 0x400 / f);
3796
3797 if (type & MSR_TYPE_W)
3798 /* write-high */
3799 __clear_bit(msr, msr_bitmap + 0xc00 / f);
3800
3801 }
3802}
3803
3804static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3805 u32 msr, int type)
3806{
3807 int f = sizeof(unsigned long);
3808
3809 if (!cpu_has_vmx_msr_bitmap())
3810 return;
3811
3812 /*
3813 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3814 * have the write-low and read-high bitmap offsets the wrong way round.
3815 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3816 */
3817 if (msr <= 0x1fff) {
3818 if (type & MSR_TYPE_R)
3819 /* read-low */
3820 __set_bit(msr, msr_bitmap + 0x000 / f);
3821
3822 if (type & MSR_TYPE_W)
3823 /* write-low */
3824 __set_bit(msr, msr_bitmap + 0x800 / f);
3825
3826 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3827 msr &= 0x1fff;
3828 if (type & MSR_TYPE_R)
3829 /* read-high */
3830 __set_bit(msr, msr_bitmap + 0x400 / f);
3831
3832 if (type & MSR_TYPE_W)
3833 /* write-high */
3834 __set_bit(msr, msr_bitmap + 0xc00 / f);
3835
3761 } 3836 }
3762} 3837}
3763 3838
3764static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 3839static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3765{ 3840{
3766 if (!longmode_only) 3841 if (!longmode_only)
3767 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); 3842 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
3768 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); 3843 msr, MSR_TYPE_R | MSR_TYPE_W);
3844 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
3845 msr, MSR_TYPE_R | MSR_TYPE_W);
3846}
3847
3848static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
3849{
3850 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3851 msr, MSR_TYPE_R);
3852 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3853 msr, MSR_TYPE_R);
3854}
3855
3856static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
3857{
3858 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3859 msr, MSR_TYPE_R);
3860 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3861 msr, MSR_TYPE_R);
3862}
3863
3864static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
3865{
3866 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3867 msr, MSR_TYPE_W);
3868 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3869 msr, MSR_TYPE_W);
3769} 3870}
3770 3871
3771/* 3872/*
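
The helpers above toggle single bits in the 4 KiB MSR bitmap. Following the layout described in their comment, MSRs 0x0-0x1fff use the read bitmap at offset 0x000 and the write bitmap at 0x800, while 0xc0000000-0xc0001fff use 0x400 and 0xc00 with the MSR number masked to 13 bits. A userspace sketch of that addressing; the kernel works on unsigned longs via __set_bit/__clear_bit, plain byte/bit arithmetic is used here only for clarity:

    #include <stdint.h>
    #include <stdio.h>

    /* return the base offset of the relevant bitmap region and the bit
     * index within it, or -1 if the MSR is outside the two ranges */
    static int msr_bitmap_region(uint32_t msr, int is_write, uint32_t *bit)
    {
        if (msr <= 0x1fff) {
            *bit = msr;
            return is_write ? 0x800 : 0x000;   /* write-low / read-low   */
        }
        if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
            *bit = msr & 0x1fff;
            return is_write ? 0xc00 : 0x400;   /* write-high / read-high */
        }
        return -1;
    }

    int main(void)
    {
        uint32_t bit;
        int off = msr_bitmap_region(0x808 /* x2APIC TPR */, 1, &bit);

        printf("region=%#x byte=%#x bit=%u\n",
               (unsigned)off, (unsigned)off + bit / 8, bit % 8);
        return 0;
    }
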
@@ -3844,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3844 return exec_control; 3945 return exec_control;
3845} 3946}
3846 3947
3948static int vmx_vm_has_apicv(struct kvm *kvm)
3949{
3950 return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
3951}
3952
3847static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 3953static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3848{ 3954{
3849 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 3955 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3861,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
3861 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 3967 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
3862 if (!ple_gap) 3968 if (!ple_gap)
3863 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 3969 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
3970 if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
3971 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
3972 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3973 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
3864 return exec_control; 3974 return exec_control;
3865} 3975}
3866 3976
@@ -3905,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3905 vmx_secondary_exec_control(vmx)); 4015 vmx_secondary_exec_control(vmx));
3906 } 4016 }
3907 4017
4018 if (enable_apicv_reg_vid) {
4019 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4020 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4021 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4022 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4023
4024 vmcs_write16(GUEST_INTR_STATUS, 0);
4025 }
4026
3908 if (ple_gap) { 4027 if (ple_gap) {
3909 vmcs_write32(PLE_GAP, ple_gap); 4028 vmcs_write32(PLE_GAP, ple_gap);
3910 vmcs_write32(PLE_WINDOW, ple_window); 4029 vmcs_write32(PLE_WINDOW, ple_window);
@@ -3990,14 +4109,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3990 vmx_segment_cache_clear(vmx); 4109 vmx_segment_cache_clear(vmx);
3991 4110
3992 seg_setup(VCPU_SREG_CS); 4111 seg_setup(VCPU_SREG_CS);
3993 /* 4112 if (kvm_vcpu_is_bsp(&vmx->vcpu))
3994 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
3995 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
3996 */
3997 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
3998 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4113 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
3999 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 4114 else {
4000 } else {
4001 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 4115 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
4002 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 4116 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
4003 } 4117 }
@@ -4073,9 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4073 4187
4074 ret = 0; 4188 ret = 0;
4075 4189
4076 /* HACK: Don't enable emulation on guest boot/reset */
4077 vmx->emulation_required = 0;
4078
4079 return ret; 4190 return ret;
4080} 4191}
4081 4192
@@ -4251,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4251 .flags = 0, 4362 .flags = 0,
4252 }; 4363 };
4253 4364
4254 ret = kvm_set_memory_region(kvm, &tss_mem, 0); 4365 ret = kvm_set_memory_region(kvm, &tss_mem, false);
4255 if (ret) 4366 if (ret)
4256 return ret; 4367 return ret;
4257 kvm->arch.tss_addr = addr; 4368 kvm->arch.tss_addr = addr;
@@ -4261,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4261 return 0; 4372 return 0;
4262} 4373}
4263 4374
4264static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4375static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4265 int vec, u32 err_code)
4266{ 4376{
4267 /*
4268 * Instruction with address size override prefix opcode 0x67
4269 * Cause the #SS fault with 0 error code in VM86 mode.
4270 */
4271 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
4272 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
4273 return 1;
4274 /*
4275 * Forward all other exceptions that are valid in real mode.
4276 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4277 * the required debugging infrastructure rework.
4278 */
4279 switch (vec) { 4377 switch (vec) {
4280 case DB_VECTOR:
4281 if (vcpu->guest_debug &
4282 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4283 return 0;
4284 kvm_queue_exception(vcpu, vec);
4285 return 1;
4286 case BP_VECTOR: 4378 case BP_VECTOR:
4287 /* 4379 /*
4288 * Update instruction length as we may reinject the exception 4380 * Update instruction length as we may reinject the exception
@@ -4291,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4291 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4383 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4384 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4293 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4385 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
4294 return 0; 4386 return false;
4387 /* fall through */
4388 case DB_VECTOR:
4389 if (vcpu->guest_debug &
4390 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4391 return false;
4295 /* fall through */ 4392 /* fall through */
4296 case DE_VECTOR: 4393 case DE_VECTOR:
4297 case OF_VECTOR: 4394 case OF_VECTOR:
@@ -4301,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4301 case SS_VECTOR: 4398 case SS_VECTOR:
4302 case GP_VECTOR: 4399 case GP_VECTOR:
4303 case MF_VECTOR: 4400 case MF_VECTOR:
4304 kvm_queue_exception(vcpu, vec); 4401 return true;
4305 return 1; 4402 break;
4306 } 4403 }
4307 return 0; 4404 return false;
4405}
4406
4407static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4408 int vec, u32 err_code)
4409{
4410 /*
 4411 * Instructions with the address size override prefix (opcode 0x67)
 4412 * cause a #SS fault with error code 0 in VM86 mode.
4413 */
4414 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
4415 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
4416 if (vcpu->arch.halt_request) {
4417 vcpu->arch.halt_request = 0;
4418 return kvm_emulate_halt(vcpu);
4419 }
4420 return 1;
4421 }
4422 return 0;
4423 }
4424
4425 /*
4426 * Forward all other exceptions that are valid in real mode.
4427 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4428 * the required debugging infrastructure rework.
4429 */
4430 kvm_queue_exception(vcpu, vec);
4431 return 1;
4308} 4432}
4309 4433
4310/* 4434/*
@@ -4392,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4392 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 4516 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
4393 } 4517 }
4394 4518
4395 if (vmx->rmode.vm86_active &&
4396 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
4397 error_code)) {
4398 if (vcpu->arch.halt_request) {
4399 vcpu->arch.halt_request = 0;
4400 return kvm_emulate_halt(vcpu);
4401 }
4402 return 1;
4403 }
4404
4405 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4519 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
4520
4521 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4522 return handle_rmode_exception(vcpu, ex_no, error_code);
4523
4406 switch (ex_no) { 4524 switch (ex_no) {
4407 case DB_VECTOR: 4525 case DB_VECTOR:
4408 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4526 dr6 = vmcs_readl(EXIT_QUALIFICATION);
@@ -4820,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
4820 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 4938 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
4821} 4939}
4822 4940
4941static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
4942{
4943 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4944 int vector = exit_qualification & 0xff;
4945
4946 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
4947 kvm_apic_set_eoi_accelerated(vcpu, vector);
4948 return 1;
4949}
4950
4951static int handle_apic_write(struct kvm_vcpu *vcpu)
4952{
4953 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4954 u32 offset = exit_qualification & 0xfff;
4955
4956 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
4957 kvm_apic_write_nodecode(vcpu, offset);
4958 return 1;
4959}
4960
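
Both new handlers only decode the exit qualification: for an EOI-induced exit the low byte holds the vector that was EOI'd, and for an APIC-write exit the low 12 bits hold the APIC-page offset that was written. A trivial sketch of the two decodes with made-up example values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t eoi_qual  = 0x31;    /* example EOI-induced qualification */
        uint64_t apic_qual = 0x380;   /* example APIC-write qualification  */

        printf("EOI'd vector      = %#llx\n",
               (unsigned long long)(eoi_qual & 0xff));
        printf("APIC write offset = %#llx\n",
               (unsigned long long)(apic_qual & 0xfff));
        return 0;
    }
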
4823static int handle_task_switch(struct kvm_vcpu *vcpu) 4961static int handle_task_switch(struct kvm_vcpu *vcpu)
4824{ 4962{
4825 struct vcpu_vmx *vmx = to_vmx(vcpu); 4963 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5065,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5065 schedule(); 5203 schedule();
5066 } 5204 }
5067 5205
5068 vmx->emulation_required = !guest_state_valid(vcpu); 5206 vmx->emulation_required = emulation_required(vcpu);
5069out: 5207out:
5070 return ret; 5208 return ret;
5071} 5209}
@@ -5754,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
5754 [EXIT_REASON_VMON] = handle_vmon, 5892 [EXIT_REASON_VMON] = handle_vmon,
5755 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5893 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
5756 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5894 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
5895 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
5896 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5757 [EXIT_REASON_WBINVD] = handle_wbinvd, 5897 [EXIT_REASON_WBINVD] = handle_wbinvd,
5758 [EXIT_REASON_XSETBV] = handle_xsetbv, 5898 [EXIT_REASON_XSETBV] = handle_xsetbv,
5759 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5899 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
@@ -5780,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5780 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 5920 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
5781 gpa_t bitmap; 5921 gpa_t bitmap;
5782 5922
5783 if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) 5923 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5784 return 1; 5924 return 1;
5785 5925
5786 /* 5926 /*
@@ -6008,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6008 u32 vectoring_info = vmx->idt_vectoring_info; 6148 u32 vectoring_info = vmx->idt_vectoring_info;
6009 6149
6010 /* If guest state is invalid, start emulating */ 6150 /* If guest state is invalid, start emulating */
6011 if (vmx->emulation_required && emulate_invalid_guest_state) 6151 if (vmx->emulation_required)
6012 return handle_invalid_guest_state(vcpu); 6152 return handle_invalid_guest_state(vcpu);
6013 6153
6014 /* 6154 /*
@@ -6103,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6103 vmcs_write32(TPR_THRESHOLD, irr); 6243 vmcs_write32(TPR_THRESHOLD, irr);
6104} 6244}
6105 6245
6246static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
6247{
6248 u32 sec_exec_control;
6249
6250 /*
6251 * There is not point to enable virtualize x2apic without enable
6252 * apicv
6253 */
6254 if (!cpu_has_vmx_virtualize_x2apic_mode() ||
6255 !vmx_vm_has_apicv(vcpu->kvm))
6256 return;
6257
6258 if (!vm_need_tpr_shadow(vcpu->kvm))
6259 return;
6260
6261 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6262
6263 if (set) {
6264 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6265 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6266 } else {
6267 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6268 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6269 }
6270 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
6271
6272 vmx_set_msr_bitmap(vcpu);
6273}
6274
6275static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
6276{
6277 u16 status;
6278 u8 old;
6279
6280 if (!vmx_vm_has_apicv(kvm))
6281 return;
6282
6283 if (isr == -1)
6284 isr = 0;
6285
6286 status = vmcs_read16(GUEST_INTR_STATUS);
6287 old = status >> 8;
6288 if (isr != old) {
6289 status &= 0xff;
6290 status |= isr << 8;
6291 vmcs_write16(GUEST_INTR_STATUS, status);
6292 }
6293}
6294
6295static void vmx_set_rvi(int vector)
6296{
6297 u16 status;
6298 u8 old;
6299
6300 status = vmcs_read16(GUEST_INTR_STATUS);
6301 old = (u8)status & 0xff;
6302 if ((u8)vector != old) {
6303 status &= ~0xff;
6304 status |= (u8)vector;
6305 vmcs_write16(GUEST_INTR_STATUS, status);
6306 }
6307}
6308
6309static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6310{
6311 if (max_irr == -1)
6312 return;
6313
6314 vmx_set_rvi(max_irr);
6315}
6316
6317static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6318{
6319 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6320 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6321 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6322 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6323}
6324
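
vmx_set_rvi() and vmx_hwapic_isr_update() above maintain the two halves of the 16-bit guest interrupt status field: RVI in the low byte and SVI in the high byte. A standalone sketch of that packing on a plain uint16_t (helper names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* low byte: RVI, high byte: SVI -- mirrors GUEST_INTR_STATUS */
    static void set_rvi(uint16_t *status, uint8_t vector)
    {
        *status = (*status & ~0xff) | vector;
    }

    static void set_svi(uint16_t *status, uint8_t isr)
    {
        *status = (*status & 0xff) | ((uint16_t)isr << 8);
    }

    int main(void)
    {
        uint16_t status = 0;

        set_rvi(&status, 0x41);   /* highest pending vector in the IRR */
        set_svi(&status, 0x30);   /* highest in-service vector         */
        printf("GUEST_INTR_STATUS = %#06x\n", status);
        return 0;
    }
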
6106static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 6325static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
6107{ 6326{
6108 u32 exit_intr_info; 6327 u32 exit_intr_info;
@@ -6291,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
6291 6510
6292 /* Don't enter VMX if guest state is invalid, let the exit handler 6511 /* Don't enter VMX if guest state is invalid, let the exit handler
6293 start emulation until we arrive back to a valid state */ 6512 start emulation until we arrive back to a valid state */
6294 if (vmx->emulation_required && emulate_invalid_guest_state) 6513 if (vmx->emulation_required)
6295 return; 6514 return;
6296 6515
6297 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6516 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -7366,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
7366 .enable_nmi_window = enable_nmi_window, 7585 .enable_nmi_window = enable_nmi_window,
7367 .enable_irq_window = enable_irq_window, 7586 .enable_irq_window = enable_irq_window,
7368 .update_cr8_intercept = update_cr8_intercept, 7587 .update_cr8_intercept = update_cr8_intercept,
7588 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
7589 .vm_has_apicv = vmx_vm_has_apicv,
7590 .load_eoi_exitmap = vmx_load_eoi_exitmap,
7591 .hwapic_irr_update = vmx_hwapic_irr_update,
7592 .hwapic_isr_update = vmx_hwapic_isr_update,
7369 7593
7370 .set_tss_addr = vmx_set_tss_addr, 7594 .set_tss_addr = vmx_set_tss_addr,
7371 .get_tdp_level = get_ept_level, 7595 .get_tdp_level = get_ept_level,
@@ -7398,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7398 7622
7399static int __init vmx_init(void) 7623static int __init vmx_init(void)
7400{ 7624{
7401 int r, i; 7625 int r, i, msr;
7402 7626
7403 rdmsrl_safe(MSR_EFER, &host_efer); 7627 rdmsrl_safe(MSR_EFER, &host_efer);
7404 7628
@@ -7419,11 +7643,19 @@ static int __init vmx_init(void)
7419 if (!vmx_msr_bitmap_legacy) 7643 if (!vmx_msr_bitmap_legacy)
7420 goto out1; 7644 goto out1;
7421 7645
7646 vmx_msr_bitmap_legacy_x2apic =
7647 (unsigned long *)__get_free_page(GFP_KERNEL);
7648 if (!vmx_msr_bitmap_legacy_x2apic)
7649 goto out2;
7422 7650
7423 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 7651 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
7424 if (!vmx_msr_bitmap_longmode) 7652 if (!vmx_msr_bitmap_longmode)
7425 goto out2; 7653 goto out3;
7426 7654
7655 vmx_msr_bitmap_longmode_x2apic =
7656 (unsigned long *)__get_free_page(GFP_KERNEL);
7657 if (!vmx_msr_bitmap_longmode_x2apic)
7658 goto out4;
7427 7659
7428 /* 7660 /*
7429 * Allow direct access to the PC debug port (it is often used for I/O 7661 * Allow direct access to the PC debug port (it is often used for I/O
@@ -7455,6 +7687,28 @@ static int __init vmx_init(void)
7455 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); 7687 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
7456 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); 7688 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
7457 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); 7689 vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
7690 memcpy(vmx_msr_bitmap_legacy_x2apic,
7691 vmx_msr_bitmap_legacy, PAGE_SIZE);
7692 memcpy(vmx_msr_bitmap_longmode_x2apic,
7693 vmx_msr_bitmap_longmode, PAGE_SIZE);
7694
7695 if (enable_apicv_reg_vid) {
7696 for (msr = 0x800; msr <= 0x8ff; msr++)
7697 vmx_disable_intercept_msr_read_x2apic(msr);
7698
 7699 /* According to the SDM, in x2apic mode the whole id register is
 7700 * used, but KVM only uses the highest eight bits, so reads of it
 7701 * must stay intercepted. */
7702 vmx_enable_intercept_msr_read_x2apic(0x802);
7703 /* TMCCT */
7704 vmx_enable_intercept_msr_read_x2apic(0x839);
7705 /* TPR */
7706 vmx_disable_intercept_msr_write_x2apic(0x808);
7707 /* EOI */
7708 vmx_disable_intercept_msr_write_x2apic(0x80b);
7709 /* SELF-IPI */
7710 vmx_disable_intercept_msr_write_x2apic(0x83f);
7711 }
7458 7712
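
The hard-coded MSR numbers above come from the architectural x2APIC mapping, in which the xAPIC MMIO register at offset off becomes MSR 0x800 + (off >> 4); that is how 0x802 (ID), 0x808 (TPR), 0x80b (EOI), 0x839 (TMCCT) and 0x83f (SELF-IPI) are derived. A one-line sketch of the mapping:

    #include <stdint.h>
    #include <stdio.h>

    /* architectural xAPIC MMIO offset -> x2APIC MSR index */
    static uint32_t x2apic_msr(uint32_t mmio_offset)
    {
        return 0x800 + (mmio_offset >> 4);
    }

    int main(void)
    {
        printf("ID       %#x\n", x2apic_msr(0x020));   /* 0x802 */
        printf("TPR      %#x\n", x2apic_msr(0x080));   /* 0x808 */
        printf("EOI      %#x\n", x2apic_msr(0x0b0));   /* 0x80b */
        printf("TMCCT    %#x\n", x2apic_msr(0x390));   /* 0x839 */
        printf("SELF-IPI %#x\n", x2apic_msr(0x3f0));   /* 0x83f */
        return 0;
    }
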
7459 if (enable_ept) { 7713 if (enable_ept) {
7460 kvm_mmu_set_mask_ptes(0ull, 7714 kvm_mmu_set_mask_ptes(0ull,
@@ -7468,8 +7722,10 @@ static int __init vmx_init(void)
7468 7722
7469 return 0; 7723 return 0;
7470 7724
7471out3: 7725out4:
7472 free_page((unsigned long)vmx_msr_bitmap_longmode); 7726 free_page((unsigned long)vmx_msr_bitmap_longmode);
7727out3:
7728 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7473out2: 7729out2:
7474 free_page((unsigned long)vmx_msr_bitmap_legacy); 7730 free_page((unsigned long)vmx_msr_bitmap_legacy);
7475out1: 7731out1:
@@ -7481,6 +7737,8 @@ out:
7481 7737
7482static void __exit vmx_exit(void) 7738static void __exit vmx_exit(void)
7483{ 7739{
7740 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
7741 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
7484 free_page((unsigned long)vmx_msr_bitmap_legacy); 7742 free_page((unsigned long)vmx_msr_bitmap_legacy);
7485 free_page((unsigned long)vmx_msr_bitmap_longmode); 7743 free_page((unsigned long)vmx_msr_bitmap_longmode);
7486 free_page((unsigned long)vmx_io_bitmap_b); 7744 free_page((unsigned long)vmx_io_bitmap_b);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37040079cd6b..f71500af1f81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
872 872
873 kvm_x86_ops->set_efer(vcpu, efer); 873 kvm_x86_ops->set_efer(vcpu, efer);
874 874
875 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
876
877 /* Update reserved bits */ 875 /* Update reserved bits */
878 if ((efer ^ old_efer) & EFER_NX) 876 if ((efer ^ old_efer) & EFER_NX)
879 kvm_mmu_reset_context(vcpu); 877 kvm_mmu_reset_context(vcpu);
@@ -2522,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2522 r = KVM_MAX_VCPUS; 2520 r = KVM_MAX_VCPUS;
2523 break; 2521 break;
2524 case KVM_CAP_NR_MEMSLOTS: 2522 case KVM_CAP_NR_MEMSLOTS:
2525 r = KVM_MEMORY_SLOTS; 2523 r = KVM_USER_MEM_SLOTS;
2526 break; 2524 break;
2527 case KVM_CAP_PV_MMU: /* obsolete */ 2525 case KVM_CAP_PV_MMU: /* obsolete */
2528 r = 0; 2526 r = 0;
@@ -3274,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3274 return -EINVAL; 3272 return -EINVAL;
3275 3273
3276 mutex_lock(&kvm->slots_lock); 3274 mutex_lock(&kvm->slots_lock);
3277 spin_lock(&kvm->mmu_lock);
3278 3275
3279 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 3276 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3280 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 3277 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3281 3278
3282 spin_unlock(&kvm->mmu_lock);
3283 mutex_unlock(&kvm->slots_lock); 3279 mutex_unlock(&kvm->slots_lock);
3284 return 0; 3280 return 0;
3285} 3281}
@@ -3439,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3439 mutex_lock(&kvm->slots_lock); 3435 mutex_lock(&kvm->slots_lock);
3440 3436
3441 r = -EINVAL; 3437 r = -EINVAL;
3442 if (log->slot >= KVM_MEMORY_SLOTS) 3438 if (log->slot >= KVM_USER_MEM_SLOTS)
3443 goto out; 3439 goto out;
3444 3440
3445 memslot = id_to_memslot(kvm->memslots, log->slot); 3441 memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4495,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4495 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); 4491 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4496 *selector = var.selector; 4492 *selector = var.selector;
4497 4493
4498 if (var.unusable) 4494 if (var.unusable) {
4495 memset(desc, 0, sizeof(*desc));
4499 return false; 4496 return false;
4497 }
4500 4498
4501 if (var.g) 4499 if (var.g)
4502 var.limit >>= 12; 4500 var.limit >>= 12;
@@ -4757,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4757 return r; 4755 return r;
4758} 4756}
4759 4757
4760static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4758static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
4759 bool write_fault_to_shadow_pgtable)
4761{ 4760{
4762 gpa_t gpa; 4761 gpa_t gpa = cr2;
4763 pfn_t pfn; 4762 pfn_t pfn;
4764 4763
4765 if (tdp_enabled) 4764 if (!vcpu->arch.mmu.direct_map) {
4766 return false; 4765 /*
4767 4766 * Write permission should be allowed since only
4768 /* 4767 * write access need to be emulated.
4769 * if emulation was due to access to shadowed page table 4768 */
4770 * and it failed try to unshadow page and re-enter the 4769 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4771 * guest to let CPU execute the instruction.
4772 */
4773 if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4774 return true;
4775
4776 gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
4777 4770
4778 if (gpa == UNMAPPED_GVA) 4771 /*
4779 return true; /* let cpu generate fault */ 4772 * If the mapping is invalid in guest, let cpu retry
4773 * it to generate fault.
4774 */
4775 if (gpa == UNMAPPED_GVA)
4776 return true;
4777 }
4780 4778
4781 /* 4779 /*
4782 * Do not retry the unhandleable instruction if it faults on the 4780 * Do not retry the unhandleable instruction if it faults on the
@@ -4785,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4785 * instruction -> ... 4783 * instruction -> ...
4786 */ 4784 */
4787 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4785 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4788 if (!is_error_noslot_pfn(pfn)) { 4786
4789 kvm_release_pfn_clean(pfn); 4787 /*
 4788 * If the instruction failed on the error pfn, it cannot be fixed;
 4789 * report the error to userspace.
4790 */
4791 if (is_error_noslot_pfn(pfn))
4792 return false;
4793
4794 kvm_release_pfn_clean(pfn);
4795
4796 /* The instructions are well-emulated on direct mmu. */
4797 if (vcpu->arch.mmu.direct_map) {
4798 unsigned int indirect_shadow_pages;
4799
4800 spin_lock(&vcpu->kvm->mmu_lock);
4801 indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
4802 spin_unlock(&vcpu->kvm->mmu_lock);
4803
4804 if (indirect_shadow_pages)
4805 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4806
4790 return true; 4807 return true;
4791 } 4808 }
4792 4809
4793 return false; 4810 /*
4811 * if emulation was due to access to shadowed page table
4812 * and it failed try to unshadow page and re-enter the
4813 * guest to let CPU execute the instruction.
4814 */
4815 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4816
4817 /*
 4818 * If the access faults on its page table, it cannot be fixed by
 4819 * unprotecting the shadow page, so it should be reported to
 4820 * userspace.
4821 */
4822 return !write_fault_to_shadow_pgtable;
4794} 4823}
4795 4824
4796static bool retry_instruction(struct x86_emulate_ctxt *ctxt, 4825static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4832,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
4832 if (!vcpu->arch.mmu.direct_map) 4861 if (!vcpu->arch.mmu.direct_map)
4833 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); 4862 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
4834 4863
4835 kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4864 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
4836 4865
4837 return true; 4866 return true;
4838} 4867}
@@ -4849,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4849 int r; 4878 int r;
4850 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4879 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4851 bool writeback = true; 4880 bool writeback = true;
4881 bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
4852 4882
4883 /*
4884 * Clear write_fault_to_shadow_pgtable here to ensure it is
4885 * never reused.
4886 */
4887 vcpu->arch.write_fault_to_shadow_pgtable = false;
4853 kvm_clear_exception_queue(vcpu); 4888 kvm_clear_exception_queue(vcpu);
4854 4889
4855 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4890 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4868,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4868 if (r != EMULATION_OK) { 4903 if (r != EMULATION_OK) {
4869 if (emulation_type & EMULTYPE_TRAP_UD) 4904 if (emulation_type & EMULTYPE_TRAP_UD)
4870 return EMULATE_FAIL; 4905 return EMULATE_FAIL;
4871 if (reexecute_instruction(vcpu, cr2)) 4906 if (reexecute_instruction(vcpu, cr2,
4907 write_fault_to_spt))
4872 return EMULATE_DONE; 4908 return EMULATE_DONE;
4873 if (emulation_type & EMULTYPE_SKIP) 4909 if (emulation_type & EMULTYPE_SKIP)
4874 return EMULATE_FAIL; 4910 return EMULATE_FAIL;
@@ -4898,7 +4934,7 @@ restart:
4898 return EMULATE_DONE; 4934 return EMULATE_DONE;
4899 4935
4900 if (r == EMULATION_FAILED) { 4936 if (r == EMULATION_FAILED) {
4901 if (reexecute_instruction(vcpu, cr2)) 4937 if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
4902 return EMULATE_DONE; 4938 return EMULATE_DONE;
4903 4939
4904 return handle_emulation_failure(vcpu); 4940 return handle_emulation_failure(vcpu);
@@ -5541,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
5541 vcpu->arch.nmi_injected = true; 5577 vcpu->arch.nmi_injected = true;
5542 kvm_x86_ops->set_nmi(vcpu); 5578 kvm_x86_ops->set_nmi(vcpu);
5543 } 5579 }
5544 } else if (kvm_cpu_has_interrupt(vcpu)) { 5580 } else if (kvm_cpu_has_injectable_intr(vcpu)) {
5545 if (kvm_x86_ops->interrupt_allowed(vcpu)) { 5581 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5546 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), 5582 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5547 false); 5583 false);
@@ -5609,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
5609#endif 5645#endif
5610} 5646}
5611 5647
5648static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
5649{
5650 u64 eoi_exit_bitmap[4];
5651
5652 memset(eoi_exit_bitmap, 0, 32);
5653
5654 kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
5655 kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
5656}
5657
5612static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5658static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5613{ 5659{
5614 int r; 5660 int r;
@@ -5662,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5662 kvm_handle_pmu_event(vcpu); 5708 kvm_handle_pmu_event(vcpu);
5663 if (kvm_check_request(KVM_REQ_PMI, vcpu)) 5709 if (kvm_check_request(KVM_REQ_PMI, vcpu))
5664 kvm_deliver_pmi(vcpu); 5710 kvm_deliver_pmi(vcpu);
5711 if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
5712 update_eoi_exitmap(vcpu);
5665 } 5713 }
5666 5714
5667 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5715 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5670,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5670 /* enable NMI/IRQ window open exits if needed */ 5718 /* enable NMI/IRQ window open exits if needed */
5671 if (vcpu->arch.nmi_pending) 5719 if (vcpu->arch.nmi_pending)
5672 kvm_x86_ops->enable_nmi_window(vcpu); 5720 kvm_x86_ops->enable_nmi_window(vcpu);
5673 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5721 else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
5674 kvm_x86_ops->enable_irq_window(vcpu); 5722 kvm_x86_ops->enable_irq_window(vcpu);
5675 5723
5676 if (kvm_lapic_enabled(vcpu)) { 5724 if (kvm_lapic_enabled(vcpu)) {
5725 /*
5726 * Update architecture specific hints for APIC
5727 * virtual interrupt delivery.
5728 */
5729 if (kvm_x86_ops->hwapic_irr_update)
5730 kvm_x86_ops->hwapic_irr_update(vcpu,
5731 kvm_lapic_find_highest_irr(vcpu));
5677 update_cr8_intercept(vcpu); 5732 update_cr8_intercept(vcpu);
5678 kvm_lapic_sync_to_vapic(vcpu); 5733 kvm_lapic_sync_to_vapic(vcpu);
5679 } 5734 }
@@ -6853,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
6853 struct kvm_memory_slot *memslot, 6908 struct kvm_memory_slot *memslot,
6854 struct kvm_memory_slot old, 6909 struct kvm_memory_slot old,
6855 struct kvm_userspace_memory_region *mem, 6910 struct kvm_userspace_memory_region *mem,
6856 int user_alloc) 6911 bool user_alloc)
6857{ 6912{
6858 int npages = memslot->npages; 6913 int npages = memslot->npages;
6859 int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6860 6914
6861 /* Prevent internal slot pages from being moved by fork()/COW. */ 6915 /*
6862 if (memslot->id >= KVM_MEMORY_SLOTS) 6916 * Only private memory slots need to be mapped here since
6863 map_flags = MAP_SHARED | MAP_ANONYMOUS; 6917 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
6864
6865 /*To keep backward compatibility with older userspace,
6866 *x86 needs to handle !user_alloc case.
6867 */ 6918 */
6868 if (!user_alloc) { 6919 if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
6869 if (npages && !old.npages) { 6920 unsigned long userspace_addr;
6870 unsigned long userspace_addr;
6871 6921
6872 userspace_addr = vm_mmap(NULL, 0, 6922 /*
6873 npages * PAGE_SIZE, 6923 * MAP_SHARED to prevent internal slot pages from being moved
6874 PROT_READ | PROT_WRITE, 6924 * by fork()/COW.
6875 map_flags, 6925 */
6876 0); 6926 userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
6927 PROT_READ | PROT_WRITE,
6928 MAP_SHARED | MAP_ANONYMOUS, 0);
6877 6929
6878 if (IS_ERR((void *)userspace_addr)) 6930 if (IS_ERR((void *)userspace_addr))
6879 return PTR_ERR((void *)userspace_addr); 6931 return PTR_ERR((void *)userspace_addr);
6880 6932
6881 memslot->userspace_addr = userspace_addr; 6933 memslot->userspace_addr = userspace_addr;
6882 }
6883 } 6934 }
6884 6935
6885
6886 return 0; 6936 return 0;
6887} 6937}
6888 6938
6889void kvm_arch_commit_memory_region(struct kvm *kvm, 6939void kvm_arch_commit_memory_region(struct kvm *kvm,
6890 struct kvm_userspace_memory_region *mem, 6940 struct kvm_userspace_memory_region *mem,
6891 struct kvm_memory_slot old, 6941 struct kvm_memory_slot old,
6892 int user_alloc) 6942 bool user_alloc)
6893{ 6943{
6894 6944
6895 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; 6945 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
6896 6946
6897 if (!user_alloc && !old.user_alloc && old.npages && !npages) { 6947 if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
6898 int ret; 6948 int ret;
6899 6949
6900 ret = vm_munmap(old.userspace_addr, 6950 ret = vm_munmap(old.userspace_addr,
@@ -6908,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
6908 if (!kvm->arch.n_requested_mmu_pages) 6958 if (!kvm->arch.n_requested_mmu_pages)
6909 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 6959 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6910 6960
6911 spin_lock(&kvm->mmu_lock);
6912 if (nr_mmu_pages) 6961 if (nr_mmu_pages)
6913 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6962 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6914 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6963 /*
6915 spin_unlock(&kvm->mmu_lock); 6964 * Write protect all pages for dirty logging.
6965 * Existing largepage mappings are destroyed here and new ones will
6966 * not be created until the end of the logging.
6967 */
6968 if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
6969 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6916 /* 6970 /*
6917 * If memory slot is created, or moved, we need to clear all 6971 * If memory slot is created, or moved, we need to clear all
6918 * mmio sptes. 6972 * mmio sptes.